My favorites | Sign in
Project Logo
Project hosting will be READ-ONLY Wednesday at 8am PST due to brief network maintenance.
                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
package org.karticks.mapreduce;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Map;

/**
 * Implements the Mapper interface to do word counts. Words are separated by
 * runs of whitespace (spaces, tabs and line endings); no other processing is
 * done on the stream, so punctuation marks stay attached to the words they
 * touch.
 *
 * @author Kartick Suriamoorthy
 */
public class WordCountMapper implements Mapper
{
    /** Charset used to decode the bytes of the input stream into text. */
    private static final String CHARSET_NAME = "UTF-8";

    /**
     * Reads the InputStream and parses words out of the stream. Words are
     * split on runs of whitespace, converted to lower case (independently of
     * the JVM's default locale) and stored with their counts in the resulting
     * Map. Punctuation is not stripped. The stream is not closed by this method.
     *
     * @param is the stream to read; its content is decoded as UTF-8
     * @return a map from each lower-cased word to its number of occurrences;
     *         empty if the stream contains no words
     * @throws IllegalArgumentException if <code>is</code> is null
     * @throws RuntimeException if reading from the stream fails
     */
    public Map<String, Integer> doMap(InputStream is)
    {
        if (is == null)
        {
            throw new IllegalArgumentException("Input stream must not be null.");
        }

        Map<String, Integer> map = new Hashtable<String, Integer>();

        // reads the input stream into a String
        String s = readInputStream(is);

        // splits the string into words (using runs of whitespace as the separator)
        String[] words = s.split("\\s+");

        // iterates over the words and stores them in the hashtable
        for (String word : words)
        {
            // leading whitespace (or an empty stream) makes split() return an
            // empty first element, which is not a word
            if (word.length() == 0)
            {
                continue;
            }

            // Locale.ROOT keeps the result stable regardless of the default
            // locale (e.g. Turkish would otherwise map "I" to a dotless "i")
            String key = word.toLowerCase(Locale.ROOT);

            Integer count = map.get(key);

            if (count == null)
            {
                // the word is being inserted into the hashtable for the first time
                map.put(key, Integer.valueOf(1));
            }
            else
            {
                // the word already exists in the hashtable, so store its
                // counter incremented by one
                map.put(key, Integer.valueOf(count.intValue() + 1));
            }
        }

        return map;
    }

    // reads an input stream into a string, decoding the bytes as UTF-8.
    // assumes that the source stream is not too big that it cannot be stored
    // in a String (and hence does not take into account out-of-memory
    // conditions). does not close the source stream.
    private String readInputStream(InputStream is)
    {
        try
        {
            BufferedInputStream buis = new BufferedInputStream(is);

            ByteArrayOutputStream baos = new ByteArrayOutputStream();

            byte[] buffer = new byte[512];
            int read;

            // read() returns -1 once the end of the stream has been reached
            while ((read = buis.read(buffer, 0, buffer.length)) != -1)
            {
                baos.write(buffer, 0, read);
            }

            // an explicit charset keeps the decoding independent of the
            // platform default; flushing/closing a ByteArrayOutputStream is a
            // no-op, so neither is needed here
            return baos.toString(CHARSET_NAME);
        }
        catch (IOException ioe)
        {
            throw new RuntimeException("Error while reading from input stream. Error message : " + ioe.getMessage(), ioe);
        }
    }

}
Show details Hide details

Change log

r69 by kartick.suriamoorthy on Jul 31, 2009   Diff
Use a regular expression to handle
whitespace. And also checked to see if the
word's length is greater than zero (i.e.
it is not an empty string).
Go to: 
Project members, sign in to write a code review

Older revisions

r66 by kartick.suriamoorthy on Jul 29, 2009   Diff
added more documentation
r59 by kartick.suriamoorthy on Jul 29, 2009   Diff
initial checkin of map-reduce sources
All revisions of this file

File info

Size: 2658 bytes, 104 lines
Hosted by Google Code