WordCount v1.0
This works with a local (standalone), pseudo-distributed, or fully distributed Hadoop installation.
Source Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import java.io.IOException; import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class WordCount { public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{ private final static IntWritable one = new IntWritable(1); private Text word = new Text(); public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { StringTokenizer itr = new StringTokenizer(value.toString()); while (itr.hasMoreTokens()) { word.set(itr.nextToken()); context.write(word, one); } } } public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> { private IntWritable result = new IntWritable(); public void reduce(Text key, Iterable<IntWritable> values, Context context ) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum += val.get(); } result.set(sum); context.write(key, result); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "word count"); job.setJarByClass(WordCount.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } } |
Usage
1 |
export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar |
Compile WordCount.java:
1 2 3 4 5 6 |
$ hadoop com.sun.tools.javac.Main WordCount.java $ ls -l -rw-r--r-- 1 hduser hadoop 1501 พ.ย. 23 15:55 WordCount.class -rw-r--r-- 1 hduser hadoop 1739 พ.ย. 23 15:55 WordCount$IntSumReducer.class -rw-r--r-- 1 hduser hadoop 2089 พ.ย. 23 15:54 WordCount.java -rw-r--r-- 1 hduser hadoop 1736 พ.ย. 23 15:55 WordCount$TokenizerMapper.class |
Create a JAR:
1 |
$ jar cf WordCount.jar WordCount*.class |
Sample text files as input:
1 2 3 4 5 6 7 |
$ hadoop fs -ls input -rw-r--r-- 1 hduser supergroup 22 2015-11-23 15:25 input/file01 -rw-r--r-- 1 hduser supergroup 28 2015-11-23 15:25 input/file02 $ hadoop fs -cat input/file01 Hello World Bye World $ hadoop fs -cat input/file02 Hello Hadoop Goodbye Hadoop |
Run the application:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
$ hadoop jar WordCount.jar WordCount input output 15/11/23 15:42:58 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 15/11/23 15:42:59 INFO Configuration.deprecation: session.id is deprecated. Instead, use dfs.metrics.session-id 15/11/23 15:42:59 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId= 15/11/23 15:42:59 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this. 15/11/23 15:42:59 INFO input.FileInputFormat: Total input paths to process : 2 15/11/23 15:42:59 INFO mapreduce.JobSubmitter: number of splits:2 15/11/23 15:43:00 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local1232649592_0001 15/11/23 15:43:00 INFO mapreduce.Job: The url to track the job: http://localhost:8080/ 15/11/23 15:43:00 INFO mapreduce.Job: Running job: job_local1232649592_0001 15/11/23 15:43:00 INFO mapred.LocalJobRunner: OutputCommitter set in config null ... File Input Format Counters Bytes Read=50 File Output Format Counters Bytes Written=41 |
Output:
1 2 3 4 5 |
$ hadoop fs -ls output 15/11/23 15:51:54 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Found 2 items -rw-r--r-- 1 hduser supergroup 0 2015-11-23 15:26 output/_SUCCESS -rw-r--r-- 1 hduser supergroup 41 2015-11-23 15:26 output/part-r-00000 |
1 2 3 4 5 6 7 |
$ hadoop fs -cat output/part-r-00000 15/11/23 15:52:19 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Bye 1 Goodbye 1 Hadoop 2 Hello 2 World 2 |