package org.myorg;

import java.io.IOException;
import java.util.regex.Pattern;        // Use regex.Pattern to extract the words from the input files.

import org.apache.hadoop.conf.Configuration;  // The job is configured through a Configuration object.
import org.apache.hadoop.conf.Configured;     // This application extends (inherits from) the Configured class
import org.apache.hadoop.util.Tool;           // and implements the Tool interface.
import org.apache.hadoop.util.ToolRunner;     // Hadoop will run the program inside a configuration object.
                                              // We use ToolRunner to run the MapReduce application.
import org.apache.hadoop.mapreduce.Job;       // We need the Job class to create, configure, and run an instance
                                              // of the MapReduce application.
import org.apache.hadoop.mapreduce.Mapper;    // We extend the Mapper class with our own Map class
                                              // and add our own processing instructions.
import org.apache.hadoop.mapreduce.Reducer;   // We extend the Reducer class and customize our own Reduce class.
import org.apache.hadoop.fs.Path;             // Use the Path class to access files in HDFS.
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;   // In our job configuration instructions,
                                                                // we pass required paths using the FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; // and FileOutputFormat classes.
import org.apache.hadoop.io.IntWritable;      // Writable objects have convenience methods for writing, reading, and comparing
import org.apache.hadoop.io.LongWritable;     // values during map and reduce processing.
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;               // The Logger class sends debugging messages from inside the mapper and reducer classes.
                                              // Messages passed to Logger are displayed in the map or reduce logs for the job on the Hadoop server.

//****** WordCount includes main and run methods, and the inner classes Map and Reduce.
//****** The class begins by initializing the logger.
public class WordCount extends Configured implements Tool {

  private static final Logger LOG = Logger.getLogger(WordCount.class);

  //**** The main method invokes ToolRunner, which creates and runs a new instance of WordCount,
  //**** passing the command line arguments.
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new WordCount(), args);
    System.exit(res);
  }

  //**** The run method configures the job (which includes setting paths passed in at the command line),
  //**** starts the job, waits for the job to complete, and then returns an integer value as the success flag.
  public int run(String[] args) throws Exception {
    // Example split sizes (128 MB max, 64 MB min); choose values appropriate for your cluster.
    final long MAX_SPLIT_SIZE = 128 * 1024 * 1024;
    final long MIN_SPLIT_SIZE = 64 * 1024 * 1024;
    Configuration conf = getConf();
    conf.setLong("mapred.max.split.size", MAX_SPLIT_SIZE);
    conf.setLong("mapred.min.split.size", MIN_SPLIT_SIZE);

    //***** Create a new Job.
    Job job = Job.getInstance(conf, this.getClass().toString()); // The Job class is used to configure the job,
                                                                 // submit it, control its execution,
                                                                 // and query its state.
    job.setJarByClass(this.getClass()); // Hadoop finds the relevant jar by locating the one that contains
                                        // the class passed as this parameter.

    //************ Specify various job-specific parameters.
    //** Set the input and output paths for the application.
    //** We store our input files in HDFS, and then pass the input and output paths as command-line arguments at runtime.
    FileInputFormat.addInputPath(job, new Path(args[0])); // We can have more than one input path.
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    //** Set the map class and reduce class for the job.
    job.setMapperClass(Map.class);     // In this case, use the Map and Reduce inner classes defined in this class.
    job.setReducerClass(Reduce.class);

    //** Use a Text object to output the key (in this case, the word being counted)
    //** and an IntWritable for the value (in this case, the number of times the word appears).
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    //**** Launch the job and wait for it to finish. The method syntax is waitForCompletion(boolean verbose).
    //**** When true, the method reports its progress as the Map and Reduce classes run.
    //**** When false, the method reports progress up to, but not including, the Map and Reduce processes.
    return job.waitForCompletion(true) ? 0 : 1; // 0 = success; anything other than 0 (such as 1) indicates failure.
  }

  //**** Hadoop invokes the map method once for every key/value pair from your input source.
  //**** The map method receives the offset of the first character in the current line of input as the key,
  //**** and a Text object representing an entire line of text from the input file as the value.
  public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
    private long numRecords = 0;

    // Create a regular expression pattern you can use to parse each line of input text on word boundaries ("\b").
    // Word boundaries include spaces, tabs, and punctuation.
    private static final Pattern WORD_BOUNDARY = Pattern.compile("\\s*\\b\\s*");

    @Override
    public void map(LongWritable offset, Text lineText, Context context)
        throws IOException, InterruptedException {
      String line = lineText.toString(); // Convert the Text object to a string.
      Text currentWord;
      for (String word : WORD_BOUNDARY.split(line)) {
        if (word.isEmpty()) {
          continue;
        }
        // How could we emit the same key for every tuple?
        currentWord = new Text(word);
        context.write(currentWord, one);
      }
    }
  }

  public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text word, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable count : counts) {
        sum += count.get();
      }
      context.write(word, new IntWritable(sum));
    }
  }
}

/* Jars can contain files that point to other classes and jars located elsewhere;
   when the things they point to don't exist, you see warnings like the ones you're getting. */
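
/* A minimal usage sketch for compiling and launching this job from the command line.
   The jar name (wordcount.jar), the classes/ directory, and the HDFS paths below are
   example values only; the only requirement imposed by run() is that the input path
   is the first argument and the output path is the second.

     mkdir -p classes
     javac -classpath $(hadoop classpath) -d classes WordCount.java
     jar cf wordcount.jar -C classes .
     hadoop jar wordcount.jar org.myorg.WordCount /user/hadoop/wordcount/input /user/hadoop/wordcount/output

   Note that the output directory must not already exist; FileOutputFormat fails the job if it does. */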