diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt
index f3d53348d4..d3d3fc7314 100644
--- a/hadoop-mapreduce-project/CHANGES.txt
+++ b/hadoop-mapreduce-project/CHANGES.txt
@@ -2,6 +2,13 @@ Hadoop MapReduce Change Log
 
 Trunk (unreleased changes)
 
+  INCOMPATIBLE CHANGES
+
+  NEW FEATURES
+
+    MAPREDUCE-2669. Add new examples for Mean, Median, and Standard Deviation.
+    (Plamen Jeliazkov via shv)
+
   IMPROVEMENTS
 
     MAPREDUCE-2887 due to HADOOP-7524 Change RPC to allow multiple protocols
diff --git a/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMean.java b/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMean.java
new file mode 100644
index 0000000000..bc2d658b23
--- /dev/null
+++ b/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMean.java
@@ -0,0 +1,196 @@
+package org.apache.hadoop.examples;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+public class WordMean extends Configured implements Tool {
+
+  private double mean = 0;
+
+  private final static Text COUNT = new Text("count");
+  private final static Text LENGTH = new Text("length");
+  private final static LongWritable ONE = new LongWritable(1);
+
+  /**
+   * Maps words from line of text into 2 key-value pairs; one key-value pair for
+   * counting the word, another for counting its length.
+   */
+  public static class WordMeanMapper extends
+      Mapper<Object, Text, Text, LongWritable> {
+
+    private LongWritable wordLen = new LongWritable();
+
+    /**
+     * Emits 2 key-value pairs for counting the word and its length. Outputs are
+     * (Text, LongWritable).
+     *
+     * @param value
+     *          This will be a line of text coming in from our input file.
+     */
+    public void map(Object key, Text value, Context context)
+        throws IOException, InterruptedException {
+      StringTokenizer itr = new StringTokenizer(value.toString());
+      while (itr.hasMoreTokens()) {
+        String string = itr.nextToken();
+        this.wordLen.set(string.length());
+        context.write(LENGTH, this.wordLen);
+        context.write(COUNT, ONE);
+      }
+    }
+  }
+
+  /**
+   * Performs integer summation of all the values for each key.
+   */
+  public static class WordMeanReducer extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+
+    private LongWritable sum = new LongWritable();
+
+    /**
+     * Sums all the individual values within the iterator and writes them to the
+     * same key.
+     *
+     * @param key
+     *          This will be one of 2 constants: LENGTH or COUNT.
+     * @param values
+     *          This will be an iterator of all the values associated with that
+     *          key.
+     */
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+
+      long theSum = 0;
+      for (LongWritable val : values) {
+        theSum += val.get();
+      }
+      sum.set(theSum);
+      context.write(key, sum);
+    }
+  }
+
+  /**
+   * Reads the output file and parses the summation of lengths, and the word
+   * count, to perform a quick calculation of the mean.
+   *
+   * @param path
+   *          The path to find the output file in. Set in main to the output
+   *          directory.
+   * @throws IOException
+   *           If it cannot access the output directory, we throw an exception.
+   */
+  private double readAndCalcMean(Path path, Configuration conf)
+      throws IOException {
+    FileSystem fs = FileSystem.get(conf);
+    Path file = new Path(path, "part-r-00000");
+
+    if (!fs.exists(file))
+      throw new IOException("Output not found!");
+
+    BufferedReader br = null;
+
+    // average = total sum / number of elements;
+    try {
+      br = new BufferedReader(new InputStreamReader(fs.open(file)));
+
+      long count = 0;
+      long length = 0;
+
+      String line;
+      while ((line = br.readLine()) != null) {
+        StringTokenizer st = new StringTokenizer(line);
+
+        // grab type
+        String type = st.nextToken();
+
+        // differentiate
+        if (type.equals(COUNT.toString())) {
+          String countLit = st.nextToken();
+          count = Long.parseLong(countLit);
+        } else if (type.equals(LENGTH.toString())) {
+          String lengthLit = st.nextToken();
+          length = Long.parseLong(lengthLit);
+        }
+      }
+
+      double theMean = (((double) length) / ((double) count));
+      System.out.println("The mean is: " + theMean);
+      return theMean;
+    } finally {
+      if (br != null) br.close(); // guard against fs.open() having failed
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    System.exit(ToolRunner.run(new Configuration(), new WordMean(), args));
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println("Usage: wordmean <in> <out>");
+      return 2;
+    }
+
+    Configuration conf = getConf();
+
+    @SuppressWarnings("deprecation")
+    Job job = new Job(conf, "word mean");
+    job.setJarByClass(WordMean.class);
+    job.setMapperClass(WordMeanMapper.class);
+    job.setCombinerClass(WordMeanReducer.class);
+    job.setReducerClass(WordMeanReducer.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(LongWritable.class);
+    FileInputFormat.addInputPath(job, new Path(args[0]));
+    Path outputpath = new Path(args[1]);
+    FileOutputFormat.setOutputPath(job, outputpath);
+    boolean result = job.waitForCompletion(true);
+    mean = readAndCalcMean(outputpath, conf);
+
+    return (result ? 0 : 1);
+  }
+
+  /**
+   * Only valuable after run() has been called.
+   *
+   * @return Returns the mean value.
+   */
+  public double getMean() {
+    return mean;
+  }
+}
\ No newline at end of file
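WordMean's readAndCalcMean folds the reducer's two counters back into one number: the mean word length is the summed word lengths divided by the word count. For intuition, the same arithmetic can be sanity-checked locally without MapReduce; this is a minimal sketch, not part of the patch, and the class name and sample text are hypothetical:

    import java.util.StringTokenizer;

    public class LocalMeanCheck {
      public static double mean(String text) {
        StringTokenizer st = new StringTokenizer(text);
        long count = 0, length = 0;
        while (st.hasMoreTokens()) {
          count++;                           // one COUNT emission per word
          length += st.nextToken().length(); // one LENGTH emission per word
        }
        return count == 0 ? 0 : (double) length / count;
      }

      public static void main(String[] args) {
        System.out.println(mean("the quick brown fox")); // (3+5+5+3)/4 = 4.0
      }
    }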
diff --git a/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMedian.java b/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMedian.java
new file mode 100644
index 0000000000..406d19ed4f
--- /dev/null
+++ b/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordMedian.java
@@ -0,0 +1,208 @@
+package org.apache.hadoop.examples;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.TaskCounter;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+public class WordMedian extends Configured implements Tool {
+
+  private double median = 0;
+  private final static IntWritable ONE = new IntWritable(1);
+
+  /**
+   * Maps words from line of text into a key-value pair; the length of the word
+   * as the key, and 1 as the value.
+   */
+  public static class WordMedianMapper extends
+      Mapper<Object, Text, IntWritable, IntWritable> {
+
+    private IntWritable length = new IntWritable();
+
+    /**
+     * Emits a key-value pair for counting the word. Outputs are (IntWritable,
+     * IntWritable).
+     *
+     * @param value
+     *          This will be a line of text coming in from our input file.
+     */
+    public void map(Object key, Text value, Context context)
+        throws IOException, InterruptedException {
+      StringTokenizer itr = new StringTokenizer(value.toString());
+      while (itr.hasMoreTokens()) {
+        String string = itr.nextToken();
+        length.set(string.length());
+        context.write(length, ONE);
+      }
+    }
+  }
+
+  /**
+   * Performs integer summation of all the values for each key.
+   */
+  public static class WordMedianReducer extends
+      Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
+
+    private IntWritable val = new IntWritable();
+
+    /**
+     * Sums all the individual values within the iterator and writes them to the
+     * same key.
+     *
+     * @param key
+     *          This will be a length of a word that was read.
+     * @param values
+     *          This will be an iterator of all the values associated with that
+     *          key.
+     */
+    public void reduce(IntWritable key, Iterable<IntWritable> values,
+        Context context) throws IOException, InterruptedException {
+
+      int sum = 0;
+      for (IntWritable value : values) {
+        sum += value.get();
+      }
+      val.set(sum);
+      context.write(key, val);
+    }
+  }
+
+  /**
+   * This is a standard program to read and find a median value based on a file
+   * of word counts such as: 1 456, 2 132, 3 56... where the first values are
+   * the word lengths and the following values are the number of times that
+   * words of that length appear.
+   *
+   * @param path
+   *          The path to read the HDFS file from (part-r-00000...00001...etc).
+   * @param medianIndex1
+   *          The first length value to look for.
+   * @param medianIndex2
+   *          The second length value to look for (will be the same as the first
+   *          if there is an even number of words in total).
+   * @throws IOException
+   *           If the file cannot be found, we throw an exception.
+   */
+  private double readAndFindMedian(String path, int medianIndex1,
+      int medianIndex2, Configuration conf) throws IOException {
+    FileSystem fs = FileSystem.get(conf);
+    Path file = new Path(path, "part-r-00000");
+
+    if (!fs.exists(file))
+      throw new IOException("Output not found!");
+
+    BufferedReader br = null;
+
+    try {
+      br = new BufferedReader(new InputStreamReader(fs.open(file)));
+      int num = 0;
+
+      String line;
+      while ((line = br.readLine()) != null) {
+        StringTokenizer st = new StringTokenizer(line);
+
+        // grab length
+        String currLen = st.nextToken();
+
+        // grab count
+        String lengthFreq = st.nextToken();
+
+        int prevNum = num;
+        num += Integer.parseInt(lengthFreq);
+
+        if (medianIndex2 >= prevNum && medianIndex1 <= num) {
+          System.out.println("The median is: " + currLen);
+          br.close();
+          return Double.parseDouble(currLen);
+        } else if (medianIndex2 >= prevNum && medianIndex1 < num) {
+          String nextCurrLen = st.nextToken();
+          double theMedian = (Integer.parseInt(currLen) + Integer
+              .parseInt(nextCurrLen)) / 2.0;
+          System.out.println("The median is: " + theMedian);
+          br.close();
+          return theMedian;
+        }
+      }
+    } finally {
+      if (br != null) br.close(); // guard against fs.open() having failed
+    }
+    // error, no median found
+    return -1;
+  }
+
+  public static void main(String[] args) throws Exception {
+    System.exit(ToolRunner.run(new Configuration(), new WordMedian(), args));
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println("Usage: wordmedian <in> <out>");
+      return 2;
+    }
+
+    // use the Configuration handed to us by ToolRunner, preserving generic options
+    Configuration conf = getConf();
+
+    @SuppressWarnings("deprecation")
+    Job job = new Job(conf, "word median");
+    job.setJarByClass(WordMedian.class);
+    job.setMapperClass(WordMedianMapper.class);
+    job.setCombinerClass(WordMedianReducer.class);
+    job.setReducerClass(WordMedianReducer.class);
+    job.setOutputKeyClass(IntWritable.class);
+    job.setOutputValueClass(IntWritable.class);
+    FileInputFormat.addInputPath(job, new Path(args[0]));
+    FileOutputFormat.setOutputPath(job, new Path(args[1]));
+    boolean result = job.waitForCompletion(true);
+
+    // Wait for JOB 1 -- get middle value to check for Median
+
+    long totalWords = job.getCounters()
+        .getGroup(TaskCounter.class.getCanonicalName())
+        .findCounter("MAP_OUTPUT_RECORDS", "Map output records").getValue();
+    int medianIndex1 = (int) Math.ceil((totalWords / 2.0));
+    int medianIndex2 = (int) Math.floor((totalWords / 2.0));
+
+    median = readAndFindMedian(args[1], medianIndex1, medianIndex2, conf);
+
+    return (result ? 0 : 1);
+  }
+
+  public double getMedian() {
+    return median;
+  }
+}
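readAndFindMedian walks the reducer output, which is a sorted length-to-frequency table, accumulating frequencies until the running total covers the median position (ceil(n/2), the medianIndex1 convention above). The same walk can be sketched over an in-memory TreeMap. Everything here is illustrative rather than part of the patch; the sketch is simplified to the single-position case that the example's first branch handles, and omits the even-count averaging branch:

    import java.util.Map;
    import java.util.TreeMap;

    public class LocalMedianCheck {
      // Walk a sorted length -> frequency table until the cumulative count
      // reaches the median position, mirroring readAndFindMedian's main branch.
      public static int median(TreeMap<Integer, Integer> lengthFreq, long totalWords) {
        long medianIndex = (long) Math.ceil(totalWords / 2.0); // same as medianIndex1
        long seen = 0;
        for (Map.Entry<Integer, Integer> e : lengthFreq.entrySet()) {
          seen += e.getValue();
          if (seen >= medianIndex) {
            return e.getKey(); // the median position falls inside this bucket
          }
        }
        return -1; // empty input, mirroring the example's error value
      }

      public static void main(String[] args) {
        TreeMap<Integer, Integer> freq = new TreeMap<Integer, Integer>();
        freq.put(3, 2); // two words of length 3
        freq.put(5, 1); // one word of length 5
        System.out.println(median(freq, 3)); // prints 3: the 2nd of 3 sorted lengths
      }
    }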
diff --git a/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordStandardDeviation.java b/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordStandardDeviation.java
new file mode 100644
index 0000000000..6a122df652
--- /dev/null
+++ b/hadoop-mapreduce-project/src/examples/org/apache/hadoop/examples/WordStandardDeviation.java
@@ -0,0 +1,210 @@
+package org.apache.hadoop.examples;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+public class WordStandardDeviation extends Configured implements Tool {
+
+  private double stddev = 0;
+
+  private final static Text LENGTH = new Text("length");
+  private final static Text SQUARE = new Text("square");
+  private final static Text COUNT = new Text("count");
+  private final static LongWritable ONE = new LongWritable(1);
+
+  /**
+   * Maps words from line of text into 3 key-value pairs; one key-value pair for
+   * counting the word, one for counting its length, and one for counting the
+   * square of its length.
+   */
+  public static class WordStandardDeviationMapper extends
+      Mapper<Object, Text, Text, LongWritable> {
+
+    private LongWritable wordLen = new LongWritable();
+    private LongWritable wordLenSq = new LongWritable();
+
+    /**
+     * Emits 3 key-value pairs for counting the word, its length, and the
+     * square of its length. Outputs are (Text, LongWritable).
+     *
+     * @param value
+     *          This will be a line of text coming in from our input file.
+     */
+    public void map(Object key, Text value, Context context)
+        throws IOException, InterruptedException {
+      StringTokenizer itr = new StringTokenizer(value.toString());
+      while (itr.hasMoreTokens()) {
+        String string = itr.nextToken();
+
+        this.wordLen.set(string.length());
+
+        // the square of an integer is an integer...
+        this.wordLenSq.set((long) Math.pow(string.length(), 2.0));
+
+        context.write(LENGTH, this.wordLen);
+        context.write(SQUARE, this.wordLenSq);
+        context.write(COUNT, ONE);
+      }
+    }
+  }
+
+  /**
+   * Performs integer summation of all the values for each key.
+   */
+  public static class WordStandardDeviationReducer extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+
+    private LongWritable val = new LongWritable();
+
+    /**
+     * Sums all the individual values within the iterator and writes them to the
+     * same key.
+     *
+     * @param key
+     *          This will be one of 3 constants: LENGTH, SQUARE, or COUNT.
+     * @param values
+     *          This will be an iterator of all the values associated with that
+     *          key.
+     */
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+
+      long sum = 0;
+      for (LongWritable value : values) {
+        sum += value.get();
+      }
+      val.set(sum);
+      context.write(key, val);
+    }
+  }
+
+  /**
+   * Reads the output file and parses the summation of lengths, the word count,
+   * and the lengths squared, to perform a quick calculation of the standard
+   * deviation.
+   *
+   * @param path
+   *          The path to find the output file in. Set in main to the output
+   *          directory.
+   * @throws IOException
+   *           If it cannot access the output directory, we throw an exception.
+   */
+  private double readAndCalcStdDev(Path path, Configuration conf)
+      throws IOException {
+    FileSystem fs = FileSystem.get(conf);
+    Path file = new Path(path, "part-r-00000");
+
+    if (!fs.exists(file))
+      throw new IOException("Output not found!");
+
+    double stddev = 0;
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(fs.open(file)));
+      long count = 0;
+      long length = 0;
+      long square = 0;
+      String line;
+      while ((line = br.readLine()) != null) {
+        StringTokenizer st = new StringTokenizer(line);
+
+        // grab type
+        String type = st.nextToken();
+
+        // differentiate
+        if (type.equals(COUNT.toString())) {
+          String countLit = st.nextToken();
+          count = Long.parseLong(countLit);
+        } else if (type.equals(LENGTH.toString())) {
+          String lengthLit = st.nextToken();
+          length = Long.parseLong(lengthLit);
+        } else if (type.equals(SQUARE.toString())) {
+          String squareLit = st.nextToken();
+          square = Long.parseLong(squareLit);
+        }
+      }
+      // average = total sum / number of elements;
+      double mean = (((double) length) / ((double) count));
+      // standard deviation = sqrt((sum(lengths ^ 2)/count) - (mean ^ 2))
+      mean = Math.pow(mean, 2.0);
+      double term = (((double) square / ((double) count)));
+      stddev = Math.sqrt((term - mean));
+      System.out.println("The standard deviation is: " + stddev);
+    } finally {
+      if (br != null) br.close(); // guard against fs.open() having failed
+    }
+    return stddev;
+  }
+
+  public static void main(String[] args) throws Exception {
+    System.exit(ToolRunner.run(new Configuration(), new WordStandardDeviation(),
+        args));
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println("Usage: wordstddev <in> <out>");
+      return 2;
+    }
+
+    Configuration conf = getConf();
+
+    @SuppressWarnings("deprecation")
+    Job job = new Job(conf, "word stddev");
+    job.setJarByClass(WordStandardDeviation.class);
+    job.setMapperClass(WordStandardDeviationMapper.class);
+    job.setCombinerClass(WordStandardDeviationReducer.class);
+    job.setReducerClass(WordStandardDeviationReducer.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(LongWritable.class);
+    FileInputFormat.addInputPath(job, new Path(args[0]));
+    Path outputpath = new Path(args[1]);
+    FileOutputFormat.setOutputPath(job, outputpath);
+    boolean result = job.waitForCompletion(true);
+
+    // read output and calculate standard deviation
+    stddev = readAndCalcStdDev(outputpath, conf);
+
+    return (result ? 0 : 1);
+  }
+
+  public double getStandardDeviation() {
+    return stddev;
+  }
+}
\ No newline at end of file
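The standard-deviation job only needs three running sums because it uses the one-pass identity stddev = sqrt(sum(len^2)/n - (sum(len)/n)^2), i.e. the population form rather than the sample form. Below is a small local sketch, with a hypothetical class name and fixed sample data, comparing that identity against the direct two-pass definition; note that the one-pass form can lose precision through cancellation when values are large and tightly clustered:

    public class LocalStdDevCheck {
      public static void main(String[] args) {
        int[] lengths = {3, 5, 5, 3};
        long n = lengths.length, sum = 0, sumSq = 0;
        for (int len : lengths) {
          sum += len;
          sumSq += (long) len * len; // same quantity the SQUARE counter accumulates
        }
        double mean = (double) sum / n;
        double onePass = Math.sqrt((double) sumSq / n - mean * mean);

        // direct two-pass definition: sqrt(mean of squared deviations)
        double acc = 0;
        for (int len : lengths) {
          acc += (len - mean) * (len - mean);
        }
        double twoPass = Math.sqrt(acc / n);

        System.out.println(onePass + " == " + twoPass); // both print 1.0
      }
    }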
diff --git a/hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/examples/TestWordStats.java b/hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/examples/TestWordStats.java
new file mode 100644
index 0000000000..a728386e62
--- /dev/null
+++ b/hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/examples/TestWordStats.java
@@ -0,0 +1,272 @@
+package org.apache.hadoop.examples;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.StringTokenizer;
+import java.util.TreeMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestWordStats {
+
+  private final static String INPUT = "src/test/mapred/org/apache/hadoop/examples/pi/math";
+  private final static String MEAN_OUTPUT = "build/data/mean_output";
+  private final static String MEDIAN_OUTPUT = "build/data/median_output";
+  private final static String STDDEV_OUTPUT = "build/data/stddev_output";
+
+  /**
+   * Modified internal test class that is designed to read all the files in the
+   * input directory, and find the standard deviation between all of the word
+   * lengths.
+   */
+  public static class WordStdDevReader {
+    private long wordsRead = 0;
+    private long wordLengthsRead = 0;
+    private long wordLengthsReadSquared = 0;
+
+    public WordStdDevReader() {
+    }
+
+    public double read(String path) throws IOException {
+      FileSystem fs = FileSystem.get(new Configuration());
+      FileStatus[] files = fs.listStatus(new Path(path));
+
+      for (FileStatus fileStat : files) {
+        if (!fileStat.isFile())
+          continue;
+
+        BufferedReader br = null;
+
+        try {
+          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));
+
+          String line;
+          while ((line = br.readLine()) != null) {
+            StringTokenizer st = new StringTokenizer(line);
+            String word;
+            while (st.hasMoreTokens()) {
+              word = st.nextToken();
+              this.wordsRead++;
+              this.wordLengthsRead += word.length();
+              this.wordLengthsReadSquared += (long) Math.pow(word.length(), 2.0);
+            }
+          }
+
+        } catch (IOException e) {
+          System.out.println("Output could not be read!");
+          throw e;
+        } finally {
+          if (br != null) br.close();
+        }
+      }
+
+      double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead));
+      mean = Math.pow(mean, 2.0);
+      double term = (((double) this.wordLengthsReadSquared / ((double) this.wordsRead)));
+      double stddev = Math.sqrt((term - mean));
+      return stddev;
+    }
+
+  }
+
+  /**
+   * Modified internal test class that is designed to read all the files in the
+   * input directory, and find the median length of all the words.
+   */
+  public static class WordMedianReader {
+    private long wordsRead = 0;
+    private TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
+
+    public WordMedianReader() {
+    }
+
+    public double read(String path) throws IOException {
+      FileSystem fs = FileSystem.get(new Configuration());
+      FileStatus[] files = fs.listStatus(new Path(path));
+
+      int num = 0;
+
+      for (FileStatus fileStat : files) {
+        if (!fileStat.isFile())
+          continue;
+
+        BufferedReader br = null;
+
+        try {
+          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));
+
+          String line;
+          while ((line = br.readLine()) != null) {
+            StringTokenizer st = new StringTokenizer(line);
+            String word;
+            while (st.hasMoreTokens()) {
+              word = st.nextToken();
+              this.wordsRead++;
+              if (this.map.get(word.length()) == null) {
+                this.map.put(word.length(), 1);
+              } else {
+                int count = this.map.get(word.length());
+                this.map.put(word.length(), count + 1);
+              }
+            }
+          }
+        } catch (IOException e) {
+          System.out.println("Output could not be read!");
+          throw e;
+        } finally {
+          if (br != null) br.close();
+        }
+      }
+
+      int medianIndex1 = (int) Math.ceil((this.wordsRead / 2.0));
+      int medianIndex2 = (int) Math.floor((this.wordsRead / 2.0));
+
+      for (Integer key : this.map.navigableKeySet()) {
+        int prevNum = num;
+        num += this.map.get(key);
+
+        if (medianIndex2 >= prevNum && medianIndex1 <= num) {
+          return key;
+        } else if (medianIndex2 >= prevNum && medianIndex1 < num) {
+          Integer nextCurrLen = this.map.navigableKeySet().iterator().next();
+          double median = (key + nextCurrLen) / 2.0;
+          return median;
+        }
+      }
+      return -1;
+    }
+
+  }
+
+  /**
+   * Modified internal test class that is designed to read all the files in the
+   * input directory, and find the mean length of all the words.
+   */
+  public static class WordMeanReader {
+    private long wordsRead = 0;
+    private long wordLengthsRead = 0;
+
+    public WordMeanReader() {
+    }
+
+    public double read(String path) throws IOException {
+      FileSystem fs = FileSystem.get(new Configuration());
+      FileStatus[] files = fs.listStatus(new Path(path));
+
+      for (FileStatus fileStat : files) {
+        if (!fileStat.isFile())
+          continue;
+
+        BufferedReader br = null;
+
+        try {
+          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));
+
+          String line;
+          while ((line = br.readLine()) != null) {
+            StringTokenizer st = new StringTokenizer(line);
+            String word;
+            while (st.hasMoreTokens()) {
+              word = st.nextToken();
+              this.wordsRead++;
+              this.wordLengthsRead += word.length();
+            }
+          }
+        } catch (IOException e) {
+          System.out.println("Output could not be read!");
+          throw e;
+        } finally {
+          if (br != null) br.close();
+        }
+      }
+
+      double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead));
+      return mean;
+    }
+
+  }
+
+  /**
+   * Internal method designed to delete the output directory. Meant solely for
+   * use before and after the test is run; this is so that subsequent runs of
+   * the test do not encounter a "file already exists" error.
+   *
+   * @param dir
+   *          The directory to delete.
+   * @return Returns whether the deletion was successful or not.
+   */
+  public static boolean deleteDir(File dir) {
+    if (dir.isDirectory()) {
+      String[] children = dir.list();
+      for (int i = 0; i < children.length; i++) {
+        boolean success = deleteDir(new File(dir, children[i]));
+        if (!success) {
+          System.out.println("Could not delete directory after test!");
+          return false;
+        }
+      }
+    }
+
+    // The directory is now empty so delete it
+    return dir.delete();
+  }
+
+  @Before public void setup() throws Exception {
+    deleteDir(new File(MEAN_OUTPUT));
+    deleteDir(new File(MEDIAN_OUTPUT));
+    deleteDir(new File(STDDEV_OUTPUT));
+  }
+
+  @Test public void testGetTheMean() throws Exception {
+    String args[] = new String[2];
+    args[0] = INPUT;
+    args[1] = MEAN_OUTPUT;
+
+    WordMean wm = new WordMean();
+    ToolRunner.run(new Configuration(), wm, args);
+    double mean = wm.getMean();
+
+    // outputs MUST match
+    WordMeanReader wr = new WordMeanReader();
+    assertEquals(mean, wr.read(INPUT), 0.0);
+  }
+
+  @Test public void testGetTheMedian() throws Exception {
+    String args[] = new String[2];
+    args[0] = INPUT;
+    args[1] = MEDIAN_OUTPUT;
+
+    WordMedian wm = new WordMedian();
+    ToolRunner.run(new Configuration(), wm, args);
+    double median = wm.getMedian();
+
+    // outputs MUST match
+    WordMedianReader wr = new WordMedianReader();
+    assertEquals(median, wr.read(INPUT), 0.0);
+  }
+
+  @Test public void testGetTheStandardDeviation() throws Exception {
+    String args[] = new String[2];
+    args[0] = INPUT;
+    args[1] = STDDEV_OUTPUT;
+
+    WordStandardDeviation wsd = new WordStandardDeviation();
+    ToolRunner.run(new Configuration(), wsd, args);
+    double stddev = wsd.getStandardDeviation();
+
+    // outputs MUST match
+    WordStdDevReader wr = new WordStdDevReader();
+    assertEquals(stddev, wr.read(INPUT), 0.0);
+  }
+
+}
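Each example is a standard Tool, so beyond the JUnit tests above it can be driven programmatically through ToolRunner, or from the command line once registered with the examples driver (e.g. something like bin/hadoop jar hadoop-mapreduce-examples.jar wordmean <in> <out>; the driver registration is not part of this patch, so that program name is an assumption). A minimal programmatic sketch, with hypothetical class name and paths:

    package example; // hypothetical package

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.examples.WordMean;
    import org.apache.hadoop.util.ToolRunner;

    public class RunWordMean {
      public static void main(String[] args) throws Exception {
        WordMean wm = new WordMean();
        // ToolRunner parses generic options (-D, -fs, ...) before calling run().
        int exitCode = ToolRunner.run(new Configuration(), wm,
            new String[] { "books.txt", "wordmean-out" }); // hypothetical paths
        System.out.println("exit=" + exitCode + ", mean=" + wm.getMean());
      }
    }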