Hi, I have a SequenceFile which contains several JPEG images with (image name, image bytes) as key-value pairs. My objective is to count the number of images, grouping them by the camera source recorded in their EXIF metadata, producing output like this:
Nikon Coolpix 100 Sony Cybershot 251 N82 100 The MR code is : package com.hadoop.basics; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import com.drew.imaging.ImageMetadataReader; import com.drew.imaging.ImageProcessingException; import com.drew.metadata.Directory; import com.drew.metadata.Metadata; import com.drew.metadata.exif.ExifIFD0Directory; public class ImageSummary extends Configured implements Tool { public static class ImageSourceMapper extends Mapper<Text, BytesWritable, Text, IntWritable> { private static int tagId = 272; private static final IntWritable one = new IntWritable(1); public void map(Text imageName, BytesWritable imageBytes, Context context) throws IOException, InterruptedException { // TODO Auto-generated method stub System.out.println("In the map method, image is " + imageName.toString()); byte[] imageInBytes = imageBytes.getBytes(); ByteArrayInputStream bais = new ByteArrayInputStream(imageInBytes); BufferedInputStream bis = new BufferedInputStream(bais); Metadata imageMD = null; try { imageMD = ImageMetadataReader.readMetadata(bis, true); } catch (ImageProcessingException e) { // TODO Auto-generated catch block System.out.println("Got an ImageProcessingException !"); e.printStackTrace(); } Directory exifIFD0Directory = imageMD 
.getDirectory(ExifIFD0Directory.class); String imageSource = exifIFD0Directory.getString(tagId); System.out.println(imageName.toString() + " is taken using " + imageSource); context.write(new Text(imageSource), one); System.out.println("Returning from the map method"); } } public static class ImageSourceReducer extends Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text imageSource, Iterator<IntWritable> counts, Context context) throws IOException, InterruptedException { // TODO Auto-generated method stub System.out.println("In the reduce method"); int finalCount = 0; while (counts.hasNext()) { finalCount += counts.next().get(); } context.write(imageSource, new IntWritable(finalCount)); System.out.println("Returning from the reduce method"); } } public static void main(String[] args) throws Exception { ToolRunner.run(new ImageSummary(), args); } @Override public int run(String[] args) throws Exception { // TODO Auto-generated method stub System.out.println("In ImageSummary.run(...)"); Configuration configuration = getConf(); Job job = new Job(configuration, "Image_Source"); job.setJarByClass(getClass()); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(ImageSourceMapper.class); job.setCombinerClass(ImageSourceReducer.class); job.setReducerClass(ImageSourceReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); SequenceFileInputFormat.addInputPath(job, new Path(args[0])); TextOutputFormat.setOutputPath(job, new Path(args[1])); System.out.println("Submitting job"); job.waitForCompletion(true); int jobStatus = job.isSuccessful() ? 
0 : -1; System.out.println("Returning jobStatus = " + jobStatus); return jobStatus; } } The command : hadoop jar /home/hduser/dumphere/codes/hadoop/imageops.jar com.hadoop.basics.ImageSummary "/scratchpad/imageOps/WholeImageSeqFile" "/scratchpad/imageOps/cnt" The part-file (/<http://172.25.6.71:50075/browseDirectory.jsp?dir=/&namenodeInfoPort=50070>scratchpad<http://172.25.6.71:50075/browseDirectory.jsp?dir=/scratchpad&namenodeInfoPort=50070>/imageOps<http://172.25.6.71:50075/browseDirectory.jsp?dir=/scratchpad/imageOps&namenodeInfoPort=50070>/cnt<http://172.25.6.71:50075/browseDirectory.jsp?dir=/scratchpad/imageOps/cnt&namenodeInfoPort=50070>/part-r-00000) COOLPIX L120 1 COOLPIX L120 1 K750i 1 The mapper stdout logs : stdout logs In the map method, image is It's a long road....JPG It's a long road....JPG is taken using COOLPIX L120 Returning from the map method In the map method, image is Every man is a mountainside....JPG Every man is a mountainside....JPG is taken using COOLPIX L120 Returning from the map method In the map method, image is mystic.JPG mystic.JPG is taken using K750i Returning from the map method But nothing is reflected in stdout logs of the reducer. What have I missed? Regards, Omkar Joshi ________________________________ The contents of this e-mail and any attachment(s) may contain confidential or privileged information for the intended recipient(s). Unintended recipients are prohibited from taking action on the basis of information in this e-mail and using or disseminating the information, and must notify the sender and delete it from their system. L&T Infotech will not accept responsibility or liability for the accuracy or completeness of, or the presence of any virus or disabling code in this e-mail"