Chhaya,

did you run the same code in standalone mode, without the MapReduce framework?
How long does the code in your map() function take standalone?
Compare those two times (t_0 in MR mode, t_1 in standalone mode) to find out
whether it is an MR issue or something that comes from the XML parser logic or
the data ...

Usually it should not be that slow. But what cluster do you have, how many
mappers / reducers are you running, and how many of these 2 MB files do you have?

Best wishes
Mirko



2013/11/28 Chhaya Vishwakarma <chhaya.vishwaka...@lntinfotech.com>

>  Hi,
>
>
>
>
>
> The code below parses an XML file. The output of the code is correct, but
> the job takes a long time to complete.
>
> It took 20 hours to parse a 2 MB file.
>
> Kindly suggest what changes could be done to increase the performance.
>
>
>
>
>
>
>
> package xml;
>
>
>
> import java.io.FileInputStream;
>
> import java.io.FileNotFoundException;
>
> import java.io.IOException;
>
> import java.util.*;
>
>
>
> import javax.xml.parsers.DocumentBuilder;
>
> import javax.xml.parsers.DocumentBuilderFactory;
>
> import javax.xml.parsers.ParserConfigurationException;
>
> import javax.xml.xpath.XPath;
>
> import javax.xml.xpath.XPathConstants;
>
> import javax.xml.xpath.XPathExpressionException;
>
> import javax.xml.xpath.XPathFactory;
>
>
>
> import org.apache.hadoop.fs.FSDataInputStream;
>
> import org.apache.hadoop.fs.FSInputStream;
>
> import org.apache.hadoop.fs.FileSystem;
>
> import org.apache.hadoop.fs.Path;
>
>
>
> import org.apache.hadoop.conf.*;
>
> import org.apache.hadoop.io.*;
>
>
>
> import org.apache.hadoop.mapred.JobConf;
>
> import org.apache.hadoop.mapreduce.*;
>
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
>
>
>
>
> import org.apache.log4j.Logger;
>
> import org.w3c.dom.Document;
>
> import org.w3c.dom.Element;
>
> import org.w3c.dom.NodeList;
>
> import org.xml.sax.SAXException;
>
>
>
>
>
> public class ReadXmlMR
>
> {
>
>                 static Logger log =
> Logger.getLogger(ReadXmlMR.class.getName());
>
>                  public static String fileName = new String();
>
>                  public static Document dom;
>
>                  public void configure(JobConf job) {
>
>          fileName = job.get("map.input.file");
>
> }
>
>
>
>
>
>                 public static class Map extends
> Mapper<LongWritable,Text,Text,Text>
>
>                {
>
>
>
>                                 public void map(LongWritable key, Text
> value,Context context ) throws IOException, InterruptedException
>
>                                 {
>
>                                                 try {
>
>                                                                 FileSplit
> fileSplit = (FileSplit)context.getInputSplit();
>
>
> Configuration conf = context.getConfiguration();
>
>
>
>
> DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>
>
>
>
> FSDataInputStream fstream1;
>
>                                                                 Path file
> = fileSplit.getPath();
>
>                                                 FileSystem fs =
> file.getFileSystem(conf);
>
>                                                 fstream1 =
> fs.open(fileSplit.getPath());
>
>
> DocumentBuilder db = dbf.newDocumentBuilder();
>
>                                                                 dom =
> db.parse(fstream1);
>
>                                                                 Element
> docEle = null;
>
>                                                                 docEle =
> dom.getDocumentElement();
>
>
>
>                                                                 XPath
> xpath = XPathFactory.newInstance().newXPath();
>
>
>
>                                                                 Object
> result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
>
>
>
>                                                                 NodeList
> nodes = (NodeList) result;
>
>
>
>
>
>                                                                 for (int n
> = 2; n < nodes.getLength(); n++)
>
>
>
>                                                                 {
>
>
> Text colvalue=new Text("");
>
>
> Text nodename= new Text("");
>
>
>
>
> nodename = new Text(nodes.item(n).getNodeName());
>
>
> try{colvalue = new
> Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>
>
> if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>
>
> context.write(nodename, colvalue);
>
>                                                                 }
>
>
>
>
>
>                                                                 } catch
> (ParserConfigurationException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 } catch
> (SAXException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>
>
>                                                                 } catch
> (XPathExpressionException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 }
>
>
>
>                                                                 }
>
>
>
>                                 }
>
>
>
>
>
>
>
>                 public static void main(String[] args) throws Exception
>
>
>
>                 {
>
>
>
>                 Configuration conf = new Configuration();
>
>
>
>         Job job = new Job(conf, "XmlParsing");
>
>         job.setJarByClass(ReadXmlMR.class);
>
>                 job.setOutputKeyClass(Text.class);
>
>                 job.setOutputValueClass(Text.class);
>
>
>
>
>
>                 job.setMapperClass(Map.class);
>
>
>
>
>
>                 job.setInputFormatClass(TextInputFormat.class);
>
>                 job.setOutputFormatClass(TextOutputFormat.class);
>
>
>
>                 FileInputFormat.addInputPath(job, new Path(args[0]));
>
>                 FileOutputFormat.setOutputPath(job, new Path(args[1]));
>
>
>
>
>
>                 job.submit();
>
>
>
>                 job.waitForCompletion(true);
>
>
>
>
>
>                 }
>
>
>
> }
>
>
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
> ------------------------------
> The contents of this e-mail and any attachment(s) may contain confidential
> or privileged information for the intended recipient(s). Unintended
> recipients are prohibited from taking action on the basis of information in
> this e-mail and using or disseminating the information, and must notify the
> sender and delete it from their system. L&T Infotech will not accept
> responsibility or liability for the accuracy or completeness of, or the
> presence of any virus or disabling code in this e-mail"
>

Reply via email to