Re: XML parsing in Hadoop

Mirko Kämpf Wed, 27 Nov 2013 23:24:25 -0800

Chhaya,

Did you run the same code in standalone mode, without the MapReduce framework?
How long does the code in your map() function take standalone?
Compare those two times (t_0 in MR mode, t_1 in standalone mode) to
find out
whether it is an MR issue or something that comes from the XML-parser logic or
the data ...


Usually it should not be that slow. But what cluster do you have, how
many mappers / reducers, and how many of these 2 MB files do you have?

Best wishes
Mirko



2013/11/28 Chhaya Vishwakarma <chhaya.vishwaka...@lntinfotech.com>

>  Hi,
>
>
>
>
>
> The code below parses an XML file. The output of the code is correct, but
> the job takes a long time to complete.
>
> It took 20 hours to parse a 2 MB file.
>
> Kindly suggest what changes could be done to increase the performance.
>
>
>
>
>
>
>
> package xml;
>
>
>
> import java.io.FileInputStream;
>
> import java.io.FileNotFoundException;
>
> import java.io.IOException;
>
> import java.util.*;
>
>
>
> import javax.xml.parsers.DocumentBuilder;
>
> import javax.xml.parsers.DocumentBuilderFactory;
>
> import javax.xml.parsers.ParserConfigurationException;
>
> import javax.xml.xpath.XPath;
>
> import javax.xml.xpath.XPathConstants;
>
> import javax.xml.xpath.XPathExpressionException;
>
> import javax.xml.xpath.XPathFactory;
>
>
>
> import org.apache.hadoop.fs.FSDataInputStream;
>
> import org.apache.hadoop.fs.FSInputStream;
>
> import org.apache.hadoop.fs.FileSystem;
>
> import org.apache.hadoop.fs.Path;
>
>
>
> import org.apache.hadoop.conf.*;
>
> import org.apache.hadoop.io.*;
>
>
>
> import org.apache.hadoop.mapred.JobConf;
>
> import org.apache.hadoop.mapreduce.*;
>
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
>
>
>
>
> import org.apache.log4j.Logger;
>
> import org.w3c.dom.Document;
>
> import org.w3c.dom.Element;
>
> import org.w3c.dom.NodeList;
>
> import org.xml.sax.SAXException;
>
>
>
>
>
> public class ReadXmlMR
>
> {
>
>                 static Logger log =
> Logger.getLogger(ReadXmlMR.class.getName());
>
>                  public static String fileName = new String();
>
>                  public static Document dom;
>
>                  public void configure(JobConf job) {
>
>          fileName = job.get("map.input.file");
>
> }
>
>
>
>
>
>                 public static class Map extends
> Mapper<LongWritable,Text,Text,Text>
>
>                {
>
>
>
>                                 public void map(LongWritable key, Text
> value,Context context ) throws IOException, InterruptedException
>
>                                 {
>
>                                                 try {
>
>                                                                 FileSplit
> fileSplit = (FileSplit)context.getInputSplit();
>
>
> Configuration conf = context.getConfiguration();
>
>
>
>
> DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>
>
>
>
> FSDataInputStream fstream1;
>
>                                                                 Path file
> = fileSplit.getPath();
>
>                                                 FileSystem fs =
> file.getFileSystem(conf);
>
>                                                 fstream1 =
> fs.open(fileSplit.getPath());
>
>
> DocumentBuilder db = dbf.newDocumentBuilder();
>
>                                                                 dom =
> db.parse(fstream1);
>
>                                                                 Element
> docEle = null;
>
>                                                                 docEle =
> dom.getDocumentElement();
>
>
>
>                                                                 XPath
> xpath = XPathFactory.newInstance().newXPath();
>
>
>
>                                                                 Object
> result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
>
>
>
>                                                                 NodeList
> nodes = (NodeList) result;
>
>
>
>
>
>                                                                 for (int n
> = 2; n < nodes.getLength(); n++)
>
>
>
>                                                                 {
>
>
> Text colvalue=new Text("");
>
>
> Text nodename= new Text("");
>
>
>
>
> nodename = new Text(nodes.item(n).getNodeName());
>
>
> try{colvalue = new
> Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>
>
> if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>
>
> context.write(nodename, colvalue);
>
>                                                                 }
>
>
>
>
>
>                                                                 } catch
> (ParserConfigurationException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 } catch
> (SAXException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>
>
>                                                                 } catch
> (XPathExpressionException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 }
>
>
>
>                                                                 }
>
>
>
>                                 }
>
>
>
>
>
>
>
>                 public static void main(String[] args) throws Exception
>
>
>
>                 {
>
>
>
>                 Configuration conf = new Configuration();
>
>
>
>         Job job = new Job(conf, "XmlParsing");
>
>         job.setJarByClass(ReadXmlMR.class);
>
>                 job.setOutputKeyClass(Text.class);
>
>                 job.setOutputValueClass(Text.class);
>
>
>
>
>
>                 job.setMapperClass(Map.class);
>
>
>
>
>
>                 job.setInputFormatClass(TextInputFormat.class);
>
>                 job.setOutputFormatClass(TextOutputFormat.class);
>
>
>
>                 FileInputFormat.addInputPath(job, new Path(args[0]));
>
>                 FileOutputFormat.setOutputPath(job, new Path(args[1]));
>
>
>
>
>
>                 job.submit();
>
>
>
>                 job.waitForCompletion(true);
>
>
>
>
>
>                 }
>
>
>
> }
>
>
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
> ------------------------------
> The contents of this e-mail and any attachment(s) may contain confidential
> or privileged information for the intended recipient(s). Unintended
> recipients are prohibited from taking action on the basis of information in
> this e-mail and using or disseminating the information, and must notify the
> sender and delete it from their system. L&T Infotech will not accept
> responsibility or liability for the accuracy or completeness of, or the
> presence of any virus or disabling code in this e-mail"
>

Re: XML parsing in Hadoop

Reply via email to