Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java Fri Jan 9 06:34:33 2015 @@ -39,136 +39,125 @@ import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.ext.LexicalHandler; + /** - * This class takes SAX events (in addition to some extra events - * that SAX doesn't handle yet) and adds the result to a document - * or document fragment. + * This class takes SAX events (in addition to some extra events that SAX + * doesn't handle yet) and adds the result to a document or document fragment. */ -public class DOMBuilder - implements ContentHandler, LexicalHandler -{ +public class DOMBuilder implements ContentHandler, LexicalHandler { - /** Root document */ + /** Root document */ public Document m_doc; - /** Current node */ + /** Current node */ protected Node m_currentNode = null; - /** First node of document fragment or null if not a DocumentFragment */ + /** First node of document fragment or null if not a DocumentFragment */ public DocumentFragment m_docFrag = null; - /** Vector of element nodes */ + /** Vector of element nodes */ protected Stack<Element> m_elemStack = new Stack<Element>(); /** - * DOMBuilder instance constructor... it will add the DOM nodes - * to the document fragment. - * - * @param doc Root document - * @param node Current node + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document fragment. + * + * @param doc + * Root document + * @param node + * Current node */ - public DOMBuilder(Document doc, Node node) - { + public DOMBuilder(Document doc, Node node) { m_doc = doc; m_currentNode = node; } /** - * DOMBuilder instance constructor... it will add the DOM nodes - * to the document fragment. - * - * @param doc Root document - * @param docFrag Document fragment + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document fragment. + * + * @param doc + * Root document + * @param docFrag + * Document fragment */ - public DOMBuilder(Document doc, DocumentFragment docFrag) - { + public DOMBuilder(Document doc, DocumentFragment docFrag) { m_doc = doc; m_docFrag = docFrag; } /** - * DOMBuilder instance constructor... it will add the DOM nodes - * to the document. - * - * @param doc Root document + * DOMBuilder instance constructor... it will add the DOM nodes to the + * document. + * + * @param doc + * Root document */ - public DOMBuilder(Document doc) - { + public DOMBuilder(Document doc) { m_doc = doc; } /** - * Get the root node of the DOM being created. This - * is either a Document or a DocumentFragment. - * + * Get the root node of the DOM being created. This is either a Document or a + * DocumentFragment. + * * @return The root document or document fragment if not null */ - public Node getRootNode() - { + public Node getRootNode() { return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc; } /** * Get the node currently being processed. - * + * * @return the current node being processed */ - public Node getCurrentNode() - { + public Node getCurrentNode() { return m_currentNode; } /** * Return null since there is no Writer for this class. - * + * * @return null */ - public java.io.Writer getWriter() - { + public java.io.Writer getWriter() { return null; } /** * Append a node to the current container. - * - * @param newNode New node to append + * + * @param newNode + * New node to append */ - protected void append(Node newNode) throws org.xml.sax.SAXException - { + protected void append(Node newNode) throws org.xml.sax.SAXException { Node currentNode = m_currentNode; - if (null != currentNode) - { + if (null != currentNode) { currentNode.appendChild(newNode); // System.out.println(newNode.getNodeName()); - } - else if (null != m_docFrag) - { + } else if (null != m_docFrag) { m_docFrag.appendChild(newNode); - } - else - { + } else { boolean ok = true; short type = newNode.getNodeType(); - if (type == Node.TEXT_NODE) - { + if (type == Node.TEXT_NODE) { String data = newNode.getNodeValue(); - if ((null != data) && (data.trim().length() > 0)) - { - throw new org.xml.sax.SAXException("Warning: can't output text before document element! Ignoring..."); + if ((null != data) && (data.trim().length() > 0)) { + throw new org.xml.sax.SAXException( + "Warning: can't output text before document element! Ignoring..."); } ok = false; - } - else if (type == Node.ELEMENT_NODE) - { - if (m_doc.getDocumentElement() != null) - { - throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!"); + } else if (type == Node.ELEMENT_NODE) { + if (m_doc.getDocumentElement() != null) { + throw new org.xml.sax.SAXException( + "Can't have more than one root on a DOM!"); } } @@ -179,132 +168,139 @@ public class DOMBuilder /** * Receive an object for locating the origin of SAX document events. - * - * <p>SAX parsers are strongly encouraged (though not absolutely - * required) to supply a locator: if it does so, it must supply - * the locator to the application by invoking this method before - * invoking any of the other methods in the ContentHandler - * interface.</p> - * - * <p>The locator allows the application to determine the end - * position of any document-related event, even if the parser is - * not reporting an error. Typically, the application will - * use this information for reporting its own errors (such as - * character content that does not match an application's - * business rules). The information returned by the locator - * is probably not sufficient for use with a search engine.</p> - * - * <p>Note that the locator will return correct information only - * during the invocation of the events in this interface. The - * application should not attempt to use it at any other time.</p> - * - * @param locator An object that can return the location of - * any SAX document event. + * + * <p> + * SAX parsers are strongly encouraged (though not absolutely required) to + * supply a locator: if it does so, it must supply the locator to the + * application by invoking this method before invoking any of the other + * methods in the ContentHandler interface. + * </p> + * + * <p> + * The locator allows the application to determine the end position of any + * document-related event, even if the parser is not reporting an error. + * Typically, the application will use this information for reporting its own + * errors (such as character content that does not match an application's + * business rules). The information returned by the locator is probably not + * sufficient for use with a search engine. + * </p> + * + * <p> + * Note that the locator will return correct information only during the + * invocation of the events in this interface. The application should not + * attempt to use it at any other time. + * </p> + * + * @param locator + * An object that can return the location of any SAX document event. * @see org.xml.sax.Locator */ - public void setDocumentLocator(Locator locator) - { + public void setDocumentLocator(Locator locator) { // No action for the moment. } /** * Receive notification of the beginning of a document. - * - * <p>The SAX parser will invoke this method only once, before any - * other methods in this interface or in DTDHandler (except for - * setDocumentLocator).</p> + * + * <p> + * The SAX parser will invoke this method only once, before any other methods + * in this interface or in DTDHandler (except for setDocumentLocator). + * </p> */ - public void startDocument() throws org.xml.sax.SAXException - { + public void startDocument() throws org.xml.sax.SAXException { // No action for the moment. } /** * Receive notification of the end of a document. - * - * <p>The SAX parser will invoke this method only once, and it will - * be the last method invoked during the parse. The parser shall - * not invoke this method until it has either abandoned parsing - * (because of an unrecoverable error) or reached the end of - * input.</p> + * + * <p> + * The SAX parser will invoke this method only once, and it will be the last + * method invoked during the parse. The parser shall not invoke this method + * until it has either abandoned parsing (because of an unrecoverable error) + * or reached the end of input. + * </p> */ - public void endDocument() throws org.xml.sax.SAXException - { + public void endDocument() throws org.xml.sax.SAXException { // No action for the moment. } /** * Receive notification of the beginning of an element. - * - * <p>The Parser will invoke this method at the beginning of every - * element in the XML document; there will be a corresponding - * endElement() event for every startElement() event (even when the - * element is empty). All of the element's content will be - * reported, in order, before the corresponding endElement() - * event.</p> - * - * <p>If the element name has a namespace prefix, the prefix will - * still be attached. Note that the attribute list provided will - * contain only attributes with explicit values (specified or - * defaulted): #IMPLIED attributes will be omitted.</p> - * - * - * @param ns The namespace of the node - * @param localName The local part of the qualified name - * @param name The element name. - * @param atts The attributes attached to the element, if any. + * + * <p> + * The Parser will invoke this method at the beginning of every element in the + * XML document; there will be a corresponding endElement() event for every + * startElement() event (even when the element is empty). All of the element's + * content will be reported, in order, before the corresponding endElement() + * event. + * </p> + * + * <p> + * If the element name has a namespace prefix, the prefix will still be + * attached. Note that the attribute list provided will contain only + * attributes with explicit values (specified or defaulted): #IMPLIED + * attributes will be omitted. + * </p> + * + * + * @param ns + * The namespace of the node + * @param localName + * The local part of the qualified name + * @param name + * The element name. + * @param atts + * The attributes attached to the element, if any. * @see #endElement * @see org.xml.sax.Attributes */ - public void startElement( - String ns, String localName, String name, Attributes atts) - throws org.xml.sax.SAXException - { + public void startElement(String ns, String localName, String name, + Attributes atts) throws org.xml.sax.SAXException { Element elem; - // Note that the namespace-aware call must be used to correctly - // construct a Level 2 DOM, even for non-namespaced nodes. + // Note that the namespace-aware call must be used to correctly + // construct a Level 2 DOM, even for non-namespaced nodes. if ((null == ns) || (ns.length() == 0)) - elem = m_doc.createElementNS(null,name); + elem = m_doc.createElementNS(null, name); else elem = m_doc.createElementNS(ns, name); append(elem); - try - { + try { int nAtts = atts.getLength(); - if (0 != nAtts) - { - for (int i = 0; i < nAtts; i++) - { + if (0 != nAtts) { + for (int i = 0; i < nAtts; i++) { - //System.out.println("type " + atts.getType(i) + " name " + atts.getLocalName(i) ); + // System.out.println("type " + atts.getType(i) + " name " + + // atts.getLocalName(i) ); // First handle a possible ID attribute if (atts.getType(i).equalsIgnoreCase("ID")) setIDAttribute(atts.getValue(i), elem); String attrNS = atts.getURI(i); - if("".equals(attrNS)) + if ("".equals(attrNS)) attrNS = null; // DOM represents no-namespace as null // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i) - // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); + // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); // Crimson won't let us set an xmlns: attribute on the DOM. String attrQName = atts.getQName(i); - // In SAX, xmlns: attributes have an empty namespace, while in DOM they should have the xmlns namespace + // In SAX, xmlns: attributes have an empty namespace, while in DOM + // they should have the xmlns namespace if (attrQName.startsWith("xmlns:")) attrNS = "http://www.w3.org/2000/xmlns/"; // ALWAYS use the DOM Level 2 call! - elem.setAttributeNS(attrNS,attrQName, atts.getValue(i)); + elem.setAttributeNS(attrNS, attrQName, atts.getValue(i)); } } @@ -315,9 +311,7 @@ public class DOMBuilder m_currentNode = elem; // append(elem); - } - catch(java.lang.Exception de) - { + } catch (java.lang.Exception de) { // de.printStackTrace(); throw new org.xml.sax.SAXException(de); } @@ -325,74 +319,87 @@ public class DOMBuilder } /** - - - + * + * + * * Receive notification of the end of an element. - * - * <p>The SAX parser will invoke this method at the end of every - * element in the XML document; there will be a corresponding - * startElement() event for every endElement() event (even when the - * element is empty).</p> - * - * <p>If the element name has a namespace prefix, the prefix will - * still be attached to the name.</p> - * - * - * @param ns the namespace of the element - * @param localName The local part of the qualified name of the element - * @param name The element name + * + * <p> + * The SAX parser will invoke this method at the end of every element in the + * XML document; there will be a corresponding startElement() event for every + * endElement() event (even when the element is empty). + * </p> + * + * <p> + * If the element name has a namespace prefix, the prefix will still be + * attached to the name. + * </p> + * + * + * @param ns + * the namespace of the element + * @param localName + * The local part of the qualified name of the element + * @param name + * The element name */ public void endElement(String ns, String localName, String name) - throws org.xml.sax.SAXException - { + throws org.xml.sax.SAXException { m_elemStack.pop(); m_currentNode = m_elemStack.isEmpty() ? null : m_elemStack.peek(); } /** * Set an ID string to node association in the ID table. - * - * @param id The ID string. - * @param elem The associated ID. + * + * @param id + * The ID string. + * @param elem + * The associated ID. */ - public void setIDAttribute(String id, Element elem) - { + public void setIDAttribute(String id, Element elem) { // Do nothing. This method is meant to be overiden. } /** * Receive notification of character data. - * - * <p>The Parser will call this method to report each chunk of - * character data. SAX parsers may return all contiguous character - * data in a single chunk, or they may split it into several - * chunks; however, all of the characters in any single event - * must come from the same external entity, so that the Locator - * provides useful information.</p> - * - * <p>The application must not attempt to read from the array - * outside of the specified range.</p> - * - * <p>Note that some parsers will report whitespace using the - * ignorableWhitespace() method rather than this one (validating - * parsers must do so).</p> - * - * @param ch The characters from the XML document. - * @param start The start position in the array. - * @param length The number of characters to read from the array. + * + * <p> + * The Parser will call this method to report each chunk of character data. + * SAX parsers may return all contiguous character data in a single chunk, or + * they may split it into several chunks; however, all of the characters in + * any single event must come from the same external entity, so that the + * Locator provides useful information. + * </p> + * + * <p> + * The application must not attempt to read from the array outside of the + * specified range. + * </p> + * + * <p> + * Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating parsers must + * do so). + * </p> + * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. * @see #ignorableWhitespace * @see org.xml.sax.Locator */ - public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException - { - if(isOutsideDocElem() - && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) - return; // avoid DOM006 Hierarchy request error + public void characters(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error - if (m_inCData) - { + if (m_inCData) { cdata(ch, start, length); return; @@ -400,57 +407,55 @@ public class DOMBuilder String s = new String(ch, start, length); Node childNode; - childNode = m_currentNode != null ? m_currentNode.getLastChild(): null; - if( childNode != null && childNode.getNodeType() == Node.TEXT_NODE ){ - ((Text)childNode).appendData(s); - } - else{ - Text text = m_doc.createTextNode(s); - append(text); + childNode = m_currentNode != null ? m_currentNode.getLastChild() : null; + if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) { + ((Text) childNode).appendData(s); + } else { + Text text = m_doc.createTextNode(s); + append(text); } } /** - * If available, when the disable-output-escaping attribute is used, - * output raw text without escaping. A PI will be inserted in front - * of the node with the name "lotusxsl-next-is-raw" and a value of - * "formatter-to-dom". - * - * @param ch Array containing the characters - * @param start Index to start of characters in the array - * @param length Number of characters in the array + * If available, when the disable-output-escaping attribute is used, output + * raw text without escaping. A PI will be inserted in front of the node with + * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom". + * + * @param ch + * Array containing the characters + * @param start + * Index to start of characters in the array + * @param length + * Number of characters in the array */ public void charactersRaw(char ch[], int start, int length) - throws org.xml.sax.SAXException - { - if(isOutsideDocElem() - && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) - return; // avoid DOM006 Hierarchy request error - + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error String s = new String(ch, start, length); append(m_doc.createProcessingInstruction("xslt-next-is-raw", - "formatter-to-dom")); + "formatter-to-dom")); append(m_doc.createTextNode(s)); } /** * Report the beginning of an entity. - * - * The start and end of the document entity are not reported. - * The start and end of the external DTD subset are reported - * using the pseudo-name "[dtd]". All other events must be - * properly nested within start/end entity events. - * - * @param name The name of the entity. If it is a parameter - * entity, the name will begin with '%'. + * + * The start and end of the document entity are not reported. The start and + * end of the external DTD subset are reported using the pseudo-name "[dtd]". + * All other events must be properly nested within start/end entity events. + * + * @param name + * The name of the entity. If it is a parameter entity, the name will + * begin with '%'. * @see #endEntity * @see org.xml.sax.ext.DeclHandler#internalEntityDecl * @see org.xml.sax.ext.DeclHandler#externalEntityDecl */ - public void startEntity(String name) throws org.xml.sax.SAXException - { + public void startEntity(String name) throws org.xml.sax.SAXException { // Almost certainly the wrong behavior... // entityReference(name); @@ -458,49 +463,58 @@ public class DOMBuilder /** * Report the end of an entity. - * - * @param name The name of the entity that is ending. + * + * @param name + * The name of the entity that is ending. * @see #startEntity */ - public void endEntity(String name) throws org.xml.sax.SAXException{} + public void endEntity(String name) throws org.xml.sax.SAXException { + } /** * Receive notivication of a entityReference. - * - * @param name name of the entity reference + * + * @param name + * name of the entity reference */ - public void entityReference(String name) throws org.xml.sax.SAXException - { + public void entityReference(String name) throws org.xml.sax.SAXException { append(m_doc.createEntityReference(name)); } /** * Receive notification of ignorable whitespace in element content. - * - * <p>Validating Parsers must use this method to report each chunk - * of ignorable whitespace (see the W3C XML 1.0 recommendation, - * section 2.10): non-validating parsers may also use this method - * if they are capable of parsing and using content models.</p> - * - * <p>SAX parsers may return all contiguous whitespace in a single - * chunk, or they may split it into several chunks; however, all of - * the characters in any single event must come from the same - * external entity, so that the Locator provides useful - * information.</p> - * - * <p>The application must not attempt to read from the array - * outside of the specified range.</p> - * - * @param ch The characters from the XML document. - * @param start The start position in the array. - * @param length The number of characters to read from the array. + * + * <p> + * Validating Parsers must use this method to report each chunk of ignorable + * whitespace (see the W3C XML 1.0 recommendation, section 2.10): + * non-validating parsers may also use this method if they are capable of + * parsing and using content models. + * </p> + * + * <p> + * SAX parsers may return all contiguous whitespace in a single chunk, or they + * may split it into several chunks; however, all of the characters in any + * single event must come from the same external entity, so that the Locator + * provides useful information. + * </p> + * + * <p> + * The application must not attempt to read from the array outside of the + * specified range. + * </p> + * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. * @see #characters */ public void ignorableWhitespace(char ch[], int start, int length) - throws org.xml.sax.SAXException - { - if(isOutsideDocElem()) - return; // avoid DOM006 Hierarchy request error + throws org.xml.sax.SAXException { + if (isOutsideDocElem()) + return; // avoid DOM006 Hierarchy request error String s = new String(ch, start, length); @@ -509,232 +523,244 @@ public class DOMBuilder /** * Tell if the current node is outside the document element. - * + * * @return true if the current node is outside the document element. */ - private boolean isOutsideDocElem() - { - return (null == m_docFrag) && m_elemStack.size() == 0 && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); - } + private boolean isOutsideDocElem() { + return (null == m_docFrag) + && m_elemStack.size() == 0 + && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); + } /** * Receive notification of a processing instruction. - * - * <p>The Parser will invoke this method once for each processing - * instruction found: note that processing instructions may occur - * before or after the main document element.</p> - * - * <p>A SAX parser should never report an XML declaration (XML 1.0, - * section 2.8) or a text declaration (XML 1.0, section 4.3.1) - * using this method.</p> - * - * @param target The processing instruction target. - * @param data The processing instruction data, or null if - * none was supplied. + * + * <p> + * The Parser will invoke this method once for each processing instruction + * found: note that processing instructions may occur before or after the main + * document element. + * </p> + * + * <p> + * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) + * or a text declaration (XML 1.0, section 4.3.1) using this method. + * </p> + * + * @param target + * The processing instruction target. + * @param data + * The processing instruction data, or null if none was supplied. */ public void processingInstruction(String target, String data) - throws org.xml.sax.SAXException - { + throws org.xml.sax.SAXException { append(m_doc.createProcessingInstruction(target, data)); } /** * Report an XML comment anywhere in the document. - * - * This callback will be used for comments inside or outside the - * document element, including comments in the external DTD - * subset (if read). - * - * @param ch An array holding the characters in the comment. - * @param start The starting position in the array. - * @param length The number of characters to use from the array. + * + * This callback will be used for comments inside or outside the document + * element, including comments in the external DTD subset (if read). + * + * @param ch + * An array holding the characters in the comment. + * @param start + * The starting position in the array. + * @param length + * The number of characters to use from the array. */ - public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException - { + public void comment(char ch[], int start, int length) + throws org.xml.sax.SAXException { // tagsoup sometimes submits invalid values here - if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) return; + if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) + return; append(m_doc.createComment(new String(ch, start, length))); } - /** Flag indicating that we are processing a CData section */ + /** Flag indicating that we are processing a CData section */ protected boolean m_inCData = false; /** * Report the start of a CDATA section. - * + * * @see #endCDATA */ - public void startCDATA() throws org.xml.sax.SAXException - { + public void startCDATA() throws org.xml.sax.SAXException { m_inCData = true; append(m_doc.createCDATASection("")); } /** * Report the end of a CDATA section. - * + * * @see #startCDATA */ - public void endCDATA() throws org.xml.sax.SAXException - { + public void endCDATA() throws org.xml.sax.SAXException { m_inCData = false; } /** * Receive notification of cdata. - * - * <p>The Parser will call this method to report each chunk of - * character data. SAX parsers may return all contiguous character - * data in a single chunk, or they may split it into several - * chunks; however, all of the characters in any single event - * must come from the same external entity, so that the Locator - * provides useful information.</p> - * - * <p>The application must not attempt to read from the array - * outside of the specified range.</p> - * - * <p>Note that some parsers will report whitespace using the - * ignorableWhitespace() method rather than this one (validating - * parsers must do so).</p> - * - * @param ch The characters from the XML document. - * @param start The start position in the array. - * @param length The number of characters to read from the array. + * + * <p> + * The Parser will call this method to report each chunk of character data. + * SAX parsers may return all contiguous character data in a single chunk, or + * they may split it into several chunks; however, all of the characters in + * any single event must come from the same external entity, so that the + * Locator provides useful information. + * </p> + * + * <p> + * The application must not attempt to read from the array outside of the + * specified range. + * </p> + * + * <p> + * Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating parsers must + * do so). + * </p> + * + * @param ch + * The characters from the XML document. + * @param start + * The start position in the array. + * @param length + * The number of characters to read from the array. * @see #ignorableWhitespace * @see org.xml.sax.Locator */ - public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException - { - if(isOutsideDocElem() - && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) - return; // avoid DOM006 Hierarchy request error + public void cdata(char ch[], int start, int length) + throws org.xml.sax.SAXException { + if (isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error String s = new String(ch, start, length); - // XXX a...@apache.org: modified from the original, to accomodate TagSoup. + // XXX a...@apache.org: modified from the original, to accomodate TagSoup. Node n = m_currentNode.getLastChild(); if (n instanceof CDATASection) - ((CDATASection)n).appendData(s); + ((CDATASection) n).appendData(s); else if (n instanceof Comment) - ((Comment)n).appendData(s); + ((Comment) n).appendData(s); } /** * Report the start of DTD declarations, if any. - * - * Any declarations are assumed to be in the internal subset - * unless otherwise indicated. - * - * @param name The document type name. - * @param publicId The declared public identifier for the - * external DTD subset, or null if none was declared. - * @param systemId The declared system identifier for the - * external DTD subset, or null if none was declared. + * + * Any declarations are assumed to be in the internal subset unless otherwise + * indicated. + * + * @param name + * The document type name. + * @param publicId + * The declared public identifier for the external DTD subset, or + * null if none was declared. + * @param systemId + * The declared system identifier for the external DTD subset, or + * null if none was declared. * @see #endDTD * @see #startEntity */ public void startDTD(String name, String publicId, String systemId) - throws org.xml.sax.SAXException - { + throws org.xml.sax.SAXException { // Do nothing for now. } /** * Report the end of DTD declarations. - * + * * @see #startDTD */ - public void endDTD() throws org.xml.sax.SAXException - { + public void endDTD() throws org.xml.sax.SAXException { // Do nothing for now. } /** * Begin the scope of a prefix-URI Namespace mapping. - * - * <p>The information from this event is not necessary for - * normal Namespace processing: the SAX XML reader will - * automatically replace prefixes for element and attribute - * names when the http://xml.org/sax/features/namespaces - * feature is true (the default).</p> - * - * <p>There are cases, however, when applications need to - * use prefixes in character data or in attribute values, - * where they cannot safely be expanded automatically; the - * start/endPrefixMapping event supplies the information - * to the application to expand prefixes in those contexts - * itself, if necessary.</p> - * - * <p>Note that start/endPrefixMapping events are not - * guaranteed to be properly nested relative to each-other: - * all startPrefixMapping events will occur before the - * corresponding startElement event, and all endPrefixMapping - * events will occur after the corresponding endElement event, - * but their order is not guaranteed.</p> - * - * @param prefix The Namespace prefix being declared. - * @param uri The Namespace URI the prefix is mapped to. + * + * <p> + * The information from this event is not necessary for normal Namespace + * processing: the SAX XML reader will automatically replace prefixes for + * element and attribute names when the http://xml.org/sax/features/namespaces + * feature is true (the default). + * </p> + * + * <p> + * There are cases, however, when applications need to use prefixes in + * character data or in attribute values, where they cannot safely be expanded + * automatically; the start/endPrefixMapping event supplies the information to + * the application to expand prefixes in those contexts itself, if necessary. + * </p> + * + * <p> + * Note that start/endPrefixMapping events are not guaranteed to be properly + * nested relative to each-other: all startPrefixMapping events will occur + * before the corresponding startElement event, and all endPrefixMapping + * events will occur after the corresponding endElement event, but their order + * is not guaranteed. + * </p> + * + * @param prefix + * The Namespace prefix being declared. + * @param uri + * The Namespace URI the prefix is mapped to. * @see #endPrefixMapping * @see #startElement */ public void startPrefixMapping(String prefix, String uri) - throws org.xml.sax.SAXException - { + throws org.xml.sax.SAXException { /* - // Not sure if this is needed or wanted - // Also, it fails in the stree. - if((null != m_currentNode) - && (m_currentNode.getNodeType() == Node.ELEMENT_NODE)) - { - String qname; - if(((null != prefix) && (prefix.length() == 0)) - || (null == prefix)) - qname = "xmlns"; - else - qname = "xmlns:"+prefix; - - Element elem = (Element)m_currentNode; - String val = elem.getAttribute(qname); // Obsolete, should be DOM2...? - if(val == null) - { - elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", - qname, uri); - } - } - */ + * // Not sure if this is needed or wanted // Also, it fails in the stree. + * if((null != m_currentNode) && (m_currentNode.getNodeType() == + * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) && + * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname + * = "xmlns:"+prefix; + * + * Element elem = (Element)m_currentNode; String val = + * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null) + * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname, + * uri); } } + */ } /** * End the scope of a prefix-URI mapping. - * - * <p>See startPrefixMapping for details. This event will - * always occur after the corresponding endElement event, - * but the order of endPrefixMapping events is not otherwise - * guaranteed.</p> - * - * @param prefix The prefix that was being mapping. + * + * <p> + * See startPrefixMapping for details. This event will always occur after the + * corresponding endElement event, but the order of endPrefixMapping events is + * not otherwise guaranteed. + * </p> + * + * @param prefix + * The prefix that was being mapping. * @see #startPrefixMapping * @see #endElement */ - public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{} + public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException { + } /** * Receive notification of a skipped entity. - * - * <p>The Parser will invoke this method once for each entity - * skipped. Non-validating processors may skip entities if they - * have not seen the declarations (because, for example, the - * entity was declared in an external DTD subset). All processors - * may skip external entities, depending on the values of the - * http://xml.org/sax/features/external-general-entities and the - * http://xml.org/sax/features/external-parameter-entities - * properties.</p> - * - * @param name The name of the skipped entity. If it is a - * parameter entity, the name will begin with '%'. + * + * <p> + * The Parser will invoke this method once for each entity skipped. + * Non-validating processors may skip entities if they have not seen the + * declarations (because, for example, the entity was declared in an external + * DTD subset). All processors may skip external entities, depending on the + * values of the http://xml.org/sax/features/external-general-entities and the + * http://xml.org/sax/features/external-parameter-entities properties. + * </p> + * + * @param name + * The name of the skipped entity. If it is a parameter entity, the + * name will begin with '%'. */ - public void skippedEntity(String name) throws org.xml.sax.SAXException{} + public void skippedEntity(String name) throws org.xml.sax.SAXException { + } }
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Jan 9 06:34:33 2015 @@ -33,34 +33,34 @@ import org.w3c.dom.*; /** * A collection of methods for extracting content from DOM trees. * - * This class holds a few utility methods for pulling content out of - * DOM nodes, such as getOutlinks, getText, etc. - * + * This class holds a few utility methods for pulling content out of DOM nodes, + * such as getOutlinks, getText, etc. + * */ public class DOMContentUtils { public static class LinkParams { public String elName; public String attrName; - public int childLen; - - public LinkParams(String elName, String attrName, int childLen) { - this.elName = elName; - this.attrName = attrName; - this.childLen = childLen; - } - - public String toString() { - return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; - } + public int childLen; + + public LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } } - + private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); - + public DOMContentUtils(Configuration conf) { setConf(conf); } - + public void setConf(Configuration conf) { // forceTags is used to override configurable tag ignoring, later on Collection<String> forceTags = new ArrayList<String>(1); @@ -81,59 +81,57 @@ public class DOMContentUtils { // remove unwanted link tags from the linkParams map String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); - for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) { - if ( ! forceTags.contains(ignoreTags[i]) ) + for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { + if (!forceTags.contains(ignoreTags[i])) linkParams.remove(ignoreTags[i]); } } - + /** - * This method takes a {@link StringBuilder} and a DOM {@link Node}, - * and will append all the content text found beneath the DOM node to - * the <code>StringBuilder</code>. - * + * This method takes a {@link StringBuilder} and a DOM {@link Node}, and will + * append all the content text found beneath the DOM node to the + * <code>StringBuilder</code>. + * * <p> - * - * If <code>abortOnNestedAnchors</code> is true, DOM traversal will - * be aborted and the <code>StringBuffer</code> will not contain - * any text encountered after a nested anchor is found. + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted + * and the <code>StringBuffer</code> will not contain any text encountered + * after a nested anchor is found. * * <p> - * + * * @return true if nested anchors were found */ - public boolean getText(StringBuilder sb, Node node, - boolean abortOnNestedAnchors) { + public boolean getText(StringBuilder sb, Node node, + boolean abortOnNestedAnchors) { if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { return true; - } + } return false; } - /** - * This is a convinience method, equivalent to {@link - * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * This is a convinience method, equivalent to + * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. * */ public void getText(StringBuilder sb, Node node) { getText(sb, node, false); } - // returns true if abortOnNestedAnchors is true and we find nested + // returns true if abortOnNestedAnchors is true and we find nested // anchors - private boolean getTextHelper(StringBuilder sb, Node node, - boolean abortOnNestedAnchors, - int anchorDepth) { + private boolean getTextHelper(StringBuilder sb, Node node, + boolean abortOnNestedAnchors, int anchorDepth) { boolean abort = false; NodeWalker walker = new NodeWalker(node); - + while (walker.hasNext()) { - + Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); - + if ("script".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } @@ -145,7 +143,7 @@ public class DOMContentUtils { if (anchorDepth > 1) { abort = true; break; - } + } } if (nodeType == Node.COMMENT_NODE) { walker.skipChildren(); @@ -156,44 +154,45 @@ public class DOMContentUtils { text = text.replaceAll("\\s+", " "); text = text.trim(); if (text.length() > 0) { - if (sb.length() > 0) sb.append(' '); - sb.append(text); + if (sb.length() > 0) + sb.append(' '); + sb.append(text); } } } - + return abort; } /** - * This method takes a {@link StringBuffer} and a DOM {@link Node}, - * and will append the content text found beneath the first - * <code>title</code> node to the <code>StringBuffer</code>. - * + * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will + * append the content text found beneath the first <code>title</code> node to + * the <code>StringBuffer</code>. + * * @return true if a title node was found, false otherwise */ public boolean getTitle(StringBuilder sb, Node node) { - + NodeWalker walker = new NodeWalker(node); - + while (walker.hasNext()) { - + Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); - + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD return false; } - + if (nodeType == Node.ELEMENT_NODE) { if ("title".equalsIgnoreCase(nodeName)) { getText(sb, currentNode); return true; } } - } - + } + return false; } @@ -201,28 +200,29 @@ public class DOMContentUtils { public URL getBase(Node node) { NodeWalker walker = new NodeWalker(node); - + while (walker.hasNext()) { - + Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); - + // is this node a BASE tag? if (nodeType == Node.ELEMENT_NODE) { - + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD return null; } - + if ("base".equalsIgnoreCase(nodeName)) { NamedNodeMap attrs = currentNode.getAttributes(); - for (int i= 0; i < attrs.getLength(); i++ ) { + for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); if ("href".equalsIgnoreCase(attr.getNodeName())) { try { return new URL(attr.getNodeValue()); - } catch (MalformedURLException e) {} + } catch (MalformedURLException e) { + } } } } @@ -233,10 +233,9 @@ public class DOMContentUtils { return null; } - private boolean hasOnlyWhiteSpace(Node node) { - String val= node.getNodeValue(); - for (int i= 0; i < val.length(); i++) { + String val = node.getNodeValue(); + for (int i = 0; i < val.length(); i++) { if (!Character.isWhitespace(val.charAt(i))) return false; } @@ -245,50 +244,49 @@ public class DOMContentUtils { // this only covers a few cases of empty links that are symptomatic // of nekohtml's DOM-fixup process... - private boolean shouldThrowAwayLink(Node node, NodeList children, - int childLen, LinkParams params) { + private boolean shouldThrowAwayLink(Node node, NodeList children, + int childLen, LinkParams params) { if (childLen == 0) { - // this has no inner structure - if (params.childLen == 0) return false; - else return true; - } else if ((childLen == 1) - && (children.item(0).getNodeType() == Node.ELEMENT_NODE) - && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { + // this has no inner structure + if (params.childLen == 0) + return false; + else + return true; + } else if ((childLen == 1) + && (children.item(0).getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { // single nested link return true; } else if (childLen == 2) { - Node c0= children.item(0); - Node c1= children.item(1); + Node c0 = children.item(0); + Node c1 = children.item(1); if ((c0.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c0.getNodeName())) - && (c1.getNodeType() == Node.TEXT_NODE) - && hasOnlyWhiteSpace(c1) ) { + && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) { // single link followed by whitespace node return true; } if ((c1.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c1.getNodeName())) - && (c0.getNodeType() == Node.TEXT_NODE) - && hasOnlyWhiteSpace(c0) ) { + && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) { // whitespace node followed by single link return true; } } else if (childLen == 3) { - Node c0= children.item(0); - Node c1= children.item(1); - Node c2= children.item(2); - + Node c0 = children.item(0); + Node c1 = children.item(1); + Node c2 = children.item(2); + if ((c1.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c1.getNodeName())) - && (c0.getNodeType() == Node.TEXT_NODE) - && (c2.getNodeType() == Node.TEXT_NODE) - && hasOnlyWhiteSpace(c0) - && hasOnlyWhiteSpace(c2) ) { + && (c0.getNodeType() == Node.TEXT_NODE) + && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0) + && hasOnlyWhiteSpace(c2)) { // single link surrounded by whitespace nodes return true; } @@ -296,57 +294,54 @@ public class DOMContentUtils { return false; } - + /** - * This method finds all anchors below the supplied DOM - * <code>node</code>, and creates appropriate {@link Outlink} - * records for each (relative to the supplied <code>base</code> - * URL), and adds them to the <code>outlinks</code> {@link - * ArrayList}. - * + * This method finds all anchors below the supplied DOM <code>node</code>, and + * creates appropriate {@link Outlink} records for each (relative to the + * supplied <code>base</code> URL), and adds them to the <code>outlinks</code> + * {@link ArrayList}. + * * <p> - * - * Links without inner structure (tags, text, etc) are discarded, as - * are links which contain only single nested links and empty text - * nodes (this is a common DOM-fixup artifact, at least with - * nekohtml). + * + * Links without inner structure (tags, text, etc) are discarded, as are links + * which contain only single nested links and empty text nodes (this is a + * common DOM-fixup artifact, at least with nekohtml). */ - public void getOutlinks(URL base, ArrayList<Outlink> outlinks, - Node node) { - + public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) { + NodeWalker walker = new NodeWalker(node); while (walker.hasNext()) { - + Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); - short nodeType = currentNode.getNodeType(); + short nodeType = currentNode.getNodeType(); NodeList children = currentNode.getChildNodes(); - int childLen = (children != null) ? children.getLength() : 0; - + int childLen = (children != null) ? children.getLength() : 0; + if (nodeType == Node.ELEMENT_NODE) { - + nodeName = nodeName.toLowerCase(); LinkParams params = linkParams.get(nodeName); if (params != null) { if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { - + StringBuilder linkText = new StringBuilder(); getText(linkText, currentNode, true); - + NamedNodeMap attrs = currentNode.getAttributes(); String target = null; boolean noFollow = false; boolean post = false; - for (int i= 0; i < attrs.getLength(); i++ ) { + for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); String attrName = attr.getNodeName(); if (params.attrName.equalsIgnoreCase(attrName)) { target = attr.getNodeValue(); - } else if ("rel".equalsIgnoreCase(attrName) && - "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + } else if ("rel".equalsIgnoreCase(attrName) + && "nofollow".equalsIgnoreCase(attr.getNodeValue())) { noFollow = true; - } else if ("method".equalsIgnoreCase(attrName) && - "post".equalsIgnoreCase(attr.getNodeValue())) { + } else if ("method".equalsIgnoreCase(attrName) + && "post".equalsIgnoreCase(attr.getNodeValue())) { post = true; } } @@ -354,18 +349,18 @@ public class DOMContentUtils { try { URL url = URLUtil.resolveURL(base, target); - outlinks.add(new Outlink(url.toString(), - linkText.toString().trim())); + outlinks.add(new Outlink(url.toString(), linkText.toString() + .trim())); } catch (MalformedURLException e) { // don't care } } // this should not have any children, skip them - if (params.childLen == 0) continue; + if (params.childLen == 0) + continue; } } } } } - Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Fri Jan 9 06:34:33 2015 @@ -23,32 +23,31 @@ import org.apache.nutch.parse.HTMLMetaTa import org.w3c.dom.*; /** - * Class for parsing META Directives from DOM trees. This class - * handles specifically Robots META directives (all, none, nofollow, - * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache - * instructions. All meta directives are stored in a HTMLMetaTags instance. + * Class for parsing META Directives from DOM trees. This class handles + * specifically Robots META directives (all, none, nofollow, noindex), finding + * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are + * stored in a HTMLMetaTags instance. */ public class HTMLMetaProcessor { /** - * Utility class with indicators for the robots directives "noindex" - * and "nofollow", and HTTP-EQUIV/no-cache + * Utility class with indicators for the robots directives "noindex" and + * "nofollow", and HTTP-EQUIV/no-cache */ - + /** - * Sets the indicators in <code>robotsMeta</code> to appropriate - * values, based on any META tags found under the given - * <code>node</code>. + * Sets the indicators in <code>robotsMeta</code> to appropriate values, based + * on any META tags found under the given <code>node</code>. */ - public static final void getMetaTags ( - HTMLMetaTags metaTags, Node node, URL currURL) { + public static final void getMetaTags(HTMLMetaTags metaTags, Node node, + URL currURL) { metaTags.reset(); getMetaTagsHelper(metaTags, node, currURL); } - private static final void getMetaTagsHelper( - HTMLMetaTags metaTags, Node node, URL currURL) { + private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node, + URL currURL) { if (node.getNodeType() == Node.ELEMENT_NODE) { @@ -63,7 +62,7 @@ public class HTMLMetaProcessor { Node equivNode = null; Node contentNode = null; // Retrieves name, http-equiv and content attribues - for (int i=0; i<attrs.getLength(); i++) { + for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); String attrName = attr.getNodeName().toLowerCase(); if (attrName.equals("name")) { @@ -74,44 +73,43 @@ public class HTMLMetaProcessor { contentNode = attr; } } - + if (nameNode != null) { if (contentNode != null) { String name = nameNode.getNodeValue().toLowerCase(); metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); if ("robots".equals(name)) { - + if (contentNode != null) { - String directives = - contentNode.getNodeValue().toLowerCase(); + String directives = contentNode.getNodeValue().toLowerCase(); int index = directives.indexOf("none"); - + if (index >= 0) { metaTags.setNoIndex(); metaTags.setNoFollow(); } - + index = directives.indexOf("all"); if (index >= 0) { // do nothing... } - + index = directives.indexOf("noindex"); if (index >= 0) { metaTags.setNoIndex(); } - + index = directives.indexOf("nofollow"); if (index >= 0) { metaTags.setNoFollow(); } - + index = directives.indexOf("noarchive"); if (index >= 0) { metaTags.setNoCache(); } - } - + } + } // end if (name == robots) } } @@ -124,14 +122,15 @@ public class HTMLMetaProcessor { if ("pragma".equals(name)) { content = content.toLowerCase(); int index = content.indexOf("no-cache"); - if (index >= 0) + if (index >= 0) metaTags.setNoCache(); } else if ("refresh".equals(name)) { int idx = content.indexOf(';'); String time = null; if (idx == -1) { // just the refresh time time = content; - } else time = content.substring(0, idx); + } else + time = content.substring(0, idx); try { metaTags.setRefreshTime(Integer.parseInt(time)); // skip this if we couldn't parse the time @@ -142,9 +141,11 @@ public class HTMLMetaProcessor { URL refreshUrl = null; if (metaTags.getRefresh() && idx != -1) { // set the URL idx = content.toLowerCase().indexOf("url="); - if (idx == -1) { // assume a mis-formatted entry with just the url + if (idx == -1) { // assume a mis-formatted entry with just the + // url idx = content.indexOf(';') + 1; - } else idx += 4; + } else + idx += 4; if (idx != -1) { String url = content.substring(idx); try { @@ -187,13 +188,13 @@ public class HTMLMetaProcessor { try { if (currURL == null) url = new URL(urlString); - else + else url = new URL(currURL, urlString); } catch (Exception e) { ; } - if (url != null) + if (url != null) metaTags.setBaseHref(url); } Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Fri Jan 9 06:34:33 2015 @@ -61,23 +61,23 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; public class HtmlParser implements Parser { - public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.html"); + public static final Logger LOG = LoggerFactory + .getLogger("org.apache.nutch.parse.html"); - // I used 1000 bytes at first, but found that some documents have + // I used 1000 bytes at first, but found that some documents have // meta tag well past the first 1000 bytes. // (e.g. http://cn.promo.yahoo.com/customcare/music.html) private static final int CHUNK_SIZE = 2000; // NUTCH-1006 Meta equiv with single quotes not accepted - private static Pattern metaPattern = - Pattern.compile("<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", - Pattern.CASE_INSENSITIVE); - private static Pattern charsetPattern = - Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", - Pattern.CASE_INSENSITIVE); - private static Pattern charsetPatternHTML5 = - Pattern.compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>", - Pattern.CASE_INSENSITIVE); + private static Pattern metaPattern = Pattern.compile( + "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", + Pattern.CASE_INSENSITIVE); + private static Pattern charsetPattern = Pattern.compile( + "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); + private static Pattern charsetPatternHTML5 = Pattern.compile( + "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>", + Pattern.CASE_INSENSITIVE); private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); @@ -89,19 +89,19 @@ public class HtmlParser implements Parse /** * Given a <code>ByteBuffer</code> representing an html file of an - * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag - * from the first <code>CHUNK_SIZE</code> bytes. - * If there's no meta tag for Content-Type or no charset is specified, - * the content is checked for a Unicode Byte Order Mark (BOM). - * This will also cover non-byte oriented character encodings (UTF-16 only). - * If no character set can be determined, - * <code>null</code> is returned. <br /> - * See also http://www.w3.org/International/questions/qa-html-encoding-declarations, + * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag + * from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for + * Content-Type or no charset is specified, the content is checked for a + * Unicode Byte Order Mark (BOM). This will also cover non-byte oriented + * character encodings (UTF-16 only). If no character set can be determined, + * <code>null</code> is returned. <br /> + * See also + * http://www.w3.org/International/questions/qa-html-encoding-declarations, * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and - * http://www.w3.org/TR/REC-xml/#sec-guessing - * <br /> - * - * @param content <code>ByteBuffer</code> representation of an html file + * http://www.w3.org/TR/REC-xml/#sec-guessing <br /> + * + * @param content + * <code>ByteBuffer</code> representation of an html file */ private static String sniffCharacterEncoding(ByteBuffer content) { @@ -113,8 +113,8 @@ public class HtmlParser implements Parse // {U+0041, U+0082, U+00B7}. String str = ""; try { - str = new String(content.array(), content.arrayOffset() + content.position(), - length, Charset.forName("ASCII").toString()); + str = new String(content.array(), content.arrayOffset() + + content.position(), length, Charset.forName("ASCII").toString()); } catch (UnsupportedEncodingException e) { // code should never come here, but just in case... return null; @@ -136,17 +136,14 @@ public class HtmlParser implements Parse } if (encoding == null) { // check for BOM - if (length >= 3 - && content.get(0) == (byte) 0xEF - && content.get(1) == (byte) 0xBB - && content.get(2) == (byte) 0xBF) { + if (length >= 3 && content.get(0) == (byte) 0xEF + && content.get(1) == (byte) 0xBB && content.get(2) == (byte) 0xBF) { encoding = "UTF-8"; } else if (length >= 2) { - if (content.get(0) == (byte)0xFF - && content.get(1) == (byte)0xFE) { + if (content.get(0) == (byte) 0xFF && content.get(1) == (byte) 0xFE) { encoding = "UTF-16LE"; - } else if (content.get(0) == (byte)0xFE - && content.get(1) == (byte)0xFF) { + } else if (content.get(0) == (byte) 0xFE + && content.get(1) == (byte) 0xFF) { encoding = "UTF-16BE"; } } @@ -184,19 +181,24 @@ public class HtmlParser implements Parse DocumentFragment root; try { ByteBuffer contentInOctets = page.getContent(); - InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets.array(), - contentInOctets.arrayOffset() + contentInOctets.position(), contentInOctets.remaining())); + InputSource input = new InputSource(new ByteArrayInputStream( + contentInOctets.array(), contentInOctets.arrayOffset() + + contentInOctets.position(), contentInOctets.remaining())); EncodingDetector detector = new EncodingDetector(conf); detector.autoDetectClues(page, true); detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed"); String encoding = detector.guessEncoding(page, defaultCharEncoding); - page.getMetadata().put(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING), ByteBuffer.wrap(Bytes.toBytes(encoding))); - page.getMetadata().put(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION), ByteBuffer.wrap(Bytes.toBytes(encoding))); + page.getMetadata().put(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING), + ByteBuffer.wrap(Bytes.toBytes(encoding))); + page.getMetadata().put(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION), + ByteBuffer.wrap(Bytes.toBytes(encoding))); input.setEncoding(encoding); - if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); } + if (LOG.isTraceEnabled()) { + LOG.trace("Parsing..."); + } root = parse(input); } catch (IOException e) { LOG.error("Failed with the following IOException: ", e); @@ -218,40 +220,47 @@ public class HtmlParser implements Parse LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); } // check meta directives - if (!metaTags.getNoIndex()) { // okay to index + if (!metaTags.getNoIndex()) { // okay to index StringBuilder sb = new StringBuilder(); - if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } - utils.getText(sb, root); // extract text + if (LOG.isTraceEnabled()) { + LOG.trace("Getting text..."); + } + utils.getText(sb, root); // extract text text = sb.toString(); sb.setLength(0); - if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); } - utils.getTitle(sb, root); // extract title + if (LOG.isTraceEnabled()) { + LOG.trace("Getting title..."); + } + utils.getTitle(sb, root); // extract title title = sb.toString().trim(); } - if (!metaTags.getNoFollow()) { // okay to follow links - ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks + if (!metaTags.getNoFollow()) { // okay to follow links + ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks URL baseTag = utils.getBase(root); - if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } - utils.getOutlinks(baseTag!=null?baseTag:base, l, root); + if (LOG.isTraceEnabled()) { + LOG.trace("Getting links..."); + } + utils.getOutlinks(baseTag != null ? baseTag : base, l, root); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { - LOG.trace("found "+outlinks.length+" outlinks in "+ url); + LOG.trace("found " + outlinks.length + " outlinks in " + url); } } ParseStatus status = ParseStatus.newBuilder().build(); - status.setMajorCode((int)ParseStatusCodes.SUCCESS); + status.setMajorCode((int) ParseStatusCodes.SUCCESS); if (metaTags.getRefresh()) { - status.setMinorCode((int)ParseStatusCodes.SUCCESS_REDIRECT); + status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT); status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString())); - status.getArgs().add(new Utf8(Integer.toString(metaTags.getRefreshTime()))); + status.getArgs().add( + new Utf8(Integer.toString(metaTags.getRefreshTime()))); } Parse parse = new Parse(text, title, outlinks, status); parse = htmlParseFilters.filter(url, page, parse, metaTags, root); - if (metaTags.getNoCache()) { // not okay to cache + if (metaTags.getNoCache()) { // not okay to cache page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY), ByteBuffer.wrap(Bytes.toBytes(cachingPolicy))); } @@ -262,7 +271,8 @@ public class HtmlParser implements Parse private DocumentFragment parse(InputSource input) throws Exception { if (parserImpl.equalsIgnoreCase("tagsoup")) return parseTagSoup(input); - else return parseNeko(input); + else + return parseNeko(input); } private DocumentFragment parseTagSoup(InputSource input) throws Exception { @@ -273,7 +283,8 @@ public class HtmlParser implements Parse reader.setContentHandler(builder); reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false); - reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder); + reader + .setProperty("http://xml.org/sax/properties/lexical-handler", builder); reader.parse(input); return frag; } @@ -281,21 +292,30 @@ public class HtmlParser implements Parse private DocumentFragment parseNeko(InputSource input) throws Exception { DOMFragmentParser parser = new DOMFragmentParser(); try { - parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", true); parser.setFeature("http://cyberneko.org/html/features/augmentations", true); - parser.setProperty("http://cyberneko.org/html/properties/default-encoding", + parser.setProperty( + "http://cyberneko.org/html/properties/default-encoding", defaultCharEncoding); - parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", - true); - parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", - false); - parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/ignore-specified-charset", + true); + parser + .setFeature( + "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", + false); + parser.setFeature( + "http://cyberneko.org/html/features/balance-tags/document-fragment", true); parser.setFeature("http://cyberneko.org/html/features/report-errors", LOG.isTraceEnabled()); - } catch (SAXException e) {} + } catch (SAXException e) { + } // convert Document to DocumentFragment HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); @@ -305,18 +325,21 @@ public class HtmlParser implements Parse res.appendChild(frag); try { - while(true) { + while (true) { frag = doc.createDocumentFragment(); parser.parse(input, frag); - if (!frag.hasChildNodes()) break; + if (!frag.hasChildNodes()) + break; if (LOG.isInfoEnabled()) { - LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes."); + LOG.info(" - new frag, " + frag.getChildNodes().getLength() + + " nodes."); } res.appendChild(frag); } - } catch (Exception x) { + } catch (Exception x) { LOG.error("Failed with the following Exception: ", x); - }; + } + ; return res; } @@ -341,11 +364,11 @@ public class HtmlParser implements Parse } public static void main(String[] args) throws Exception { - //LOG.setLevel(Level.FINE); + // LOG.setLevel(Level.FINE); String name = args[0]; - String url = "file:"+name; + String url = "file:" + name; File file = new File(name); - byte[] bytes = new byte[(int)file.length()]; + byte[] bytes = new byte[(int) file.length()]; DataInputStream in = new DataInputStream(new FileInputStream(file)); in.readFully(bytes); Configuration conf = NutchConfiguration.create(); @@ -356,8 +379,8 @@ public class HtmlParser implements Parse page.setContent(ByteBuffer.wrap(bytes)); page.setContentType(new Utf8("text/html")); Parse parse = parser.getParse(url, page); - System.out.println("title: "+parse.getTitle()); - System.out.println("text: "+parse.getText()); + System.out.println("title: " + parse.getTitle()); + System.out.println("text: " + parse.getText()); System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks())); } Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java Fri Jan 9 06:34:33 2015 @@ -26,40 +26,42 @@ package org.apache.nutch.parse.html; /** - * Class used to verify whether the specified <var>ch</var> - * conforms to the XML 1.0 definition of whitespace. + * Class used to verify whether the specified <var>ch</var> conforms to the XML + * 1.0 definition of whitespace. */ -public class XMLCharacterRecognizer -{ +public class XMLCharacterRecognizer { /** - * Returns whether the specified <var>ch</var> conforms to the XML 1.0 definition - * of whitespace. Refer to <A href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> - * the definition of <CODE>S</CODE></A> for details. - * @param ch Character to check as XML whitespace. + * Returns whether the specified <var>ch</var> conforms to the XML 1.0 + * definition of whitespace. Refer to <A + * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of + * <CODE>S</CODE></A> for details. + * + * @param ch + * Character to check as XML whitespace. * @return =true if <var>ch</var> is XML whitespace; otherwise =false. */ - public static boolean isWhiteSpace(char ch) - { + public static boolean isWhiteSpace(char ch) { return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA); } /** * Tell if the string is whitespace. - * - * @param ch Character array to check as XML whitespace. - * @param start Start index of characters in the array - * @param length Number of characters in the array - * @return True if the characters in the array are - * XML whitespace; otherwise, false. + * + * @param ch + * Character array to check as XML whitespace. + * @param start + * Start index of characters in the array + * @param length + * Number of characters in the array + * @return True if the characters in the array are XML whitespace; otherwise, + * false. */ - public static boolean isWhiteSpace(char ch[], int start, int length) - { + public static boolean isWhiteSpace(char ch[], int start, int length) { int end = start + length; - for (int s = start; s < end; s++) - { + for (int s = start; s < end; s++) { if (!isWhiteSpace(ch[s])) return false; } @@ -69,39 +71,36 @@ public class XMLCharacterRecognizer /** * Tell if the string is whitespace. - * - * @param buf StringBuffer to check as XML whitespace. + * + * @param buf + * StringBuffer to check as XML whitespace. * @return True if characters in buffer are XML whitespace, false otherwise */ - public static boolean isWhiteSpace(StringBuffer buf) - { + public static boolean isWhiteSpace(StringBuffer buf) { int n = buf.length(); - for (int i = 0; i < n; i++) - { + for (int i = 0; i < n; i++) { if (!isWhiteSpace(buf.charAt(i))) return false; } return true; } - + /** * Tell if the string is whitespace. - * - * @param s String to check as XML whitespace. + * + * @param s + * String to check as XML whitespace. * @return True if characters in buffer are XML whitespace, false otherwise */ - public static boolean isWhiteSpace(String s) - { + public static boolean isWhiteSpace(String s) { - if(null != s) - { + if (null != s) { int n = s.length(); - - for (int i = 0; i < n; i++) - { + + for (int i = 0; i < n; i++) { if (!isWhiteSpace(s.charAt(i))) return false; }