Modified: nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -38,101 +38,101 @@ import java.util.StringTokenizer; /** Adds basic searchable fields to a document. */ public class CCIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(CCIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(CCIndexingFilter.class); - /** The name of the document field we use. */ - public static String FIELD = "cc"; + /** The name of the document field we use. */ + public static String FIELD = "cc"; - private Configuration conf; + private Configuration conf; - private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); + private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); + + static { + FIELDS.add(WebPage.Field.BASE_URL); + FIELDS.add(WebPage.Field.METADATA); + } + + /** + * Add the features represented by a license URL. Urls are of the form + * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a + * license feature. + */ + public void addUrlFeatures(NutchDocument doc, String urlString) { + try { + URL url = new URL(urlString); + + // tokenize the path of the url, breaking at slashes and dashes + StringTokenizer names = new StringTokenizer(url.getPath(), "/-"); + + if (names.hasMoreTokens()) + names.nextToken(); // throw away "licenses" + + // add a feature per component after "licenses" + while (names.hasMoreTokens()) { + String feature = names.nextToken(); + addFeature(doc, feature); + } + } catch (MalformedURLException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("CC: failed to parse url: " + urlString + " : " + e); + } + } + } + + private void addFeature(NutchDocument doc, String feature) { + doc.add(FIELD, feature); + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } + + @Override + public Collection<Field> getFields() { + return FIELDS; + } + + @Override + public NutchDocument filter(NutchDocument doc, String url, WebPage page) + throws IndexingException { + + ByteBuffer blicense = page.getMetadata().get( + new Utf8(CreativeCommons.LICENSE_URL)); + if (blicense != null) { + String licenseUrl = Bytes.toString(blicense); + if (LOG.isInfoEnabled()) { + LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString()); + } + + // add the entire license as cc:license=xxx + addFeature(doc, "license=" + licenseUrl); + + // index license attributes extracted of the license url + addUrlFeatures(doc, licenseUrl); + } + + // index the license location as cc:meta=xxx + ByteBuffer blicenseloc = page.getMetadata().get( + new Utf8(CreativeCommons.LICENSE_LOCATION)); + if (blicenseloc != null) { + String licenseLocation = Bytes.toString(blicenseloc); + addFeature(doc, "meta=" + licenseLocation); + } + + // index the work type cc:type=xxx + ByteBuffer bworkType = page.getMetadata().get( + new Utf8(CreativeCommons.WORK_TYPE)); + if (bworkType != null) { + String workType = Bytes.toString(bworkType); + addFeature(doc, workType); + } - static { - FIELDS.add(WebPage.Field.BASE_URL); - FIELDS.add(WebPage.Field.METADATA); - } - - /** - * Add the features represented by a license URL. Urls are of the form - * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a - * license feature. - */ - public void addUrlFeatures(NutchDocument doc, String urlString) { - try { - URL url = new URL(urlString); - - // tokenize the path of the url, breaking at slashes and dashes - StringTokenizer names = new StringTokenizer(url.getPath(), "/-"); - - if (names.hasMoreTokens()) - names.nextToken(); // throw away "licenses" - - // add a feature per component after "licenses" - while (names.hasMoreTokens()) { - String feature = names.nextToken(); - addFeature(doc, feature); - } - } catch (MalformedURLException e) { - if (LOG.isWarnEnabled()) { - LOG.warn("CC: failed to parse url: " + urlString + " : " + e); - } - } - } - - private void addFeature(NutchDocument doc, String feature) { - doc.add(FIELD, feature); - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } - - @Override - public Collection<Field> getFields() { - return FIELDS; - } - - @Override - public NutchDocument filter(NutchDocument doc, String url, WebPage page) - throws IndexingException { - - ByteBuffer blicense = page.getMetadata().get(new Utf8( - CreativeCommons.LICENSE_URL)); - if (blicense != null) { - String licenseUrl = Bytes.toString(blicense); - if (LOG.isInfoEnabled()) { - LOG.info("CC: indexing " + licenseUrl + " for: " - + url.toString()); - } - - // add the entire license as cc:license=xxx - addFeature(doc, "license=" + licenseUrl); - - // index license attributes extracted of the license url - addUrlFeatures(doc, licenseUrl); - } - - // index the license location as cc:meta=xxx - ByteBuffer blicenseloc = page.getMetadata().get(new Utf8( - CreativeCommons.LICENSE_LOCATION)); - if (blicenseloc != null) { - String licenseLocation = Bytes.toString(blicenseloc); - addFeature(doc, "meta=" + licenseLocation); - } - - // index the work type cc:type=xxx - ByteBuffer bworkType = page.getMetadata().get(new Utf8( - CreativeCommons.WORK_TYPE)); - if (bworkType != null) { - String workType = Bytes.toString(bworkType); - addFeature(doc, workType); - } - - return doc; - } + return doc; + } }
Modified: nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original) +++ nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Fri Jan 9 06:34:33 2015 @@ -55,8 +55,8 @@ public class CCParseFilter implements Pa } /** Scan the document adding attributes to metadata. */ - public static void walk(Node doc, URL base, WebPage page, - Configuration conf) throws ParseException { + public static void walk(Node doc, URL base, WebPage page, Configuration conf) + throws ParseException { // walk the DOM tree, scanning for license data Walker walker = new Walker(base); @@ -67,36 +67,37 @@ public class CCParseFilter implements Pa String licenseLocation = null; if (walker.rdfLicense != null) { // 1st choice: subject in RDF licenseLocation = "rdf"; - licenseUrl = walker.rdfLicense; + licenseUrl = walker.rdfLicense; } else if (walker.relLicense != null) { // 2nd: anchor w/ // rel=license licenseLocation = "rel"; licenseUrl = walker.relLicense.toString(); } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC // license - licenseLocation = "a"; - licenseUrl = walker.anchorLicense.toString(); + licenseLocation = "a"; + licenseUrl = walker.anchorLicense.toString(); } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) { - throw new ParseException("No CC license. Excluding."); + throw new ParseException("No CC license. Excluding."); } // add license to metadata if (licenseUrl != null) { if (LOG.isDebugEnabled()) { - LOG.debug("CC: found " + licenseUrl + " in " + licenseLocation + " of " + base); - } - page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL), - ByteBuffer.wrap(licenseUrl.getBytes())); - page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION), - ByteBuffer.wrap(licenseLocation.getBytes())); + LOG.debug("CC: found " + licenseUrl + " in " + licenseLocation + + " of " + base); + } + page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL), + ByteBuffer.wrap(licenseUrl.getBytes())); + page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION), + ByteBuffer.wrap(licenseLocation.getBytes())); } if (walker.workType != null) { if (LOG.isDebugEnabled()) { - LOG.debug("CC: found " + walker.workType + " in " + base); - } - page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE), - ByteBuffer.wrap(walker.workType.getBytes())); + LOG.debug("CC: found " + walker.workType + " in " + base); + } + page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE), + ByteBuffer.wrap(walker.workType.getBytes())); } } @@ -121,8 +122,8 @@ public class CCParseFilter implements Pa } /** - * Extract license url from element, if any. Thse are the href attribute - * of anchor elements with rel="license". These must also point to + * Extract license url from element, if any. Thse are the href attribute of + * anchor elements with rel="license". These must also point to * http://creativecommons.org/licenses/. */ private void findLicenseUrl(Element element) { @@ -137,27 +138,27 @@ public class CCParseFilter implements Pa try { URL url = new URL(base, href); // resolve the url // check that it's a CC license URL - if ("http".equalsIgnoreCase(url.getProtocol()) - && "creativecommons.org".equalsIgnoreCase(url.getHost()) - && url.getPath() != null && url.getPath().startsWith("/licenses/") - && url.getPath().length() > "/licenses/".length()) { - - // check rel="license" - String rel = element.getAttribute("rel"); - if (rel != null && "license".equals(rel) - && this.relLicense == null) { - this.relLicense = url; // found rel license - } else if (this.anchorLicense == null) { - this.anchorLicense = url; // found anchor license - } - } + if ("http".equalsIgnoreCase(url.getProtocol()) + && "creativecommons.org".equalsIgnoreCase(url.getHost()) + && url.getPath() != null && url.getPath().startsWith("/licenses/") + && url.getPath().length() > "/licenses/".length()) { + + // check rel="license" + String rel = element.getAttribute("rel"); + if (rel != null && "license".equals(rel) && this.relLicense == null) { + this.relLicense = url; // found rel license + } else if (this.anchorLicense == null) { + this.anchorLicense = url; // found anchor license + } + } } catch (MalformedURLException e) { // ignore malformed urls } } /** Configure a namespace aware XML parser. */ - private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory.newInstance(); - + private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory + .newInstance(); + static { FACTORY.setNamespaceAware(true); } @@ -177,129 +178,132 @@ public class CCParseFilter implements Pa if (rdfPosition < 0) return; // no RDF, abort int nsPosition = comment.indexOf(CC_NS); - if (nsPosition < 0) - return; // no RDF, abort - // try to parse the XML - Document doc; - try { - DocumentBuilder parser = FACTORY.newDocumentBuilder(); - doc = parser.parse(new InputSource(new StringReader(comment))); - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - LOG.warn("CC: Failed to parse RDF in " + base + ": " + e); - } - // e.printStackTrace(); - return; - } - - // check that root is rdf:RDF - NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF"); - if (roots.getLength() != 1) { - if (LOG.isWarnEnabled()) { - LOG.warn("CC: No RDF root in " + base); - } - return; - } - Element rdf = (Element) roots.item(0); - - // get cc:License nodes inside rdf:RDF - NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License"); - for (int i = 0; i < licenses.getLength(); i++) { - Element l = (Element) licenses.item(i); - // license is rdf:about= attribute from cc:License - this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue(); - - // walk predicates of cc:License - NodeList predicates = l.getChildNodes(); - for (int j = 0; j < predicates.getLength(); j++) { - Node predicateNode = predicates.item(j); - if (!(predicateNode instanceof Element)) - continue; - Element predicateElement = (Element) predicateNode; - // extract predicates of cc:xxx predicates - if (!CC_NS.equals(predicateElement.getNamespaceURI())) { - continue; - } - String predicate = predicateElement.getLocalName(); - // object is rdf:resource from cc:xxx predicates - String object = predicateElement.getAttributeNodeNS(RDF_NS, "resource").getValue(); - // add object and predicate to metadata - // metadata.put(object, predicate); - //if (LOG.isInfoEnabled()) { - // LOG.info("CC: found: "+predicate+"="+object); - // } - } - } - - // get cc:Work nodes from rdf:RDF - NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work"); - for (int i = 0; i < works.getLength(); i++) { - Element l = (Element) works.item(i); - - // get dc:type nodes from cc:Work - NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type"); - for (int j = 0; j < types.getLength(); j++) { - Element type = (Element) types.item(j); - String workUri = type.getAttributeNodeNS(RDF_NS, "resource").getValue(); - this.workType = (String) WORK_TYPE_NAMES.get(workUri); - break; - } - } - } - } - - private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); - - static { - FIELDS.add(WebPage.Field.BASE_URL); - FIELDS.add(WebPage.Field.METADATA); - } - - private static final HashMap<String,String> WORK_TYPE_NAMES = new HashMap<String,String>(); - - static { - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software"); - WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image"); - } - - private Configuration conf; - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } - - @Override - public Collection<Field> getFields() { - return FIELDS; - } - - /** - * Adds metadata or otherwise modifies a parse of an HTML document, given - * the DOM tree of a page. - */ - @Override - public Parse filter(String url, WebPage page, Parse parse, - HTMLMetaTags metaTags, DocumentFragment doc) { - // construct base url - URL base; + if (nsPosition < 0) + return; // no RDF, abort + // try to parse the XML + Document doc; try { - base = new URL(page.getBaseUrl().toString()); - // extract license metadata - Walker.walk(doc, base, page, getConf()); + DocumentBuilder parser = FACTORY.newDocumentBuilder(); + doc = parser.parse(new InputSource(new StringReader(comment))); } catch (Exception e) { - LOG.error("Error parsing " + url, e); - return ParseStatusUtils.getEmptyParse(e, getConf()); + if (LOG.isWarnEnabled()) { + LOG.warn("CC: Failed to parse RDF in " + base + ": " + e); + } + // e.printStackTrace(); + return; + } + + // check that root is rdf:RDF + NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF"); + if (roots.getLength() != 1) { + if (LOG.isWarnEnabled()) { + LOG.warn("CC: No RDF root in " + base); + } + return; } + Element rdf = (Element) roots.item(0); - return parse; + // get cc:License nodes inside rdf:RDF + NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License"); + for (int i = 0; i < licenses.getLength(); i++) { + Element l = (Element) licenses.item(i); + // license is rdf:about= attribute from cc:License + this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue(); + + // walk predicates of cc:License + NodeList predicates = l.getChildNodes(); + for (int j = 0; j < predicates.getLength(); j++) { + Node predicateNode = predicates.item(j); + if (!(predicateNode instanceof Element)) + continue; + Element predicateElement = (Element) predicateNode; + // extract predicates of cc:xxx predicates + if (!CC_NS.equals(predicateElement.getNamespaceURI())) { + continue; + } + String predicate = predicateElement.getLocalName(); + // object is rdf:resource from cc:xxx predicates + String object = predicateElement.getAttributeNodeNS(RDF_NS, + "resource").getValue(); + // add object and predicate to metadata + // metadata.put(object, predicate); + // if (LOG.isInfoEnabled()) { + // LOG.info("CC: found: "+predicate+"="+object); + // } + } + } + + // get cc:Work nodes from rdf:RDF + NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work"); + for (int i = 0; i < works.getLength(); i++) { + Element l = (Element) works.item(i); + + // get dc:type nodes from cc:Work + NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type"); + for (int j = 0; j < types.getLength(); j++) { + Element type = (Element) types.item(j); + String workUri = type.getAttributeNodeNS(RDF_NS, "resource") + .getValue(); + this.workType = (String) WORK_TYPE_NAMES.get(workUri); + break; + } + } + } + } + + private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); + + static { + FIELDS.add(WebPage.Field.BASE_URL); + FIELDS.add(WebPage.Field.METADATA); + } + + private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>(); + + static { + WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video"); + WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image"); + WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio"); + WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text"); + WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", + "interactive"); + WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software"); + WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image"); + } + + private Configuration conf; + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } + + @Override + public Collection<Field> getFields() { + return FIELDS; + } + + /** + * Adds metadata or otherwise modifies a parse of an HTML document, given the + * DOM tree of a page. + */ + @Override + public Parse filter(String url, WebPage page, Parse parse, + HTMLMetaTags metaTags, DocumentFragment doc) { + // construct base url + URL base; + try { + base = new URL(page.getBaseUrl().toString()); + // extract license metadata + Walker.walk(doc, base, page, getConf()); + } catch (Exception e) { + LOG.error("Error parsing " + url, e); + return ParseStatusUtils.getEmptyParse(e, getConf()); } + + return parse; + } } Modified: nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original) +++ nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Fri Jan 9 06:34:33 2015 @@ -36,52 +36,50 @@ import static org.junit.Assert.assertEqu public class TestCCParseFilter { - private static final File testDir = new File( - System.getProperty("test.input")); + private static final File testDir = new File(System.getProperty("test.input")); @Test - public void testPages() throws Exception { - pageTest(new File(testDir, "anchor.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null); - // Tika returns <a> whereas parse-html returns <rel> - // check later - pageTest(new File(testDir, "rel.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc/2.0", "rel", null); - // Tika returns <a> whereas parse-html returns <rdf> - // check later - pageTest(new File(testDir, "rdf.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text"); - } - - public void pageTest(File file, String url, String license, - String location, String type) throws Exception { - - InputStream in = new FileInputStream(file); - ByteArrayOutputStream out = new ByteArrayOutputStream( - (int) file.length()); - byte[] buffer = new byte[1024]; - int i; - while ((i = in.read(buffer)) != -1) { - out.write(buffer, 0, i); - } - in.close(); - byte[] bytes = out.toByteArray(); - Configuration conf = NutchConfiguration.create(); - - WebPage page = WebPage.newBuilder().build(); - page.setBaseUrl(new Utf8(url)); - page.setContent(ByteBuffer.wrap(bytes)); - MimeUtil mimeutil = new MimeUtil(conf); - String mtype = mimeutil.getMimeType(file); - page.setContentType(new Utf8(mtype)); - - new ParseUtil(conf).parse(url, page); - - ByteBuffer bb = page.getMetadata().get(new Utf8("License-Url")); - assertEquals(license, Bytes.toString(bb)); - bb = page.getMetadata().get(new Utf8("License-Location")); - assertEquals(location, Bytes.toString(bb)); - bb = page.getMetadata().get(new Utf8("Work-Type")); - assertEquals(type, Bytes.toString(bb)); - } + public void testPages() throws Exception { + pageTest(new File(testDir, "anchor.html"), "http://foo.com/", + "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null); + // Tika returns <a> whereas parse-html returns <rel> + // check later + pageTest(new File(testDir, "rel.html"), "http://foo.com/", + "http://creativecommons.org/licenses/by-nc/2.0", "rel", null); + // Tika returns <a> whereas parse-html returns <rdf> + // check later + pageTest(new File(testDir, "rdf.html"), "http://foo.com/", + "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text"); + } + + public void pageTest(File file, String url, String license, String location, + String type) throws Exception { + + InputStream in = new FileInputStream(file); + ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length()); + byte[] buffer = new byte[1024]; + int i; + while ((i = in.read(buffer)) != -1) { + out.write(buffer, 0, i); + } + in.close(); + byte[] bytes = out.toByteArray(); + Configuration conf = NutchConfiguration.create(); + + WebPage page = WebPage.newBuilder().build(); + page.setBaseUrl(new Utf8(url)); + page.setContent(ByteBuffer.wrap(bytes)); + MimeUtil mimeutil = new MimeUtil(conf); + String mtype = mimeutil.getMimeType(file); + page.setContentType(new Utf8(mtype)); + + new ParseUtil(conf).parse(url, page); + + ByteBuffer bb = page.getMetadata().get(new Utf8("License-Url")); + assertEquals(license, Bytes.toString(bb)); + bb = page.getMetadata().get(new Utf8("License-Location")); + assertEquals(location, Bytes.toString(bb)); + bb = page.getMetadata().get(new Utf8("Work-Type")); + assertEquals(type, Bytes.toString(bb)); + } } Modified: nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -32,13 +32,15 @@ import java.util.HashSet; import java.util.Map.Entry; /** - * Indexing filter that offers an option to either index all inbound anchor text for - * a document or deduplicate anchors. Deduplication does have it's con's, + * Indexing filter that offers an option to either index all inbound anchor text + * for a document or deduplicate anchors. Deduplication does have it's con's, + * * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. */ public class AnchorIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(AnchorIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(AnchorIndexingFilter.class); private Configuration conf; private boolean deduplicate = false; @@ -47,7 +49,7 @@ public class AnchorIndexingFilter implem static { FIELDS.add(WebPage.Field.INLINKS); } - + /** * Set the {@link Configuration} object */ @@ -57,40 +59,44 @@ public class AnchorIndexingFilter implem deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false); LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off")); } - + /** * Get the {@link Configuration} object */ public Configuration getConf() { return this.conf; } - + public void addIndexBackendOptions(Configuration conf) { } - + /** - * The {@link AnchorIndexingFilter} filter object which supports boolean - * configuration settings for the deduplication of anchors. - * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. - * - * @param doc The {@link NutchDocument} object - * @param url URL to be filtered for anchor text - * @param page {@link WebPage} object relative to the URL + * The {@link AnchorIndexingFilter} filter object which supports boolean + * configuration settings for the deduplication of anchors. See + * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. + * + * @param doc + * The {@link NutchDocument} object + * @param url + * URL to be filtered for anchor text + * @param page + * {@link WebPage} object relative to the URL * @return filtered NutchDocument */ @Override public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException { HashSet<String> set = null; - + for (Entry<CharSequence, CharSequence> e : page.getInlinks().entrySet()) { String anchor = TableUtil.toString(e.getValue()); - - if(anchor.equals("")) + + if (anchor.equals("")) continue; - + if (deduplicate) { - if (set == null) set = new HashSet<String>(); + if (set == null) + set = new HashSet<String>(); String lcAnchor = anchor.toLowerCase(); // Check if already processed the current anchor @@ -104,15 +110,14 @@ public class AnchorIndexingFilter implem doc.add("anchor", anchor); } } - + return doc; } - + /** - * Gets all the fields for a given {@link WebPage} - * Many datastores need to setup the mapreduce job by specifying the fields - * needed. All extensions that work on WebPage are able to specify what fields - * they need. + * Gets all the fields for a given {@link WebPage} Many datastores need to + * setup the mapreduce job by specifying the fields needed. All extensions + * that work on WebPage are able to specify what fields they need. */ @Override public Collection<WebPage.Field> getFields() { Modified: nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -25,13 +25,12 @@ import org.junit.Test; import static org.junit.Assert.*; /** - * JUnit test case which tests - * 1. that anchor text is obtained - * 2. that anchor deduplication functionality is working - * + * JUnit test case which tests 1. that anchor text is obtained 2. that anchor + * deduplication functionality is working + * */ public class TestAnchorIndexingFilter { - + @Test public void testDeduplicateAnchor() throws Exception { Configuration conf = NutchConfiguration.create(); @@ -40,14 +39,19 @@ public class TestAnchorIndexingFilter { filter.setConf(conf); NutchDocument doc = new NutchDocument(); WebPage page = WebPage.newBuilder().build(); - page.getInlinks().put(new Utf8("http://example1.com/"), new Utf8("cool site")); - page.getInlinks().put(new Utf8("http://example2.com/"), new Utf8("cool site")); - page.getInlinks().put(new Utf8("http://example3.com/"), new Utf8("fun site")); + page.getInlinks().put(new Utf8("http://example1.com/"), + new Utf8("cool site")); + page.getInlinks().put(new Utf8("http://example2.com/"), + new Utf8("cool site")); + page.getInlinks().put(new Utf8("http://example3.com/"), + new Utf8("fun site")); filter.filter(doc, "http://myurldoesnotmatter.com/", page); - - assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor")); - - assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor").size()); + + assertTrue("test if there is an anchor at all", doc.getFieldNames() + .contains("anchor")); + + assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor") + .size()); } } Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -36,17 +36,17 @@ import java.util.Collection; import java.util.Date; import java.util.HashSet; -/** Adds basic searchable fields to a document. The fields are: - * host - add host as un-stored, indexed and tokenized - * url - url is both stored and indexed, so it's both searchable and returned. - * This is also a required field. - * content - content is indexed, so that it's searchable, but not stored in index - * title - title is stored and indexed - * cache - add cached content/summary display policy, if available - * tstamp - add timestamp when fetched, for deduplication +/** + * Adds basic searchable fields to a document. The fields are: host - add host + * as un-stored, indexed and tokenized url - url is both stored and indexed, so + * it's both searchable and returned. This is also a required field. content - + * content is indexed, so that it's searchable, but not stored in index title - + * title is stored and indexed cache - add cached content/summary display + * policy, if available tstamp - add timestamp when fetched, for deduplication */ public class BasicIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(BasicIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(BasicIndexingFilter.class); private int MAX_TITLE_LENGTH; private Configuration conf; @@ -60,22 +60,25 @@ public class BasicIndexingFilter impleme } /** - * The {@link BasicIndexingFilter} filter object which supports boolean - * configurable value for length of characters permitted within the - * title @see {@code indexer.max.title.length} in nutch-default.xml - * - * @param doc The {@link NutchDocument} object - * @param url URL to be filtered for anchor text - * @param page {@link WebPage} object relative to the URL + * The {@link BasicIndexingFilter} filter object which supports boolean + * configurable value for length of characters permitted within the title @see + * {@code indexer.max.title.length} in nutch-default.xml + * + * @param doc + * The {@link NutchDocument} object + * @param url + * URL to be filtered for anchor text + * @param page + * {@link WebPage} object relative to the URL * @return filtered NutchDocument */ public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException { String reprUrl = null; -// if (page.isReadable(WebPage.Field.REPR_URL.getIndex())) { - reprUrl = TableUtil.toString(page.getReprUrl()); -// } + // if (page.isReadable(WebPage.Field.REPR_URL.getIndex())) { + reprUrl = TableUtil.toString(page.getReprUrl()); + // } String host = null; try { @@ -103,7 +106,10 @@ public class BasicIndexingFilter impleme // title String title = TableUtil.toString(page.getTitle()); - if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate title if needed + if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate + // title + // if + // needed title = title.substring(0, MAX_TITLE_LENGTH); } if (title.length() > 0) { @@ -111,15 +117,16 @@ public class BasicIndexingFilter impleme doc.add("title", title); } // add cached content/summary display policy, if available - ByteBuffer cachingRaw = page - .getMetadata().get(Nutch.CACHING_FORBIDDEN_KEY_UTF8); + ByteBuffer cachingRaw = page.getMetadata().get( + Nutch.CACHING_FORBIDDEN_KEY_UTF8); String caching = Bytes.toString(cachingRaw); if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { doc.add("cache", caching); } // add timestamp when fetched, for deduplication - String tstamp = DateUtil.getThreadLocalDateFormat().format(new Date(page.getFetchTime())); + String tstamp = DateUtil.getThreadLocalDateFormat().format( + new Date(page.getFetchTime())); doc.add("tstamp", tstamp); return doc; @@ -134,7 +141,8 @@ public class BasicIndexingFilter impleme public void setConf(Configuration conf) { this.conf = conf; this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); - LOG.info("Maximum title length for indexing set to: " + this.MAX_TITLE_LENGTH); + LOG.info("Maximum title length for indexing set to: " + + this.MAX_TITLE_LENGTH); } /** @@ -145,10 +153,9 @@ public class BasicIndexingFilter impleme } /** - * Gets all the fields for a given {@link WebPage} - * Many datastores need to setup the mapreduce job by specifying the fields - * needed. All extensions that work on WebPage are able to specify what fields - * they need. + * Gets all the fields for a given {@link WebPage} Many datastores need to + * setup the mapreduce job by specifying the fields needed. All extensions + * that work on WebPage are able to specify what fields they need. */ @Override public Collection<WebPage.Field> getFields() { Modified: nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -29,66 +29,69 @@ import java.nio.ByteBuffer; import static org.junit.Assert.*; /** - * JUnit test case which tests - * 1. that the host, url, content, title, cache and tstamp fields - * are obtained by the filter. - * 2. that configurable maximum length functionality for titles actually works. . - * This property defaults at 100 characters @see {@code indexer.max.title.length} - * in nutch-default.xml but has been set to 10 for this test. + * JUnit test case which tests 1. that the host, url, content, title, cache and + * tstamp fields are obtained by the filter. 2. that configurable maximum length + * functionality for titles actually works. . This property defaults at 100 + * characters @see {@code indexer.max.title.length} in nutch-default.xml but has + * been set to 10 for this test. * * @author lewismc */ public class TestBasicIndexingFilter { - + @Test public void testBasicFields() throws Exception { - Configuration conf = NutchConfiguration.create(); - BasicIndexingFilter filter = new BasicIndexingFilter(); - filter.setConf(conf); - assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - WebPage page = WebPage.newBuilder().build(); - page.getInlinks().put(new Utf8("http://nutch.apache.org/"), new Utf8("Welcome to Nutch")); - page.setTitle(new Utf8("Welcome to Nutch")); + Configuration conf = NutchConfiguration.create(); + BasicIndexingFilter filter = new BasicIndexingFilter(); + filter.setConf(conf); + assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + WebPage page = WebPage.newBuilder().build(); + page.getInlinks().put(new Utf8("http://nutch.apache.org/"), + new Utf8("Welcome to Nutch")); + page.setTitle(new Utf8("Welcome to Nutch")); page.setReprUrl(new Utf8("http://www.urldoesnotmatter.org")); byte[] bytes = new byte[10]; ByteBuffer bbuf = ByteBuffer.wrap(bytes); page.getMetadata().put(Nutch.CACHING_FORBIDDEN_KEY_UTF8, bbuf); page.setFetchTime(System.currentTimeMillis()); - try { - filter.filter(doc, "http://www.apache.org/", page); - } catch(Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - assertNotNull(doc); - assertTrue("check for host field ", doc.getFieldNames().contains("host")); - assertTrue("check for url field", doc.getFieldNames().contains("url")); - assertTrue("check for content field", doc.getFieldNames().contains("content")); - assertTrue("check for title field", doc.getFieldNames().contains("title")); - assertTrue("check for cache field", doc.getFieldNames().contains("cache")); - assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp")); + try { + filter.filter(doc, "http://www.apache.org/", page); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + assertNotNull(doc); + assertTrue("check for host field ", doc.getFieldNames().contains("host")); + assertTrue("check for url field", doc.getFieldNames().contains("url")); + assertTrue("check for content field", + doc.getFieldNames().contains("content")); + assertTrue("check for title field", doc.getFieldNames().contains("title")); + assertTrue("check for cache field", doc.getFieldNames().contains("cache")); + assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp")); } - + @Test public void testTitleFieldLength() throws Exception { - Configuration conf = NutchConfiguration.create(); - conf.setInt("indexer.max.title.length", 10); - BasicIndexingFilter filter = new BasicIndexingFilter(); - filter.setConf(conf); - assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - WebPage page = WebPage.newBuilder().build(); - page.getInlinks().put(new Utf8("http://exceedmaximumtitleurl.org/"), new Utf8("exceeding title site")); - page.setTitle(new Utf8("This title exceeds maximum characters")); - try { - filter.filter(doc, "http://www.apache.org/", page); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - assertNotNull(doc); - assertEquals("assert title field only has 10 characters", 10, doc.getFieldValue("title").length()); + Configuration conf = NutchConfiguration.create(); + conf.setInt("indexer.max.title.length", 10); + BasicIndexingFilter filter = new BasicIndexingFilter(); + filter.setConf(conf); + assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + WebPage page = WebPage.newBuilder().build(); + page.getInlinks().put(new Utf8("http://exceedmaximumtitleurl.org/"), + new Utf8("exceeding title site")); + page.setTitle(new Utf8("This title exceeds maximum characters")); + try { + filter.filter(doc, "http://www.apache.org/", page); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + assertNotNull(doc); + assertEquals("assert title field only has 10 characters", 10, doc + .getFieldValue("title").length()); } } Modified: nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java (original) +++ nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java Fri Jan 9 06:34:33 2015 @@ -42,7 +42,7 @@ import org.apache.nutch.util.Bytes; public class MetadataIndexer implements IndexingFilter { private Configuration conf; - private static Map<Utf8,String> parseFieldnames; + private static Map<Utf8, String> parseFieldnames; private static final String PARSE_CONF_PROPERTY = "index.metadata"; private static final String INDEX_PREFIX = "meta_"; private static final String PARSE_META_PREFIX = "meta_"; @@ -56,7 +56,7 @@ public class MetadataIndexer implements // add the fields from parsemd if (parseFieldnames != null) { - for (Entry<Utf8,String> metatag : parseFieldnames.entrySet()) { + for (Entry<Utf8, String> metatag : parseFieldnames.entrySet()) { ByteBuffer bvalues = page.getMetadata().get(metatag.getKey()); if (bvalues != null) { String key = metatag.getValue(); @@ -75,7 +75,7 @@ public class MetadataIndexer implements public void setConf(Configuration conf) { this.conf = conf; String[] metatags = conf.getStrings(PARSE_CONF_PROPERTY); - parseFieldnames = new TreeMap<Utf8,String>(); + parseFieldnames = new TreeMap<Utf8, String>(); for (int i = 0; i < metatags.length; i++) { parseFieldnames.put( new Utf8(PARSE_META_PREFIX + metatags[i].toLowerCase(Locale.ROOT)), Modified: nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java (original) +++ nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * Metadata may come from CrawlDb, parse or content metadata. */ package org.apache.nutch.indexer.metadata; + Modified: nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -30,10 +30,12 @@ import org.slf4j.LoggerFactory; * Add (or reset) a few metaData properties as respective fields (if they are * available), so that they can be accurately used within the search index. * - * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length from the HTTP - * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt - * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative - * that the content provider wants the filename therein to be used as the title. + * 'lastModifed' is indexed to support query by date, 'contentLength' obtains + * content length from the HTTP header, 'type' field is indexed to support query + * by type and finally the 'title' field is an attempt to reset the title if a + * content-disposition hint exists. The logic is that such a presence is + * indicative that the content provider wants the filename therein to be used as + * the title. * * Still need to make content-length searchable! * @@ -41,7 +43,8 @@ import org.slf4j.LoggerFactory; */ public class MoreIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(MoreIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(MoreIndexingFilter.class); /** Get the MimeTypes resolver instance. */ private MimeUtil MIME; @@ -68,12 +71,13 @@ public class MoreIndexingFilter implemen // last-modified, or, if that's not present, use fetch time. private NutchDocument addTime(NutchDocument doc, WebPage page, String url) { long time = -1; - CharSequence lastModified = page - .getHeaders().get(new Utf8(HttpHeaders.LAST_MODIFIED)); + CharSequence lastModified = page.getHeaders().get( + new Utf8(HttpHeaders.LAST_MODIFIED)); // String lastModified = data.getMeta(Metadata.LAST_MODIFIED); if (lastModified != null) { // try parse last-modified time = getTime(lastModified.toString(), url); // use as time - String formlastModified = DateUtil.getThreadLocalDateFormat().format(new Date(time)); + String formlastModified = DateUtil.getThreadLocalDateFormat().format( + new Date(time)); // store as string doc.add("lastModified", formlastModified); } @@ -82,7 +86,8 @@ public class MoreIndexingFilter implemen time = page.getModifiedTime(); // use Modified time } - String dateString = DateUtil.getThreadLocalDateFormat().format(new Date(time)); + String dateString = DateUtil.getThreadLocalDateFormat().format( + new Date(time)); // un-stored, indexed and un-tokenized doc.add("date", dateString); @@ -97,17 +102,19 @@ public class MoreIndexingFilter implemen } catch (ParseException e) { // try to parse it as date in alternative format try { - Date parsedDate = DateUtils.parseDate(date, new String[] { - "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz", - "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, dd MMM yyyy HH:mm:ss zzz", - "EEE,dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:sszzz", - "EEE, dd MMM yyyy HH:mm:ss", "EEE, dd-MMM-yy HH:mm:ss zzz", - "yyyy/MM/dd HH:mm:ss.SSS zzz", "yyyy/MM/dd HH:mm:ss.SSS", - "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", "yyyy.MM.dd HH:mm:ss", - "yyyy-MM-dd HH:mm", "MMM dd yyyy HH:mm:ss. zzz", - "MMM dd yyyy HH:mm:ss zzz", "dd.MM.yyyy HH:mm:ss zzz", - "dd MM yyyy HH:mm:ss zzz", "dd.MM.yyyy; HH:mm:ss", - "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", "yyyy-MM-dd'T'HH:mm:ss'Z'" }); + Date parsedDate = DateUtils.parseDate(date, + new String[] { "EEE MMM dd HH:mm:ss yyyy", + "EEE MMM dd HH:mm:ss yyyy zzz", "EEE MMM dd HH:mm:ss zzz yyyy", + "EEE, dd MMM yyyy HH:mm:ss zzz", + "EEE,dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:sszzz", + "EEE, dd MMM yyyy HH:mm:ss", "EEE, dd-MMM-yy HH:mm:ss zzz", + "yyyy/MM/dd HH:mm:ss.SSS zzz", "yyyy/MM/dd HH:mm:ss.SSS", + "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", "yyyy.MM.dd HH:mm:ss", + "yyyy-MM-dd HH:mm", "MMM dd yyyy HH:mm:ss. zzz", + "MMM dd yyyy HH:mm:ss zzz", "dd.MM.yyyy HH:mm:ss zzz", + "dd MM yyyy HH:mm:ss zzz", "dd.MM.yyyy; HH:mm:ss", + "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", + "yyyy-MM-dd'T'HH:mm:ss'Z'" }); time = parsedDate.getTime(); // if (LOG.isWarnEnabled()) { // LOG.warn(url + ": parsed date: " + date +" to:"+time); @@ -123,8 +130,8 @@ public class MoreIndexingFilter implemen // Add Content-Length private NutchDocument addLength(NutchDocument doc, WebPage page, String url) { - CharSequence contentLength = page.getHeaders().get(new Utf8( - HttpHeaders.CONTENT_LENGTH)); + CharSequence contentLength = page.getHeaders().get( + new Utf8(HttpHeaders.CONTENT_LENGTH)); if (contentLength != null) { // NUTCH-1010 ContentLength not trimmed String trimmed = contentLength.toString().trim(); @@ -188,7 +195,7 @@ public class MoreIndexingFilter implemen if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) { String[] parts = getParts(mimeType); - for(String part: parts) { + for (String part : parts) { doc.add("type", part); } } @@ -233,8 +240,8 @@ public class MoreIndexingFilter implemen } private NutchDocument resetTitle(NutchDocument doc, WebPage page, String url) { - CharSequence contentDisposition = page.getHeaders().get(new Utf8( - HttpHeaders.CONTENT_DISPOSITION)); + CharSequence contentDisposition = page.getHeaders().get( + new Utf8(HttpHeaders.CONTENT_DISPOSITION)); if (contentDisposition == null) return doc; Modified: nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -37,7 +37,7 @@ public class TestMoreIndexingFilter { assertContentType(conf, "text/html", "text/html"); assertContentType(conf, "text/html; charset=UTF-8", "text/html"); } - + public void testGetParts() { String[] parts = MoreIndexingFilter.getParts("text/html"); assertParts(parts, 2, "text", "html"); @@ -48,34 +48,35 @@ public class TestMoreIndexingFilter { * @since NUTCH-901 */ @Test - public void testNoParts(){ - Configuration conf = NutchConfiguration.create(); - conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false); - MoreIndexingFilter filter = new MoreIndexingFilter(); - filter.setConf(conf); - assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - try{ - filter.filter(doc, "http://nutch.apache.org/index.html", WebPage.newBuilder().build()); - } - catch(Exception e){ - e.printStackTrace(); - fail(e.getMessage()); - } - assertNotNull(doc); - assertTrue(doc.getFieldNames().contains("type")); - assertEquals(1, doc.getFieldValues("type").size()); - assertEquals("text/html", doc.getFieldValue("type")); + public void testNoParts() { + Configuration conf = NutchConfiguration.create(); + conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false); + MoreIndexingFilter filter = new MoreIndexingFilter(); + filter.setConf(conf); + assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + try { + filter.filter(doc, "http://nutch.apache.org/index.html", WebPage + .newBuilder().build()); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + assertNotNull(doc); + assertTrue(doc.getFieldNames().contains("type")); + assertEquals(1, doc.getFieldValues("type").size()); + assertEquals("text/html", doc.getFieldValue("type")); } - + private void assertParts(String[] parts, int count, String... expected) { assertEquals(count, parts.length); for (int i = 0; i < expected.length; i++) { assertEquals(expected[i], parts[i]); } } - - private void assertContentType(Configuration conf, String source, String expected) throws IndexingException { + + private void assertContentType(Configuration conf, String source, + String expected) throws IndexingException { MoreIndexingFilter filter = new MoreIndexingFilter(); filter.setConf(conf); WebPage page = WebPage.newBuilder().build(); Modified: nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java (original) +++ nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java Fri Jan 9 06:34:33 2015 @@ -19,3 +19,4 @@ * Index writer plugin for <a href="http://www.elasticsearch.org/">Elasticsearch</a>. */ package org.apache.nutch.indexwriter.elastic; + Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java (original) +++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java Fri Jan 9 06:34:33 2015 @@ -22,7 +22,7 @@ public interface SolrConstants { public static final String SERVER_URL = SOLR_PREFIX + "server.url"; public static final String COMMIT_SIZE = SOLR_PREFIX + "commit.size"; - + public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index"; public static final String MAPPING_FILE = SOLR_PREFIX + "mapping.file"; @@ -32,15 +32,15 @@ public interface SolrConstants { public static final String USERNAME = SOLR_PREFIX + "auth.username"; public static final String PASSWORD = SOLR_PREFIX + "auth.password"; - + public static final String ID_FIELD = "id"; - + public static final String URL_FIELD = "url"; - + public static final String BOOST_FIELD = "boost"; - + public static final String TIMESTAMP_FIELD = "tstamp"; - + public static final String DIGEST_FIELD = "digest"; } Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java (original) +++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java Fri Jan 9 06:34:33 2015 @@ -144,7 +144,9 @@ public class SolrIndexWriter implements public void commit() throws IOException { try { solr.commit(); - LOG.info("Total " + documentCount + (documentCount > 1 ? " documents are " : " document is ") + "added."); + LOG.info("Total " + documentCount + + (documentCount > 1 ? " documents are " : " document is ") + + "added."); } catch (SolrServerException e) { throw makeIOException(e); } Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java (original) +++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java Fri Jan 9 06:34:33 2015 @@ -38,16 +38,17 @@ import org.xml.sax.SAXException; public class SolrMappingReader { public static Logger LOG = LoggerFactory.getLogger(SolrMappingReader.class); - + private Configuration conf; - + private Map<String, String> keyMap = new HashMap<String, String>(); private Map<String, String> copyMap = new HashMap<String, String>(); private String uniqueKey = "id"; - + public static synchronized SolrMappingReader getInstance(Configuration conf) { ObjectCache cache = ObjectCache.get(conf); - SolrMappingReader instance = (SolrMappingReader)cache.getObject(SolrMappingReader.class.getName()); + SolrMappingReader instance = (SolrMappingReader) cache + .getObject(SolrMappingReader.class.getName()); if (instance == null) { instance = new SolrMappingReader(conf); cache.setObject(SolrMappingReader.class.getName(), instance); @@ -60,9 +61,10 @@ public class SolrMappingReader { parseMapping(); } - private void parseMapping() { + private void parseMapping() { InputStream ssInputStream = null; - ssInputStream = conf.getConfResourceAsInputStream(conf.get(SolrConstants.MAPPING_FILE, "solrindex-mapping.xml")); + ssInputStream = conf.getConfResourceAsInputStream(conf.get( + SolrConstants.MAPPING_FILE, "solrindex-mapping.xml")); InputSource inputSource = new InputSource(ssInputStream); try { @@ -74,48 +76,50 @@ public class SolrMappingReader { if (fieldList.getLength() > 0) { for (int i = 0; i < fieldList.getLength(); i++) { Element element = (Element) fieldList.item(i); - LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest")); - keyMap.put(element.getAttribute("source"), element.getAttribute("dest")); + LOG.info("source: " + element.getAttribute("source") + " dest: " + + element.getAttribute("dest")); + keyMap.put(element.getAttribute("source"), + element.getAttribute("dest")); } } NodeList copyFieldList = rootElement.getElementsByTagName("copyField"); if (copyFieldList.getLength() > 0) { for (int i = 0; i < copyFieldList.getLength(); i++) { Element element = (Element) copyFieldList.item(i); - LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest")); - copyMap.put(element.getAttribute("source"), element.getAttribute("dest")); + LOG.info("source: " + element.getAttribute("source") + " dest: " + + element.getAttribute("dest")); + copyMap.put(element.getAttribute("source"), + element.getAttribute("dest")); } } NodeList uniqueKeyItem = rootElement.getElementsByTagName("uniqueKey"); if (uniqueKeyItem.getLength() > 1) { LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'"); uniqueKey = "id"; - } - else if (uniqueKeyItem.getLength() == 0) { + } else if (uniqueKeyItem.getLength() == 0) { LOG.warn("No unique key definition found in solr index mapping using, default 'id'"); - } - else{ - uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue(); + } else { + uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue(); } } catch (MalformedURLException e) { - LOG.warn(e.toString()); + LOG.warn(e.toString()); } catch (SAXException e) { - LOG.warn(e.toString()); + LOG.warn(e.toString()); } catch (IOException e) { - LOG.warn(e.toString()); + LOG.warn(e.toString()); } catch (ParserConfigurationException e) { - LOG.warn(e.toString()); - } + LOG.warn(e.toString()); + } } - + public Map<String, String> getKeyMap() { return keyMap; } - + public Map<String, String> getCopyMap() { return copyMap; } - + public String getUniqueKey() { return uniqueKey; } @@ -128,14 +132,14 @@ public class SolrMappingReader { } public String mapKey(String key) throws IOException { - if(keyMap.containsKey(key)) { + if (keyMap.containsKey(key)) { key = keyMap.get(key); } return key; } public String mapCopyKey(String key) throws IOException { - if(copyMap.containsKey(key)) { + if (copyMap.containsKey(key)) { key = copyMap.get(key); } return key; Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java (original) +++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Fri Jan 9 06:34:33 2015 @@ -1,6 +1,5 @@ package org.apache.nutch.indexwriter.solr; - import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; @@ -17,7 +16,8 @@ public class SolrUtils { public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class); - public static HttpSolrServer getHttpSolrServer(Configuration job) throws MalformedURLException { + public static HttpSolrServer getHttpSolrServer(Configuration job) + throws MalformedURLException { DefaultHttpClient client = new DefaultHttpClient(); // Check for username/password @@ -26,10 +26,14 @@ public class SolrUtils { LOG.info("Authenticating as: " + username); - AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME); + AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, + AuthScope.ANY_REALM, AuthScope.ANY_SCHEME); + + client.getCredentialsProvider().setCredentials( + scope, + new UsernamePasswordCredentials(username, job + .get(SolrConstants.PASSWORD))); - client.getCredentialsProvider().setCredentials(scope, new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD))); - HttpParams params = client.getParams(); HttpClientParams.setAuthenticating(params, true); @@ -46,12 +50,14 @@ public class SolrUtils { for (int i = 0; i < input.length(); i++) { ch = input.charAt(i); - // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:] - // and non-printable control characters except tabulator, new line and carriage return + // Strip all non-characters + // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:] + // and non-printable control characters except tabulator, new line and + // carriage return if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000 - ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range - (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef - (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) { + ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range + (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef + (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) { retval.append(ch); } Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java (original) +++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java Fri Jan 9 06:34:33 2015 @@ -19,3 +19,4 @@ * Index writer plugin for <a href="http://lucene.apache.org/solr/">Apache Solr</a>. */ package org.apache.nutch.indexwriter.solr; + Modified: nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original) +++ nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Fri Jan 9 06:34:33 2015 @@ -47,7 +47,8 @@ import java.util.*; */ public class HTMLLanguageParser implements ParseFilter { - public static final Logger LOG = LoggerFactory.getLogger(HTMLLanguageParser.class); + public static final Logger LOG = LoggerFactory + .getLogger(HTMLLanguageParser.class); private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); @@ -113,8 +114,8 @@ public class HTMLLanguageParser implemen } if (lang != null) { - page.getMetadata().put(new Utf8(Metadata.LANGUAGE), ByteBuffer.wrap(lang - .getBytes())); + page.getMetadata().put(new Utf8(Metadata.LANGUAGE), + ByteBuffer.wrap(lang.getBytes())); return parse; } @@ -135,7 +136,8 @@ public class HTMLLanguageParser implemen return lang; } - CharSequence ulang = page.getHeaders().get(new Utf8(Response.CONTENT_LANGUAGE)); + CharSequence ulang = page.getHeaders().get( + new Utf8(Response.CONTENT_LANGUAGE)); if (ulang != null) { lang = ulang.toString(); } @@ -154,7 +156,7 @@ public class HTMLLanguageParser implemen String content = parse.getText(); if (content != null) { - text.append(" ").append(content.toString()); + text.append(" ").append(content.toString()); } LanguageIdentifier identifier = new LanguageIdentifier(text.toString()); Modified: nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -35,11 +35,10 @@ import java.util.HashSet; /** * An {@link org.apache.nutch.indexer.IndexingFilter} that adds a * <code>lang</code> (language) field to the document. - * - * It tries to find the language of the document by checking - * if {@link HTMLLanguageParser} has added some language - * information - * + * + * It tries to find the language of the document by checking if + * {@link HTMLLanguageParser} has added some language information + * * @author Sami Siren * @author Jerome Charron */ @@ -56,7 +55,8 @@ public class LanguageIndexingFilter impl /** * Constructs a new Language Indexing Filter. */ - public LanguageIndexingFilter() {} + public LanguageIndexingFilter() { + } public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException { Modified: nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original) +++ nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Fri Jan 9 06:34:33 2015 @@ -96,8 +96,8 @@ public class TestHTMLLanguageParser { { "torp, stuga, uthyres, bed & breakfast", null } }; for (int i = 0; i < 44; i++) { - assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser - .parseLanguage(tests[i][0])); + assertEquals(tests[i][1], + HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0])); } } @@ -151,8 +151,8 @@ public class TestHTMLLanguageParser { page.setBaseUrl(BASE); page.setContent(ByteBuffer.wrap(text.getBytes())); page.setContentType(new Utf8("text/html")); - page - .getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/html")); + page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8, + new Utf8("text/html")); return page; } } Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (original) +++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java Fri Jan 9 06:34:33 2015 @@ -19,7 +19,7 @@ package org.apache.nutch.protocol.http.a @SuppressWarnings("serial") public class BlockedException extends HttpException { - + public BlockedException(String msg) { super(msg); }