[ https://issues.apache.org/jira/browse/NUTCH-1693?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13865306#comment-13865306 ]
Markus Jelsma commented on NUTCH-1693: -------------------------------------- By the way, there are several places in Nutch that still use getBytes(), would you suggest we do something about that too? {code} markus@midas:~/projects/apache/nutch/trunk$ grep -nr getBytes src/ | grep -v svn src/test/org/apache/nutch/protocol/TestContent.java:48: Content r = new Content(url, url, page.getBytes("UTF8"), "text/html", src/test/org/apache/nutch/protocol/TestContent.java:64: "".getBytes("UTF8"), src/test/org/apache/nutch/protocol/TestContent.java:70: "".getBytes("UTF8"), src/test/org/apache/nutch/protocol/TestContent.java:76: "".getBytes("UTF8"), src/test/org/apache/nutch/protocol/TestContent.java:82: "<html></html>".getBytes("UTF8"), src/test/org/apache/nutch/protocol/TestContent.java:88: "<html></html>".getBytes("UTF8"), src/test/org/apache/nutch/protocol/TestContent.java:94: "<html></html>".getBytes("UTF8"), src/test/org/apache/nutch/protocol/TestContent.java:100: "".getBytes("UTF8"), src/test/org/apache/nutch/protocol/TestContent.java:106: "".getBytes("UTF8"), src/test/org/apache/nutch/util/TestGZIPUtils.java:121: byte[] testBytes= SHORT_TEST_STRING.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:123: testBytes= LONGER_TEST_STRING.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:125: testBytes= WEBPAGE.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:130: byte[] testBytes= SHORT_TEST_STRING.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:132: testBytes= LONGER_TEST_STRING.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:134: testBytes= WEBPAGE.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:139: byte[] testBytes= SHORT_TEST_STRING.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:141: testBytes= LONGER_TEST_STRING.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:143: testBytes= WEBPAGE.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:148: byte[] 
testBytes= SHORT_TEST_STRING.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:150: testBytes= LONGER_TEST_STRING.getBytes(); src/test/org/apache/nutch/util/TestGZIPUtils.java:152: testBytes= WEBPAGE.getBytes(); src/test/org/apache/nutch/util/TestEncodingDetector.java:35: contentInOctets = "çñôöøДЛжҶ".getBytes("utf-8"); src/test/org/apache/nutch/util/TestNodeWalker.java:65: parser.parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes()))); src/java/org/apache/nutch/tools/proxy/FakeHandler.java:58: byte[] bytes = testA.getBytes("UTF-8"); src/java/org/apache/nutch/tools/proxy/FakeHandler.java:62: os.write(p.getBytes()); src/java/org/apache/nutch/tools/proxy/FakeHandler.java:80: os.write(link.getBytes()); src/java/org/apache/nutch/tools/proxy/FakeHandler.java:86: os.write(link.getBytes()); src/java/org/apache/nutch/tools/proxy/FakeHandler.java:92: os.write(link.getBytes()); src/java/org/apache/nutch/tools/proxy/FakeHandler.java:93: os.write(testB.getBytes()); src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java:318: Content content = new Content(urlStr, urlStr, bytes.getBytes(), contentType, src/java/org/apache/nutch/tools/Benchmark.java:65: os.write(url.getBytes()); src/java/org/apache/nutch/crawl/MD5Signature.java:35: if (data == null) data = content.getUrl().getBytes(); src/java/org/apache/nutch/crawl/Generator.java:375: int hash1 = hash(url1.getBytes(), 0, url1.getLength()); src/java/org/apache/nutch/crawl/Generator.java:376: int hash2 = hash(url2.getBytes(), 0, url2.getLength()); src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java:512: return new String(x).getBytes(); src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java:151: byte[] bytes= tests[i].getBytes(); src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java:242: new ByteArrayInputStream(testPages[i].getBytes()) ), 
src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java:308: return new String(x).getBytes(); src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java:92: byte[] credBytes = (username + ":" + password).getBytes(); src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:84: rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT); src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:93: rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, MULTIPLE_AGENTS); src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:109: rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT); src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:113: rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT); src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java:152: byte[] bytes= tests[i].getBytes(); src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java:187: new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java:128: return new Content(URL, BASE, text.getBytes(), "text/html", meta, NutchConfiguration.create()); src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java:82: fos.write(expectedText.getBytes()); src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java:267: new Content(link, link, text.getBytes(), contentType, contentMeta, src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java:154: byte[] reqBytes= reqStr.toString().getBytes(); 
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java:403: in.unread(line.substring(pos).getBytes("UTF-8")); src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java:67: InputStream is=new ByteArrayInputStream(xml.toString().getBytes()); {code} > TextMD5Signature computed on textual content > -------------------------------------------- > > Key: NUTCH-1693 > URL: https://issues.apache.org/jira/browse/NUTCH-1693 > Project: Nutch > Issue Type: New Feature > Reporter: Tien Nguyen Manh > Assignee: Markus Jelsma > Priority: Minor > Fix For: 2.3, 1.8 > > Attachments: NUTCH-1693-trunk.patch, NUTCH-1693-trunk.patch, > NUTCH-1693.patch > > > I created a new MD5Signature that is based on textual content. In our case we use > boilerpipe to extract the main text from content, so this signature is more > effective for deduplication. -- This message was sent by Atlassian JIRA (v6.1.5#6160)