[jira] [Commented] (NUTCH-1693) TextMD5Signature compute on textual content

Markus Jelsma (JIRA) Wed, 08 Jan 2014 03:01:32 -0800

    [ 
https://issues.apache.org/jira/browse/NUTCH-1693?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13865306#comment-13865306
 ]


Markus Jelsma commented on NUTCH-1693:
--------------------------------------

By the way, there are several places in Nutch that still use getBytes(). Would 
you suggest we do something about that too?

{code}
markus@midas:~/projects/apache/nutch/trunk$ grep -nr getBytes src/ | grep -v svn
src/test/org/apache/nutch/protocol/TestContent.java:48:    Content r = new 
Content(url, url, page.getBytes("UTF8"), "text/html",
src/test/org/apache/nutch/protocol/TestContent.java:64:                    
"".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:70:                    
"".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:76:                    
"".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:82:                    
"<html></html>".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:88:                    
"<html></html>".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:94:                    
"<html></html>".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:100:                    
"".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:106:                    
"".getBytes("UTF8"),
src/test/org/apache/nutch/util/TestGZIPUtils.java:121:    byte[] testBytes= 
SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:123:    testBytes= 
LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:125:    testBytes= 
WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:130:    byte[] testBytes= 
SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:132:    testBytes= 
LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:134:    testBytes= 
WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:139:    byte[] testBytes= 
SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:141:    testBytes= 
LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:143:    testBytes= 
WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:148:    byte[] testBytes= 
SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:150:    testBytes= 
LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:152:    testBytes= 
WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestEncodingDetector.java:35:      
contentInOctets = "çñôöøДЛжҶ".getBytes("utf-8");
src/test/org/apache/nutch/util/TestNodeWalker.java:65:      parser.parse(new 
InputSource(new ByteArrayInputStream(WEBPAGE.getBytes())));
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:58:      byte[] bytes = 
testA.getBytes("UTF-8");
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:62:      
os.write(p.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:80:        
os.write(link.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:86:        
os.write(link.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:92:      
os.write(link.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:93:      
os.write(testB.getBytes());
src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java:318:        Content 
content = new Content(urlStr, urlStr, bytes.getBytes(), contentType,
src/java/org/apache/nutch/tools/Benchmark.java:65:      
os.write(url.getBytes());
src/java/org/apache/nutch/crawl/MD5Signature.java:35:    if (data == null) data 
= content.getUrl().getBytes();
src/java/org/apache/nutch/crawl/Generator.java:375:      int hash1 = 
hash(url1.getBytes(), 0, url1.getLength());
src/java/org/apache/nutch/crawl/Generator.java:376:      int hash2 = 
hash(url2.getBytes(), 0, url2.getLength());
src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java:512:
    return new String(x).getBytes();
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java:151:
      byte[] bytes= tests[i].getBytes();
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java:242:
              new ByteArrayInputStream(testPages[i].getBytes()) ),
src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java:308:
    return new String(x).getBytes();
src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java:92:
        byte[] credBytes = (username + ":" + password).getBytes();
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:84:
    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, SINGLE_AGENT);
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:93:
    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, MULTIPLE_AGENTS);
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:109:
    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, SINGLE_AGENT);
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:113:
    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, UNKNOWN_AGENT);
src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java:152:
      byte[] bytes= tests[i].getBytes();
src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java:187:
            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java:128:
    return new Content(URL, BASE, text.getBytes(), "text/html", meta, 
NutchConfiguration.create());
src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java:82: 
   fos.write(expectedText.getBytes());
src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java:267:       
   new Content(link, link, text.getBytes(), contentType, contentMeta,
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java:154:
      byte[] reqBytes= reqStr.toString().getBytes();
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java:403:
        in.unread(line.substring(pos).getBytes("UTF-8"));
src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java:67:
    InputStream is=new ByteArrayInputStream(xml.toString().getBytes());

{code}

> TextMD5Signature compute on textual content
> -------------------------------------------
>
>                 Key: NUTCH-1693
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1693
>             Project: Nutch
>          Issue Type: New Feature
>            Reporter: Tien Nguyen Manh
>            Assignee: Markus Jelsma
>            Priority: Minor
>             Fix For: 2.3, 1.8
>
>         Attachments: NUTCH-1693-trunk.patch, NUTCH-1693-trunk.patch, 
> NUTCH-1693.patch
>
>
> I created a new MD5Signature that is based on textual content. In our case we use 
> boilerpipe to extract the main text from content, so this signature is more 
> effective for deduplication.



--
This message was sent by Atlassian JIRA
(v6.1.5#6160)

[jira] [Commented] (NUTCH-1693) TextMD5Signature compute on textual content

Reply via email to