[ https://issues.apache.org/jira/browse/NUTCH-2526?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16453844#comment-16453844 ]
ASF GitHub Bot commented on NUTCH-2526: --------------------------------------- sebastian-nagel closed pull request #324: NUTCH-2526 NPE in scoring-opic when indexing document without CrawlDb datum URL: https://github.com/apache/nutch/pull/324 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/src/java/org/apache/nutch/scoring/ScoringFilter.java b/src/java/org/apache/nutch/scoring/ScoringFilter.java index c1acc482f..2941980f2 100644 --- a/src/java/org/apache/nutch/scoring/ScoringFilter.java +++ b/src/java/org/apache/nutch/scoring/ScoringFilter.java @@ -193,17 +193,22 @@ public default void orphanedScore(Text url, CrawlDatum datum) } /** - * This method calculates a Lucene document boost. + * This method calculates a indexed document score/boost. * * @param url * url of the page * @param doc - * Lucene document. NOTE: this already contains all information + * indexed document. NOTE: this already contains all information * collected by indexing filters. Implementations may modify this * instance, in order to store/remove some information. * @param dbDatum - * current page from CrawlDb. NOTE: changes made to this instance are - * not persisted. + * current page from CrawlDb. NOTE: + * <ul> + * <li>changes made to this instance are not persisted</li> + * <li>may be null if indexing is done without CrawlDb or if the + * segment is generated not from the CrawlDb (via + * FreeGenerator).</li> + * </ul> * @param fetchDatum * datum from FetcherOutput (containing among others the fetching * status) @@ -214,10 +219,10 @@ public default void orphanedScore(Text url, CrawlDatum datum) * current inlinks from LinkDb. NOTE: changes made to this instance * are not persisted. * @param initScore - * initial boost value for the Lucene document. 
- * @return boost value for the Lucene document. This value is passed as an + * initial boost value for the indexed document. + * @return boost value for the indexed document. This value is passed as an * argument to the next scoring filter in chain. NOTE: implementations - * may also express other scoring strategies by modifying Lucene + * may also express other scoring strategies by modifying the indexed * document directly. * @throws ScoringFilterException */ diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java index a143f46a9..c98ccce44 100644 --- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java +++ b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java @@ -36,6 +36,7 @@ private Configuration conf; private float normalizedScore = 1.00f; + private float initialScore = 0.0f; public LinkAnalysisScoringFilter() { @@ -64,12 +65,15 @@ public float generatorSortValue(Text url, CrawlDatum datum, float initSort) public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { + if (dbDatum == null) { + return initScore; + } return (normalizedScore * dbDatum.getScore()); } public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException { - datum.setScore(0.0f); + datum.setScore(initialScore); } public void injectedScore(Text url, CrawlDatum datum) diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java index 530f267f1..5a080bed2 100644 --- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java +++ 
b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java @@ -167,6 +167,9 @@ public CrawlDatum distributeScoreToOutlinks(Text fromUrl, public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { + if (dbDatum == null) { + return initScore; + } return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore; } } ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > NPE in scoring-opic when indexing document without CrawlDb datum > ---------------------------------------------------------------- > > Key: NUTCH-2526 > URL: https://issues.apache.org/jira/browse/NUTCH-2526 > Project: Nutch > Issue Type: Improvement > Components: parser, scoring > Affects Versions: 1.14 > Reporter: Yash Thenuan > Assignee: Sebastian Nagel > Priority: Major > Fix For: 1.15 > > > I was trying to write a parse filter plugin whose work was to parse internal > links as separate documents. What I did basically is, breaking the page into > multiple parseResults, each parseResult having ParseText and ParseData > corresponding to the InternalLinks. I was successfully able to parse them > separately. But at the time of scoring, some error occurred. > I am attaching the logs for indexing. 
> > 2018-03-07 15:41:52,327 INFO indexer.IndexerMapReduce - IndexerMapReduce: > crawldb: crawl/crawldb > 2018-03-07 15:41:52,327 INFO indexer.IndexerMapReduce - IndexerMapReduce: > linkdb: crawl/linkdb > 2018-03-07 15:41:52,327 INFO indexer.IndexerMapReduce - IndexerMapReduces: > adding segment: crawl/segments/20180307130959 > 2018-03-07 15:41:53,677 INFO anchor.AnchorIndexingFilter - Anchor > deduplication is: off > 2018-03-07 15:41:54,861 INFO indexer.IndexWriters - Adding > org.apache.nutch.indexwriter.elasticrest.ElasticRestIndexWriter > 2018-03-07 15:41:55,168 INFO client.AbstractJestClient - Setting server pool > to a list of 1 servers: [http://localhost:9200] > 2018-03-07 15:41:55,170 INFO client.JestClientFactory - Using multi > thread/connection supporting pooling connection manager > 2018-03-07 15:41:55,238 INFO client.JestClientFactory - Using default GSON > instance > 2018-03-07 15:41:55,238 INFO client.JestClientFactory - Node Discovery > disabled... > 2018-03-07 15:41:55,238 INFO client.JestClientFactory - Idle connection > reaping disabled... 
> 2018-03-07 15:41:55,282 INFO elasticrest.ElasticRestIndexWriter - Processing > remaining requests [docs = 1, length = 210402, total docs = 1] > 2018-03-07 15:41:55,361 INFO elasticrest.ElasticRestIndexWriter - Processing > to finalize last execute > 2018-03-07 15:41:55,458 INFO elasticrest.ElasticRestIndexWriter - Previous > took in ms 175, including wait 97 > 2018-03-07 15:41:55,468 WARN mapred.LocalJobRunner - job_local1561152089_0001 > java.lang.Exception: java.lang.NullPointerException > at > org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:462) > at > org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:529) > Caused by: java.lang.NullPointerException > at > org.apache.nutch.scoring.opic.OPICScoringFilter.indexerScore(OPICScoringFilter.java:171) > at > org.apache.nutch.scoring.ScoringFilters.indexerScore(ScoringFilters.java:120) > at > org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:296) > at > org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:57) > at > org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:444) > at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:392) > at > org.apache.hadoop.mapred.LocalJobRunner$Job$ReduceTaskRunnable.run(LocalJobRunner.java:319) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > 2018-03-07 15:41:55,510 ERROR indexer.IndexingJob - Indexer: > java.io.IOException: Job failed! 
> at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:873) > at org.apache.nutch.indexer.IndexingJob.index(IndexingJob.java:147) > at org.apache.nutch.indexer.IndexingJob.run(IndexingJob.java:230) > at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70) > at org.apache.nutch.indexer.IndexingJob.main(IndexingJob.java:239) -- This message was sent by Atlassian JIRA (v7.6.3#76005)