This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 7f51c2530795d7203aa1d0834be8e1c2c1373531 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Wed Apr 29 13:03:01 2020 +0200 NUTCH-2743 Add list of Nutch properties (nutch-default.xml) to documentation - modify ant build.xml to copy nutch-default.xml into docs/api/resources/ - adapt XSLT table layout - remove obsolete nutch-conf.xsl - fix typos and normalize spelling in nutch-default.xml --- build.xml | 7 ++-- conf/configuration.xsl | 41 +++++++++++++++++------ conf/nutch-conf.xsl | 24 -------------- conf/nutch-default.xml | 46 +++++++++++++------------- src/plugin/creativecommons/conf/nutch-site.xml | 6 ++-- 5 files changed, 59 insertions(+), 65 deletions(-) diff --git a/build.xml b/build.xml index 0a1bca0..5eb157e 100644 --- a/build.xml +++ b/build.xml @@ -786,11 +786,10 @@ <!-- Copy the plugin.dtd file to the plugin doc-files dir --> <copy file="${plugins.dir}/plugin.dtd" todir="${build.javadoc}/org/apache/nutch/plugin/doc-files"/> - </target> - <target name="default-doc" description="--> generate default Nutch documentation"> - <style basedir="${conf.dir}" destdir="${docs.dir}" - includes="nutch-default.xml" style="conf/nutch-conf.xsl"/> + <!-- Copy the definition of Nutch properties --> + <copy file="${conf.dir}/nutch-default.xml" todir="${build.javadoc}/resources/"/> + <copy file="${conf.dir}/configuration.xsl" todir="${build.javadoc}/resources/"/> </target> <!-- ================================================================== --> diff --git a/conf/configuration.xsl b/conf/configuration.xsl index 79141dc..1399673 100644 --- a/conf/configuration.xsl +++ b/conf/configuration.xsl @@ -19,20 +19,39 @@ <xsl:output method="html"/> <xsl:template match="configuration"> <html> + <head> + <title>Nutch Configuration Properties</title> + <meta charset="utf-8"/> + <style> + table { width: 100%; table-layout: fixed; } + th,td { padding: 0.2em 0.5em; } + td { overflow:hidden; vertical-align:top; } + th { background-color: #e0e0e0; } + tr { background-color: 
#f0f0f0; } + tr:nth-child(odd) { background-color: #fcfcfc; } + th.name { width: 20% } + th.value { width: 30% } + th.description { width: 50% } + </style> + </head> <body> -<table border="1"> -<tr> - <td>name</td> - <td>value</td> - <td>description</td> -</tr> +<table> + <thead> + <tr> + <th class="name">Nutch Property Name</th> + <th class="value">Default Value</th> + <th class="description">Description</th> + </tr> + </thead> + <tbody> <xsl:for-each select="property"> -<tr> - <td><a name="{name}"><xsl:value-of select="name"/></a></td> - <td><xsl:value-of select="value"/></td> - <td><xsl:value-of select="description"/></td> -</tr> + <tr> + <td><a name="{name}"><xsl:value-of select="name"/></a></td> + <td><xsl:value-of select="value"/></td> + <td><xsl:value-of select="description"/></td> + </tr> </xsl:for-each> + </tbody> </table> </body> </html> diff --git a/conf/nutch-conf.xsl b/conf/nutch-conf.xsl deleted file mode 100644 index 36a2275..0000000 --- a/conf/nutch-conf.xsl +++ /dev/null @@ -1,24 +0,0 @@ -<?xml version="1.0"?> -<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> -<xsl:output method="html"/> -<xsl:template match="nutch-conf"> -<html> -<body> -<table border="1"> -<tr> - <td>name</td> - <td>value</td> - <td>description</td> -</tr> -<xsl:for-each select="property"> -<tr> - <td><xsl:value-of select="name"/></td> - <td><xsl:value-of select="value"/></td> - <td><xsl:value-of select="description"/></td> -</tr> -</xsl:for-each> -</table> -</body> -</html> -</xsl:template> -</xsl:stylesheet> diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index f0afd1c..b833288 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -40,7 +40,7 @@ <name>file.content.limit</name> <value>1048576</value> <description>The length limit for downloaded content using the file:// - protocol, in bytes. If this value is nonnegative (>=0), content longer + protocol, in bytes. 
If this value is non-negative (>=0), content longer than it will be truncated; otherwise, no truncation at all. Do not confuse this setting with the http.content.limit setting. </description> @@ -50,7 +50,7 @@ <name>file.crawl.parent</name> <value>true</value> <description>The crawler is not restricted to the directories that you specified in the - Urls file but it is jumping into the parent directories as well. For your own crawlings you can + URLs file but it is jumping into the parent directories as well. For your own crawlings you can change this behavior (set to false) the way that only directories beneath the directories that you specify get crawled.</description> </property> @@ -75,7 +75,7 @@ And it is probably what we want to set most of time, since file:// URLs are meant to be local and we can always use them directly at parsing and indexing stages. Otherwise file contents will be saved. - !! NO IMPLEMENTED YET !! + !! NOT IMPLEMENTED YET !! </description> </property> @@ -216,7 +216,7 @@ <name>http.content.limit</name> <value>1048576</value> <description>The length limit for downloaded content using the http/https - protocols, in bytes. If this value is nonnegative (>=0), content longer + protocols, in bytes. If this value is non-negative (>=0), content longer than it will be truncated; otherwise, no truncation at all. Do not confuse this setting with the file.content.limit setting. </description> @@ -226,7 +226,7 @@ <name>http.time.limit</name> <value>-1</value> <description>The time limit in seconds to fetch a single document. - If this value is nonnegative (>=0), the HTTP protocol implementation + If this value is non-negative (>=0), the HTTP protocol implementation will stop reading from a socket after http.time.limit seconds have been spent for fetching this document. The HTTP response is then marked as truncated. 
The http.time.limit should be set to a longer @@ -394,7 +394,7 @@ <value>true</value> <description>Whether Nutch sends an HTTP If-Modified-Since header. It reduces bandwidth when enabled by not downloading pages that respond with an HTTP - Not-Modified header. URL's that are not downloaded are not passed through + Not-Modified header. URLs that are not downloaded are not passed through parse or indexing filters. If you regularly modify filters, you should force Nutch to also download unmodified pages by disabling this feature. </description> @@ -426,7 +426,7 @@ <name>ftp.content.limit</name> <value>1048576</value> <description>The length limit for downloaded content, in bytes. - If this value is nonnegative (>=0), content longer than it will be truncated; + If this value is non-negative (>=0), content longer than it will be truncated; otherwise, no truncation at all. Caution: classical ftp RFCs never defines partial transfer and, in fact, some ftp servers out there do not handle client side forced close-down very @@ -460,7 +460,7 @@ <value>false</value> <description>Whether to keep ftp connection. Useful if crawling same host again and again. When set to true, it avoids connection, login and dir list - parser setup for subsequent urls. If it is set to true, however, you must + parser setup for subsequent URLs. If it is set to true, however, you must make sure (roughly): (1) ftp.timeout is less than ftp.server.timeout (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay) @@ -584,7 +584,7 @@ <property> <name>db.update.purge.orphans</name> <value>false</value> - <description>If true, updatedb will permanently delete URL's marked + <description>If true, updatedb will permanently delete URLs marked as orphan from the CrawlDb. The plugin scoring-orphan needs to be activated to get records marked as orphan. See the plugin's options elsewhere in this document. 
@@ -596,7 +596,7 @@ <value>false</value> <description> !Temporary, can be overwritten with the command line! - Normalize urls when updating crawldb + Normalize URLs when updating crawldb </description> </property> @@ -605,7 +605,7 @@ <value>false</value> <description> !Temporary, can be overwritten with the command line! - Filter urls when updating crawldb + Filter URLs when updating crawldb </description> </property> @@ -749,7 +749,7 @@ <property> <name>db.fetch.retry.max</name> <value>3</value> - <description>The maximum number of times a url that has encountered + <description>The maximum number of times a URL that has encountered recoverable errors is generated for fetch.</description> </property> @@ -793,7 +793,7 @@ <property> <name>linkdb.max.inlinks</name> <value>10000</value> - <description>Maximum number of Inlinks per URL to be kept in LinkDb. + <description>Maximum number of inlinks per URL to be kept in LinkDb. If "invertlinks" finds more inlinks than this number, only the first N inlinks will be stored, and the rest will be discarded. </description> @@ -831,8 +831,8 @@ <property> <name>generate.max.count</name> <value>-1</value> - <description>The maximum number of urls in a single - fetchlist. -1 if unlimited. The urls are counted according + <description>The maximum number of URLs in a single + fetchlist. -1 if unlimited. The URLs are counted according to the value of the parameter generate.count.mode. </description> </property> @@ -1014,7 +1014,7 @@ <description>Comma-separated list of exceptions not shown with full stack trace in logs of fetcher and HTTP protocol implementations. The logs may shrink in size significantly, e.g., when for a large - unrestriced web crawl unknown hosts are logged shortly without full + unrestricted web crawl unknown hosts are logged shortly without full stack trace. 
The full class name of the exception class (extending Throwable) including the package path must be specified.</description> </property> @@ -1116,7 +1116,7 @@ and follow until the desired depth is reached. A value of 1 means all generated pages are fetched and their first degree outlinks are fetched and parsed too. Be careful, this feature is in itself agnostic of the state of the CrawlDB and does not know about already fetched pages. A setting larger than 2 will most likely fetch home pages twice in the same fetch cycle. - It is highly recommended to set db.ignore.external.links to true to restrict the outlink follower to URL's within the same + It is highly recommended to set db.ignore.external.links to true to restrict the outlink follower to URLs within the same domain. When disabled (false) the feature is likely to follow duplicates even when depth=1. A value of -1 of 0 disables this feature. </description> @@ -1334,8 +1334,8 @@ <property> <name>indexer.score.power</name> <value>0.5</value> - <description>Determines the power of link analyis scores. Each - pages's boost is set to <i>score<sup>scorePower</sup></i> where + <description>Determines the power of link analysis scores. The boost + of each page is set to <i>score<sup>scorePower</sup></i> where <i>score</i> is its link analysis score and <i>scorePower</i> is the value of this parameter. This is compiled into indexes, so, when this is changed, pages must be re-indexed for it to take @@ -1470,7 +1470,7 @@ <property> <name>plugin.folders</name> <value>plugins</value> - <description>Directories where nutch plugins are located. Each + <description>Directories where Nutch plugins are located. Each element may be a relative or absolute path. If absolute, it is used as is. 
If relative, it is searched for on the classpath.</description> </property> @@ -1772,7 +1772,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this <property> <name>urlfilter.order</name> <value></value> - <description>The order by which url filters are applied. + <description>The order by which URL filters are applied. If empty, all available url filters (as dictated by properties plugin-includes and plugin-excludes above) are loaded and applied in system defined order. If not empty, only named filters are loaded and applied @@ -2159,7 +2159,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> <property> <name>link.score.updater.clear.score</name> <value>0.0f</value> - <description>The default score for URL's that are not in the web graph.</description> + <description>The default score for URLs that are not in the web graph.</description> </property> <property> @@ -2580,7 +2580,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> <value></value> <description> Choose the type of Queue being used (ex - RabbitMQ, ActiveMq, Kafka, etc). - Currently there exists an implemtation for RabbitMQ producer. + Currently there exists an implementation for RabbitMQ producer. </description> </property> diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml index e639746..e28e12a 100644 --- a/src/plugin/creativecommons/conf/nutch-site.xml +++ b/src/plugin/creativecommons/conf/nutch-site.xml @@ -1,9 +1,9 @@ <?xml version="1.0"?> -<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <!-- Creative Commons' Nutch configuration --> -<nutch-conf> +<configuration> <property> <name>http.agent.name</name> @@ -40,4 +40,4 @@ </description> </property> -</nutch-conf> +</configuration>