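[nutch] 01/35: NUTCH-2743 Add list of Nutch properties (nutch-default.xml) to documentation - modify ant build.xml to copy nutch-default.xml into docs/api/resources/ - adapt XSLT table layout - remove obsolete nutch-conf.xsl - fix typos and normalize spelling in nutch-default.xml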

snagel Sun, 16 Aug 2020 12:04:44 -0700

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


commit 7f51c2530795d7203aa1d0834be8e1c2c1373531
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Wed Apr 29 13:03:01 2020 +0200

    NUTCH-2743 Add list of Nutch properties (nutch-default.xml) to documentation
    - modify ant build.xml to copy nutch-default.xml into docs/api/resources/
    - adapt XSLT table layout
    - remove obsolete nutch-conf.xsl
    - fix typos and normalize spelling in nutch-default.xml
---
 build.xml                                      |  7 ++--
 conf/configuration.xsl                         | 41 +++++++++++++++++------
 conf/nutch-conf.xsl                            | 24 --------------
 conf/nutch-default.xml                         | 46 +++++++++++++-------------
 src/plugin/creativecommons/conf/nutch-site.xml |  6 ++--
 5 files changed, 59 insertions(+), 65 deletions(-)

diff --git a/build.xml b/build.xml
index 0a1bca0..5eb157e 100644
--- a/build.xml
+++ b/build.xml
@@ -786,11 +786,10 @@
     <!-- Copy the plugin.dtd file to the plugin doc-files dir -->
     <copy file="${plugins.dir}/plugin.dtd"
           todir="${build.javadoc}/org/apache/nutch/plugin/doc-files"/>
-  </target>
 
-  <target name="default-doc" description="--> generate default Nutch documentation">
-    <style basedir="${conf.dir}" destdir="${docs.dir}"
-           includes="nutch-default.xml" style="conf/nutch-conf.xsl"/>
+    <!-- Copy the definition of Nutch properties -->
+    <copy file="${conf.dir}/nutch-default.xml" todir="${build.javadoc}/resources/"/>
+    <copy file="${conf.dir}/configuration.xsl" todir="${build.javadoc}/resources/"/>
   </target>
 
     <!-- ================================================================== -->
diff --git a/conf/configuration.xsl b/conf/configuration.xsl
index 79141dc..1399673 100644
--- a/conf/configuration.xsl
+++ b/conf/configuration.xsl
@@ -19,20 +19,39 @@
 <xsl:output method="html"/>
 <xsl:template match="configuration">
 <html>
+ <head>
+  <title>Nutch Configuration Properties</title>
+  <meta charset="utf-8"/>
+  <style>
+    table { width: 100%; table-layout: fixed; }
+    th,td { padding: 0.2em 0.5em; }
+    td { overflow:hidden; vertical-align:top; }
+    th { background-color: #e0e0e0; }
+    tr { background-color: #f0f0f0; }
+    tr:nth-child(odd) { background-color: #fcfcfc; }
+    th.name { width: 20% }
+    th.value { width: 30% }
+    th.description { width: 50% }
+  </style>
+ </head>
 <body>
-<table border="1">
-<tr>
- <td>name</td>
- <td>value</td>
- <td>description</td>
-</tr>
+<table>
+ <thead>
+  <tr>
+   <th class="name">Nutch Property Name</th>
+   <th class="value">Default Value</th>
+   <th class="description">Description</th>
+  </tr>
+ </thead>
+ <tbody>
 <xsl:for-each select="property">
-<tr>
-  <td><a name="{name}"><xsl:value-of select="name"/></a></td>
-  <td><xsl:value-of select="value"/></td>
-  <td><xsl:value-of select="description"/></td>
-</tr>
+  <tr>
+   <td><a name="{name}"><xsl:value-of select="name"/></a></td>
+   <td><xsl:value-of select="value"/></td>
+   <td><xsl:value-of select="description"/></td>
+  </tr>
 </xsl:for-each>
+ </tbody>
 </table>
 </body>
 </html>
diff --git a/conf/nutch-conf.xsl b/conf/nutch-conf.xsl
deleted file mode 100644
index 36a2275..0000000
--- a/conf/nutch-conf.xsl
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0"?>
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
-<xsl:output method="html"/>
-<xsl:template match="nutch-conf">
-<html>
-<body>
-<table border="1">
-<tr>
- <td>name</td>
- <td>value</td>
- <td>description</td>
-</tr>
-<xsl:for-each select="property">
-<tr>
-  <td><xsl:value-of select="name"/></td>
-  <td><xsl:value-of select="value"/></td>
-  <td><xsl:value-of select="description"/></td>
-</tr>
-</xsl:for-each>
-</table>
-</body>
-</html>
-</xsl:template>
-</xsl:stylesheet>
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index f0afd1c..b833288 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -40,7 +40,7 @@
   <name>file.content.limit</name>
   <value>1048576</value>
   <description>The length limit for downloaded content using the file://
-  protocol, in bytes. If this value is nonnegative (>=0), content longer
+  protocol, in bytes. If this value is non-negative (>=0), content longer
   than it will be truncated; otherwise, no truncation at all. Do not
   confuse this setting with the http.content.limit setting.
   </description>
@@ -50,7 +50,7 @@
   <name>file.crawl.parent</name>
   <value>true</value>
   <description>The crawler is not restricted to the directories that you specified in the
-    Urls file but it is jumping into the parent directories as well. For your own crawlings you can
+    URLs file but it is jumping into the parent directories as well. For your own crawlings you can
     change this behavior (set to false) the way that only directories beneath the directories that you specify get
     crawled.</description>
 </property>
@@ -75,7 +75,7 @@
   And it is probably what we want to set most of time, since file:// URLs
   are meant to be local and we can always use them directly at parsing
   and indexing stages. Otherwise file contents will be saved.
-  !! NO IMPLEMENTED YET !!
+  !! NOT IMPLEMENTED YET !!
   </description>
 </property>
 
@@ -216,7 +216,7 @@
   <name>http.content.limit</name>
   <value>1048576</value>
   <description>The length limit for downloaded content using the http/https
-  protocols, in bytes. If this value is nonnegative (>=0), content longer
+  protocols, in bytes. If this value is non-negative (>=0), content longer
   than it will be truncated; otherwise, no truncation at all. Do not
   confuse this setting with the file.content.limit setting.
   </description>
@@ -226,7 +226,7 @@
   <name>http.time.limit</name>
   <value>-1</value>
   <description>The time limit in seconds to fetch a single document.
-  If this value is nonnegative (>=0), the HTTP protocol implementation
+  If this value is non-negative (>=0), the HTTP protocol implementation
   will stop reading from a socket after http.time.limit seconds have
   been spent for fetching this document.  The HTTP response is then
   marked as truncated.  The http.time.limit should be set to a longer
@@ -394,7 +394,7 @@
   <value>true</value>
   <description>Whether Nutch sends an HTTP If-Modified-Since header. It reduces
   bandwidth when enabled by not downloading pages that respond with an HTTP
-  Not-Modified header. URL's that are not downloaded are not passed through
+  Not-Modified header. URLs that are not downloaded are not passed through
   parse or indexing filters. If you regularly modify filters, you should force
   Nutch to also download unmodified pages by disabling this feature.
   </description>
@@ -426,7 +426,7 @@
   <name>ftp.content.limit</name>
   <value>1048576</value>
   <description>The length limit for downloaded content, in bytes.
-  If this value is nonnegative (>=0), content longer than it will be truncated;
+  If this value is non-negative (>=0), content longer than it will be truncated;
   otherwise, no truncation at all.
   Caution: classical ftp RFCs never defines partial transfer and, in fact,
   some ftp servers out there do not handle client side forced close-down very
@@ -460,7 +460,7 @@
   <value>false</value>
   <description>Whether to keep ftp connection. Useful if crawling same host
   again and again. When set to true, it avoids connection, login and dir list
-  parser setup for subsequent urls. If it is set to true, however, you must
+  parser setup for subsequent URLs. If it is set to true, however, you must
   make sure (roughly):
   (1) ftp.timeout is less than ftp.server.timeout
   (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
@@ -584,7 +584,7 @@
 <property>
   <name>db.update.purge.orphans</name>
   <value>false</value>
-  <description>If true, updatedb will permanently delete URL's marked
+  <description>If true, updatedb will permanently delete URLs marked
   as orphan from the CrawlDb. The plugin scoring-orphan needs to be
   activated to get records marked as orphan. See the plugin's options
   elsewhere in this document.
@@ -596,7 +596,7 @@
     <value>false</value>
     <description>
        !Temporary, can be overwritten with the command line!
-       Normalize urls when updating crawldb
+       Normalize URLs when updating crawldb
     </description>
 </property>
 
@@ -605,7 +605,7 @@
     <value>false</value>
     <description>
        !Temporary, can be overwritten with the command line!
-       Filter urls when updating crawldb
+       Filter URLs when updating crawldb
     </description>
 </property>
 
@@ -749,7 +749,7 @@
 <property>
   <name>db.fetch.retry.max</name>
   <value>3</value>
-  <description>The maximum number of times a url that has encountered
+  <description>The maximum number of times a URL that has encountered
   recoverable errors is generated for fetch.</description>
 </property>
 
@@ -793,7 +793,7 @@
 <property>
   <name>linkdb.max.inlinks</name>
   <value>10000</value>
-  <description>Maximum number of Inlinks per URL to be kept in LinkDb.
+  <description>Maximum number of inlinks per URL to be kept in LinkDb.
   If "invertlinks" finds more inlinks than this number, only the first
   N inlinks will be stored, and the rest will be discarded.
   </description>
@@ -831,8 +831,8 @@
 <property>
   <name>generate.max.count</name>
   <value>-1</value>
-  <description>The maximum number of urls in a single
-  fetchlist.  -1 if unlimited. The urls are counted according
+  <description>The maximum number of URLs in a single
+  fetchlist.  -1 if unlimited. The URLs are counted according
   to the value of the parameter generate.count.mode.
   </description>
 </property>
@@ -1014,7 +1014,7 @@
   <description>Comma-separated list of exceptions not shown with full
   stack trace in logs of fetcher and HTTP protocol implementations.
   The logs may shrink in size significantly, e.g., when for a large
-  unrestriced web crawl unknown hosts are logged shortly without full
+  unrestricted web crawl unknown hosts are logged shortly without full
   stack trace.  The full class name of the exception class (extending
   Throwable) including the package path must be specified.</description>
 </property>
@@ -1116,7 +1116,7 @@
   and follow until the desired depth is reached. A value of 1 means all generated pages are fetched and their first degree
   outlinks are fetched and parsed too. Be careful, this feature is in itself agnostic of the state of the CrawlDB and does not
   know about already fetched pages. A setting larger than 2 will most likely fetch home pages twice in the same fetch cycle.
-  It is highly recommended to set db.ignore.external.links to true to restrict the outlink follower to URL's within the same
+  It is highly recommended to set db.ignore.external.links to true to restrict the outlink follower to URLs within the same
   domain. When disabled (false) the feature is likely to follow duplicates even when depth=1.
   A value of -1 of 0 disables this feature.
   </description>
@@ -1334,8 +1334,8 @@
 <property>
   <name>indexer.score.power</name>
   <value>0.5</value>
-  <description>Determines the power of link analyis scores.  Each
-  pages's boost is set to <i>score<sup>scorePower</sup></i> where
+  <description>Determines the power of link analysis scores. The boost
+  of each page is set to <i>score<sup>scorePower</sup></i> where
   <i>score</i> is its link analysis score and <i>scorePower</i> is the
   value of this parameter.  This is compiled into indexes, so, when
   this is changed, pages must be re-indexed for it to take
@@ -1470,7 +1470,7 @@
 <property>
   <name>plugin.folders</name>
   <value>plugins</value>
-  <description>Directories where nutch plugins are located.  Each
+  <description>Directories where Nutch plugins are located.  Each
   element may be a relative or absolute path.  If absolute, it is used
   as is.  If relative, it is searched for on the classpath.</description>
 </property>
@@ -1772,7 +1772,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
 <property>
   <name>urlfilter.order</name>
   <value></value>
-  <description>The order by which url filters are applied.
+  <description>The order by which URL filters are applied.
   If empty, all available url filters (as dictated by properties
   plugin-includes and plugin-excludes above) are loaded and applied in system
   defined order. If not empty, only named filters are loaded and applied
@@ -2159,7 +2159,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
 <property>
   <name>link.score.updater.clear.score</name>
   <value>0.0f</value>
-  <description>The default score for URL's that are not in the web graph.</description>
+  <description>The default score for URLs that are not in the web graph.</description>
 </property>
 
 <property>
@@ -2580,7 +2580,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
   <value></value>
   <description>
     Choose the type of Queue being used (ex - RabbitMQ, ActiveMq, Kafka, etc). 
-    Currently there exists an implemtation for RabbitMQ producer. 
+    Currently there exists an implementation for RabbitMQ producer.
   </description>
 </property>
 
diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml
index e639746..e28e12a 100644
--- a/src/plugin/creativecommons/conf/nutch-site.xml
+++ b/src/plugin/creativecommons/conf/nutch-site.xml
@@ -1,9 +1,9 @@
 <?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
 <!-- Creative Commons' Nutch configuration -->
 
-<nutch-conf>
+<configuration>
 
 <property>
   <name>http.agent.name</name>
@@ -40,4 +40,4 @@
   </description>
 </property>
 
-</nutch-conf>
+</configuration>

[nutch] 01/35: NUTCH-2743 Add list of Nutch properties (nutch-default.xml) to documentation - modify ant build.xml to copy nutch-default.xml into docs/api/resources/ - adapt XSLT table layout - remove obsolete nutch-conf.xsl - fix typos and normalize spelling in nutch-default.xml

Reply via email to