Author: lewismc Date: Wed Aug 26 02:21:31 2015 New Revision: 1697808 URL: http://svn.apache.org/r1697808 Log: NUTCH-2083 Implement functionality to shadow nutch-selenium-grid-plugin from Mo Omer
Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java nutch/trunk/src/plugin/protocol-selenium/README.md Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1697808&r1=1697807&r2=1697808&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Aug 26 02:21:31 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2083 Implement functionality to shadow nutch-selenium-grid-plugin from Mo Omer (lewismc) + * NUTCH-2049 Upgrade to Hadoop 2.4 (lewismc) * NUTCH-1486 Upgrade to Solr 4.10.2 (lewismc, markus) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1697808&r1=1697807&r2=1697808&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Wed Aug 26 02:21:31 2015 @@ -1785,7 +1785,10 @@ CAUTION: Set the parser.timeout to -1 or <description> A String value representing the flavour of Selenium WebDriver() to use. Currently the following options - exist - firefox, chrome, safari and opera. + exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'. + If 'remote' is used it is essential to also set correct properties for + 'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and + 'selenium.hub.protocol'. 
</description> </property> @@ -1813,6 +1816,30 @@ CAUTION: Set the parser.timeout to -1 or </description> </property> +<property> + <name>selenium.hub.port</name> + <value>4444</value> + <description>Selenium Hub Location connection port</description> +</property> + +<property> + <name>selenium.hub.path</name> + <value>/wd/hub</value> + <description>Selenium Hub Location connection path</description> +</property> + +<property> + <name>selenium.hub.host</name> + <value>localhost</value> + <description>Selenium Hub Location connection host</description> +</property> + +<property> + <name>selenium.hub.protocol</name> + <value>http</value> + <description>Selenium Hub Location connection protocol</description> +</property> + <!-- lib-selenium configuration --> <property> <name>libselenium.page.load.delay</name> Modified: nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1697808&r1=1697807&r2=1697808&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (original) +++ nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java Wed Aug 26 02:21:31 2015 @@ -17,7 +17,6 @@ package org.apache.nutch.protocol.selenium; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; @@ -30,15 +29,20 @@ import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.firefox.FirefoxDriver; import org.openqa.selenium.firefox.FirefoxProfile; +import org.openqa.selenium.remote.DesiredCapabilities; +import 
org.openqa.selenium.remote.RemoteWebDriver; import org.openqa.selenium.safari.SafariDriver; import org.openqa.selenium.support.ui.WebDriverWait; + import com.opera.core.systems.OperaDriver; + import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.io.OutputStream; import java.lang.String; +import java.net.URL; public class HttpWebClient { @@ -77,6 +81,13 @@ public class HttpWebClient { case "opera": driver = new OperaDriver(); break; + case "remote": + String seleniumHubHost = conf.get("selenium.hub.host", "localhost"); + int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444")); + String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub"); + String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http"); + driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox()); + break; default: LOG.error("The Selenium WebDriver choice {} is not available... 
defaulting to FirefoxDriver().", driverType); driver = new FirefoxDriver(); Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java?rev=1697808&r1=1697807&r2=1697808&view=diff ============================================================================== --- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java (original) +++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java Wed Aug 26 02:21:31 2015 @@ -137,11 +137,8 @@ public class NaiveBayesParseFilter imple } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); - } - try { - train(); } catch (Exception e) { @@ -169,7 +166,7 @@ public class NaiveBayesParseFilter imple if (!filterParse(text)) { // kick in the second tier // if parent page found // irrelevent - LOG.info("ParseFilter: NaiveBayes: Page found irrelevent:: " + url); + LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url); LOG.info("Checking outlinks"); Outlink[] out = null; Modified: nutch/trunk/src/plugin/protocol-selenium/README.md URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/README.md?rev=1697808&r1=1697807&r2=1697808&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-selenium/README.md (original) +++ nutch/trunk/src/plugin/protocol-selenium/README.md Wed Aug 26 02:21:31 2015 @@ -1,22 +1,23 @@ Nutch Selenium ============== +# Introduction + This plugin allows you to fetch Javascript pages using [Selenium](http://www.seleniumhq.org/), while relying on the rest of the awesome Nutch stack! 
The underlying code is based on the nutch-htmlunit plugin, which was in turn based on nutch-httpclient. -# IMPORTANT NOTES: +There are essentially two ways in which Nutch can be used with Selenium. - * A version of this plugin which relies on the Selenium Hub/Node system can be found here: [nutch-selenium-grid-plugin](https://github.com/momer/nutch-selenium-grid-plugin) + * Locally (on each node) as a self contained process, or + * via the RemoteWebDriver which connects to [Selenium-Grid](http://www.seleniumhq.org/docs/07_selenium_grid.jsp). A grid consists of a single hub, and one or more nodes. -# Installation (tested on Ubuntu 14.0x) +# Installation ## Part 1: Setting up Selenium - * Ensure that you have Firefox installed + * Ensure that you have Firefox installed. More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox) ``` -# More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox) - sudo apt-get install firefox ``` * Install Xvfb and its associates @@ -51,14 +52,92 @@ sudo export DISPLAY=:11 </description> </property> ``` + +Then ensure that you have the correct configuration set within the following configuration options + +``` +<!-- protocol-selenium plugin properties --> + +<property> + <name>selenium.driver</name> + <value>firefox</value> + <description> + A String value representing the flavour of Selenium + WebDriver() to use. Currently the following options + exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'. + If 'remote' is used it is essential to also set correct properties for + 'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and + 'selenium.hub.protocol'. + </description> +</property> + +<property> + <name>selenium.take.screenshot</name> + <value>false</value> + <description> + Boolean property determining whether the protocol-selenium + WebDriver should capture a screenshot of the URL. 
If set to + true remember to define the 'selenium.screenshot.location' + property as this determines the location screenshots should be + persisted to on HDFS. If that property is not set, screenshots + are simply discarded. + </description> +</property> + +<property> + <name>selenium.screenshot.location</name> + <value></value> + <description> + The location on disk where a URL screenshot should be saved + to if the 'selenium.take.screenshot' property is set to true. + By default this is null, in this case screenshots held in memory + are simply discarded. + </description> +</property> + +<property> + <name>selenium.hub.port</name> + <value>4444</value> + <description>Selenium Hub Location connection port</description> +</property> + +<property> + <name>selenium.hub.path</name> + <value>/wd/hub</value> + <description>Selenium Hub Location connection path</description> +</property> + +<property> + <name>selenium.hub.host</name> + <value>localhost</value> + <description>Selenium Hub Location connection host</description> +</property> + +<property> + <name>selenium.hub.protocol</name> + <value>http</value> + <description>Selenium Hub Location connection protocol</description> +</property> + +<!-- lib-selenium configuration --> +<property> + <name>libselenium.page.load.delay</name> + <value>3</value> + <description> + The delay in seconds to use when loading a page with lib-selenium. This + setting is used by protocol-selenium and protocol-interactiveselenium + since they depend on lib-selenium for fetching. + </description> +</property> +``` + * If you've selected 'remote' value for the 'selenium.driver' property, ensure that you've configured + the additional properties based on your [Selenium-Grid installation](http://www.seleniumhq.org/docs/07_selenium_grid.jsp#installation). 
+ * Compile nutch ``` ant runtime ``` * Start your web crawl (Ensure that you followed the above steps and have started your xvfb display as shown above) -``` -NUTCH_HOME/runtime/local/bin/crawl [-i|--index] [-D \"key=value\"] <Seed Dir> <Crawl Dir> <Num Rounds> -```