Author: lewismc Date: Wed Jul 22 04:08:20 2015 New Revision: 1692216 URL: http://svn.apache.org/r1692216 Log: NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is Fetched
Added: nutch/trunk/src/plugin/lib-selenium/build-ivy.xml - copied, changed from r1687398, nutch/trunk/src/plugin/parse-tika/build-ivy.xml nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt - copied, changed from r1687398, nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java nutch/trunk/src/plugin/lib-selenium/ivy.xml nutch/trunk/src/plugin/lib-selenium/plugin.xml nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java nutch/trunk/src/plugin/protocol-selenium/ivy.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1692216&r1=1692215&r2=1692216&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jul 22 04:08:20 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is Fetched (lewismc) + * NUTCH-2058 Indexer plugin that allows RegEx replacements on the NutchDocument field values (Peter Ciuffetti via mattmann) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1692216&r1=1692215&r2=1692216&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Wed Jul 22 04:08:20 2015 @@ -1736,4 +1736,40 @@ CAUTION: Set the parser.timeout to -1 or </description> </property> +<!-- protocol-selenium plugin properties --> + +<property> + <name>selenium.driver</name> + <value>firefox</value> + <description> + A String value representing the flavour of Selenium + WebDriver() to use. Currently the following options + exist - firefox, chrome, safari and opera. 
+ </description> +</property> + +<property> + <name>selenium.take.screenshot</name> + <value>false</value> + <description> + Boolean property determining whether the protocol-selenium + WebDriver should capture a screenshot of the URL. If set to + true remember to define the 'selenium.screenshot.location' + property as this determines the location screenshots should be + persisted to on HDFS. If that property is not set, screenshots + are simply discarded. + </description> +</property> + +<property> + <name>selenium.screenshot.location</name> + <value></value> + <description> + The location on disk where a URL screenshot should be saved + to if the 'selenium.take.screenshot' property is set to true. + By default this is null, in this case screenshots held in memory + are simply discarded. + </description> +</property> + </configuration> Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1692216&r1=1692215&r2=1692216&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Jul 22 04:08:20 2015 @@ -26,8 +26,6 @@ import java.io.ByteArrayInputStream; import java.util.Arrays; import java.util.HashMap; import java.util.Map; -import java.security.MessageDigest; - import com.google.common.base.Strings; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -102,7 +100,7 @@ import org.slf4j.LoggerFactory; * {"mimeType":"video/quicktime","count":"2"} * {"mimeType":"image/gif","count":"63"} * ] - * + * } * </pre> * <p> * In the case above, the tool would have been run with the <b>-mimeType Copied: nutch/trunk/src/plugin/lib-selenium/build-ivy.xml (from r1687398, nutch/trunk/src/plugin/parse-tika/build-ivy.xml) URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build-ivy.xml?p2=nutch/trunk/src/plugin/lib-selenium/build-ivy.xml&p1=nutch/trunk/src/plugin/parse-tika/build-ivy.xml&r1=1687398&r2=1692216&rev=1692216&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/build-ivy.xml (original) +++ nutch/trunk/src/plugin/lib-selenium/build-ivy.xml Wed Jul 22 04:08:20 2015 @@ -15,7 +15,7 @@ See the License for the specific language governing permissions and limitations under the License. --> -<project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> +<project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> <property name="ivy.install.version" value="2.1.0" /> <condition property="ivy.home" value="${env.IVY_HOME}"> Copied: nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt (from r1687398, nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt) URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt?p2=nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt&p1=nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt&r1=1687398&r2=1692216&rev=1692216&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt (original) +++ nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt Wed Jul 22 04:08:20 2015 @@ -1,8 +1,6 @@ -1. Upgrade Tika depencency in trunk/ivy/ivy.xml +1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml -2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml - -3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml +2. 
Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml To get the list of dependencies and their versions execute: $ ant -f ./build-ivy.xml $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/"\/>/g' Modified: nutch/trunk/src/plugin/lib-selenium/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1692216&r1=1692215&r2=1692216&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-selenium/ivy.xml (original) +++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Wed Jul 22 04:08:20 2015 @@ -42,6 +42,10 @@ <dependency org="com.opera" name="operadriver" rev="1.5"> <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> </dependency> + <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" > + <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> + <exclude org="org.seleniumhq.selenium" name="selenium-java" /> + </dependency> <!-- end selenium dependencies --> </dependencies> Modified: nutch/trunk/src/plugin/lib-selenium/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1692216&r1=1692215&r2=1692216&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-selenium/plugin.xml (original) +++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Wed Jul 22 04:08:20 2015 @@ -27,11 +27,54 @@ <runtime> <library name="lib-selenium.jar"> <export name="*"/> - </library> + </library> + <library name="cglib-nodep-2.1_3.jar"/> + <library name="commons-codec-1.9.jar"/> + <library name="commons-collections-3.2.1.jar"/> + <library name="commons-exec-1.1.jar"/> + <library name="commons-io-2.4.jar"/> + <library name="commons-jxpath-1.3.jar"/> + <library name="commons-lang3-3.3.2.jar"/> + <library name="commons-logging-1.1.3.jar"/> + <library name="cssparser-0.9.14.jar"/> + <library name="gson-2.3.jar"/> + <library 
name="guava-18.0.jar"/> + <library name="htmlunit-2.15.jar"/> + <library name="htmlunit-core-js-2.15.jar"/> + <library name="httpclient-4.3.4.jar"/> + <library name="httpcore-4.3.2.jar"/> + <library name="httpmime-4.3.3.jar"/> + <library name="ini4j-0.5.2.jar"/> + <library name="jetty-http-8.1.15.v20140411.jar"/> + <library name="jetty-io-8.1.15.v20140411.jar"/> + <library name="jetty-util-8.1.15.v20140411.jar"/> + <library name="jetty-websocket-8.1.15.v20140411.jar"/> + <library name="jna-3.4.0.jar"/> + <library name="nekohtml-1.9.21.jar"/> + <library name="netty-3.5.2.Final.jar"/> + <library name="operadriver-1.5.jar"/> + <library name="operalaunchers-1.1.jar"/> + <library name="platform-3.4.0.jar"/> + <library name="protobuf-java-2.4.1.jar"/> + <library name="sac-1.3.jar"/> + <library name="selenium-api-2.44.0.jar"/> + <library name="selenium-chrome-driver-2.44.0.jar"/> + <library name="selenium-firefox-driver-2.44.0.jar"/> + <library name="selenium-htmlunit-driver-2.44.0.jar"/> + <library name="selenium-ie-driver-2.44.0.jar"/> + <library name="selenium-java-2.44.0.jar"/> + <library name="selenium-remote-driver-2.44.0.jar"/> + <library name="selenium-safari-driver-2.44.0.jar"/> + <library name="selenium-support-2.44.0.jar"/> + <library name="serializer-2.7.1.jar"/> + <library name="webbit-0.4.14.jar"/> + <library name="xalan-2.7.1.jar"/> + <library name="xercesImpl-2.11.0.jar"/> + <library name="xml-apis-1.4.01.jar"/> </runtime> <requires> - <library name="selenium-java-2.4.0.jar"> + <library name="selenium-java-2.44.0.jar"> <export name="*"/> </library> <library name="operadriver-1.5.jar"> Modified: nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1692216&r1=1692215&r2=1692216&view=diff ============================================================================== --- 
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (original) +++ nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java Wed Jul 22 04:08:20 2015 @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -17,19 +17,32 @@ package org.apache.nutch.protocol.selenium; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.openqa.selenium.By; +import org.openqa.selenium.OutputType; +import org.openqa.selenium.TakesScreenshot; import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.firefox.FirefoxDriver; import org.openqa.selenium.firefox.FirefoxProfile; +import org.openqa.selenium.safari.SafariDriver; import org.openqa.selenium.support.ui.WebDriverWait; - +import com.opera.core.systems.OperaDriver; +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.OutputStream; import java.lang.String; public class HttpWebClient { - private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol"); + private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class); public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() { @@ -45,23 +58,67 @@ public class HttpWebClient { }; }; + /** + * Function for obtaining the HTML BODY using the selected + * {@link org.openqa.selenium.WebDriver}. 
+ * There are a number of configuration properties within + * <code>nutch-site.xml</code> which determine whether to + * take screenshots of the rendered pages and persist them + * as timestamped .png's into HDFS. + * @param url the URL to fetch and render + * @param conf the {@link org.apache.hadoop.conf.Configuration} + * @return the rendered inner HTML page + */ public static String getHtmlPage(String url, Configuration conf) { - WebDriver driver = null; + WebDriver driver = null; try { - driver = new FirefoxDriver(); - //} WebDriver driver = threadWebDriver.get(); - // if (driver == null) { - // driver = new FirefoxDriver(); - // } - + String driverType = conf.get("selenium.driver", "firefox"); + switch (driverType) { + case "firefox": + driver = new FirefoxDriver(); + break; + case "chrome": + driver = new ChromeDriver(); + break; + case "safari": + driver = new SafariDriver(); + break; + case "opera": + driver = new OperaDriver(); + break; + default: + LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType); + driver = new FirefoxDriver(); + break; + } + LOG.debug("Selenium {} WebDriver selected.", driverType); driver.get(url); // Wait for the page to load, timeout after 3 seconds new WebDriverWait(driver, 3); - String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML"); + if (conf.getBoolean("selenium.take.screenshot", false)) { + File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE); + LOG.debug("In-memory screenshot taken of: {}", url); + FileSystem fs = FileSystem.get(conf); + Path screenshotPath = new Path(conf.get("selenium.screenshot.location") + "/" + srcFile.getName()); + if (screenshotPath != null) { + OutputStream os = null; + if (!fs.exists(screenshotPath)) { + LOG.debug("No existing screenshot already exists... 
creating new file at {} {}.", screenshotPath, srcFile.getName()); + os = fs.create(screenshotPath); + } + InputStream is = new BufferedInputStream(new FileInputStream(srcFile)); + IOUtils.copyBytes(is, os, conf); + LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); + } else { + LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " + + "'selenium.screenshot.location' is absent from nutch-site.xml.", url); + } + } + String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML"); return innerHtml; // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit @@ -75,4 +132,4 @@ public class HttpWebClient { public static String getHtmlPage(String url) { return getHtmlPage(url, null); } -} \ No newline at end of file +} Modified: nutch/trunk/src/plugin/protocol-selenium/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/ivy.xml?rev=1692216&r1=1692215&r2=1692216&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-selenium/ivy.xml (original) +++ nutch/trunk/src/plugin/protocol-selenium/ivy.xml Wed Jul 22 04:08:20 2015 @@ -42,6 +42,10 @@ <dependency org="com.opera" name="operadriver" rev="1.5"> <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> </dependency> + <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" > + <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> + <exclude org="org.seleniumhq.selenium" name="selenium-java" /> + </dependency> <!-- end selenium dependencies --> </dependencies>