Author: lewismc
Date: Wed Jul 22 04:08:20 2015
New Revision: 1692216

URL: http://svn.apache.org/r1692216
Log:
NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is 
Fetched

Added:
    nutch/trunk/src/plugin/lib-selenium/build-ivy.xml
      - copied, changed from r1687398, 
nutch/trunk/src/plugin/parse-tika/build-ivy.xml
    nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt
      - copied, changed from r1687398, 
nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
    nutch/trunk/src/plugin/lib-selenium/ivy.xml
    nutch/trunk/src/plugin/lib-selenium/plugin.xml
    
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
    nutch/trunk/src/plugin/protocol-selenium/ivy.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 22 04:08:20 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is 
Fetched (lewismc)
+
 * NUTCH-2058 Indexer plugin that allows RegEx replacements on the 
NutchDocument 
   field values (Peter Ciuffetti via mattmann)
 

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Jul 22 04:08:20 2015
@@ -1736,4 +1736,40 @@ CAUTION: Set the parser.timeout to -1 or
   </description>
 </property>
 
+<!-- protocol-selenium plugin properties -->
+
+<property>
+  <name>selenium.driver</name>
+  <value>firefox</value>
+  <description>
+    A String value representing the flavour of Selenium 
+    WebDriver() to use. Currently the following options
+    exist - firefox, chrome, safari and opera.
+  </description>
+</property>
+
+<property>
+  <name>selenium.take.screenshot</name>
+  <value>false</value>
+  <description>
+    Boolean property determining whether the protocol-selenium
+    WebDriver should capture a screenshot of the URL. If set to
+    true remember to define the 'selenium.screenshot.location' 
+    property as this determines the location screenshots should be 
+    persisted to on HDFS. If that property is not set, screenshots
+    are simply discarded.
+  </description>
+</property>
+
+<property>
+  <name>selenium.screenshot.location</name>
+  <value></value>
+  <description>
+    The location on disk where a URL screenshot should be saved
+    to if the 'selenium.take.screenshot' property is set to true.
+    By default this is null; in this case screenshots held in memory
+    are simply discarded.
+  </description>
+</property>
+
 </configuration>

Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Jul 22 
04:08:20 2015
@@ -26,8 +26,6 @@ import java.io.ByteArrayInputStream;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
-import java.security.MessageDigest;
-
 import com.google.common.base.Strings;
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
@@ -102,7 +100,7 @@ import org.slf4j.LoggerFactory;
  *     {"mimeType":"video/quicktime","count":"2"}
  *     {"mimeType":"image/gif","count":"63"}
  *   ]
- *  
+ * }
  * </pre>
  * <p>
  * In the case above, the tool would have been run with the <b>-mimeType

Copied: nutch/trunk/src/plugin/lib-selenium/build-ivy.xml (from r1687398, 
nutch/trunk/src/plugin/parse-tika/build-ivy.xml)
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build-ivy.xml?p2=nutch/trunk/src/plugin/lib-selenium/build-ivy.xml&p1=nutch/trunk/src/plugin/parse-tika/build-ivy.xml&r1=1687398&r2=1692216&rev=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build-ivy.xml (original)
+++ nutch/trunk/src/plugin/lib-selenium/build-ivy.xml Wed Jul 22 04:08:20 2015
@@ -15,7 +15,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
-<project name="parse-tika" default="deps-jar" 
xmlns:ivy="antlib:org.apache.ivy.ant">
+<project name="lib-selenium" default="deps-jar" 
xmlns:ivy="antlib:org.apache.ivy.ant">
 
     <property name="ivy.install.version" value="2.1.0" />
     <condition property="ivy.home" value="${env.IVY_HOME}">

Copied: nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt (from 
r1687398, nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt)
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt?p2=nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt&p1=nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt&r1=1687398&r2=1692216&rev=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt (original)
+++ nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt Wed Jul 22 
04:08:20 2015
@@ -1,8 +1,6 @@
-1. Upgrade Tika depencency in trunk/ivy/ivy.xml
+1. Upgrade the various driver dependency versions in 
src/plugin/lib-selenium/ivy.xml
 
-2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
-
-3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml
+2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
    To get the list of dependencies and their versions execute:
    $ ant -f ./build-ivy.xml
    $ ls lib | sed 's/^/      <library name="/g' | sed 's/$/"\/>/g'

Modified: nutch/trunk/src/plugin/lib-selenium/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/ivy.xml (original)
+++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Wed Jul 22 04:08:20 2015
@@ -42,6 +42,10 @@
     <dependency org="com.opera" name="operadriver" rev="1.5">
       <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
     </dependency>
+    <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+      <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+    </dependency>
     <!-- end selenium dependencies -->
   </dependencies>
   

Modified: nutch/trunk/src/plugin/lib-selenium/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/plugin.xml (original)
+++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Wed Jul 22 04:08:20 2015
@@ -27,11 +27,54 @@
    <runtime>
      <library name="lib-selenium.jar">
         <export name="*"/>
-     </library>       
+     </library>
+      <library name="cglib-nodep-2.1_3.jar"/>
+      <library name="commons-codec-1.9.jar"/>
+      <library name="commons-collections-3.2.1.jar"/>
+      <library name="commons-exec-1.1.jar"/>
+      <library name="commons-io-2.4.jar"/>
+      <library name="commons-jxpath-1.3.jar"/>
+      <library name="commons-lang3-3.3.2.jar"/>
+      <library name="commons-logging-1.1.3.jar"/>
+      <library name="cssparser-0.9.14.jar"/>
+      <library name="gson-2.3.jar"/>
+      <library name="guava-18.0.jar"/>
+      <library name="htmlunit-2.15.jar"/>
+      <library name="htmlunit-core-js-2.15.jar"/>
+      <library name="httpclient-4.3.4.jar"/>
+      <library name="httpcore-4.3.2.jar"/>
+      <library name="httpmime-4.3.3.jar"/>
+      <library name="ini4j-0.5.2.jar"/>
+      <library name="jetty-http-8.1.15.v20140411.jar"/>
+      <library name="jetty-io-8.1.15.v20140411.jar"/>
+      <library name="jetty-util-8.1.15.v20140411.jar"/>
+      <library name="jetty-websocket-8.1.15.v20140411.jar"/>
+      <library name="jna-3.4.0.jar"/>
+      <library name="nekohtml-1.9.21.jar"/>
+      <library name="netty-3.5.2.Final.jar"/>
+      <library name="operadriver-1.5.jar"/>
+      <library name="operalaunchers-1.1.jar"/>
+      <library name="platform-3.4.0.jar"/>
+      <library name="protobuf-java-2.4.1.jar"/>
+      <library name="sac-1.3.jar"/>
+      <library name="selenium-api-2.44.0.jar"/>
+      <library name="selenium-chrome-driver-2.44.0.jar"/>
+      <library name="selenium-firefox-driver-2.44.0.jar"/>
+      <library name="selenium-htmlunit-driver-2.44.0.jar"/>
+      <library name="selenium-ie-driver-2.44.0.jar"/>
+      <library name="selenium-java-2.44.0.jar"/>
+      <library name="selenium-remote-driver-2.44.0.jar"/>
+      <library name="selenium-safari-driver-2.44.0.jar"/>
+      <library name="selenium-support-2.44.0.jar"/>
+      <library name="serializer-2.7.1.jar"/>
+      <library name="webbit-0.4.14.jar"/>
+      <library name="xalan-2.7.1.jar"/>
+      <library name="xercesImpl-2.11.0.jar"/>
+      <library name="xml-apis-1.4.01.jar"/>
    </runtime>
 
    <requires>
-     <library name="selenium-java-2.4.0.jar">
+     <library name="selenium-java-2.44.0.jar">
        <export name="*"/>
      </library>
      <library name="operadriver-1.5.jar">

Modified: 
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
 Wed Jul 22 04:08:20 2015
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -17,19 +17,32 @@
 package org.apache.nutch.protocol.selenium;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.openqa.selenium.By;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
 import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
 import org.openqa.selenium.firefox.FirefoxDriver;
 import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.safari.SafariDriver;
 import org.openqa.selenium.support.ui.WebDriverWait;
-
+import com.opera.core.systems.OperaDriver;
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
 import java.lang.String;
 
 public class HttpWebClient {
 
-  private static final Logger LOG = 
LoggerFactory.getLogger("org.apache.nutch.protocol");
+  private static final Logger LOG = 
LoggerFactory.getLogger(HttpWebClient.class);
 
   public static ThreadLocal<WebDriver> threadWebDriver = new 
ThreadLocal<WebDriver>() {
 
@@ -45,23 +58,67 @@ public class HttpWebClient {
     };
   };
 
+  /**
+   * Function for obtaining the HTML BODY using the selected
+   * {@link org.openqa.selenium.WebDriver}.
+   * There are a number of configuration properties within
+   * <code>nutch-site.xml</code> which determine whether to
+   * take screenshots of the rendered pages and persist them
+   * as timestamped .png's into HDFS.
+   * @param url the URL to fetch and render
+   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * @return the rendered inner HTML page
+   */
   public static String getHtmlPage(String url, Configuration conf) {
-    WebDriver driver = null;
 
+    WebDriver driver = null;
     try {
-      driver = new FirefoxDriver();
-      //} WebDriver driver = threadWebDriver.get();
-      //  if (driver == null) {
-      //    driver = new FirefoxDriver();
-      //  }
-
+      String driverType  = conf.get("selenium.driver", "firefox");
+      switch (driverType) {
+      case "firefox":
+        driver = new FirefoxDriver();
+        break;
+      case "chrome":
+        driver = new ChromeDriver();
+        break;
+      case "safari":
+        driver = new SafariDriver();
+        break;
+      case "opera":
+        driver = new OperaDriver();
+        break;
+      default:
+        LOG.error("The Selenium WebDriver choice {} is not available... 
defaulting to FirefoxDriver().", driverType);
+        driver = new FirefoxDriver();
+        break;
+      }
+      LOG.debug("Selenium {} WebDriver selected.", driverType);
       driver.get(url);
 
       // Wait for the page to load, timeout after 3 seconds
       new WebDriverWait(driver, 3);
 
-      String innerHtml = 
driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+      if (conf.getBoolean("selenium.take.screenshot", false)) {
+        File srcFile = 
((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+        LOG.debug("In-memory screenshot taken of: {}", url);
+        FileSystem fs = FileSystem.get(conf);
+        Path screenshotPath = new 
Path(conf.get("selenium.screenshot.location") + "/" + srcFile.getName());
+        if (screenshotPath != null) {
+          OutputStream os = null;
+          if (!fs.exists(screenshotPath)) {
+            LOG.debug("No existing screenshot already exists... creating new 
file at {} {}.", screenshotPath, srcFile.getName());
+            os = fs.create(screenshotPath);
+          }
+          InputStream is = new BufferedInputStream(new 
FileInputStream(srcFile));
+          IOUtils.copyBytes(is, os, conf);
+          LOG.debug("Screenshot for {} successfully saved to: {} {}", url, 
screenshotPath, srcFile.getName()); 
+        } else {
+          LOG.warn("Screenshot for {} not saved to HDFS (subsequently 
disgarded) as value for "
+              + "'selenium.screenshot.location' is absent from 
nutch-site.xml.", url);
+        }
+      }
 
+      String innerHtml = 
driver.findElement(By.tagName("body")).getAttribute("innerHTML");
       return innerHtml;
 
       // I'm sure this catch statement is a code smell ; borrowing it from 
lib-htmlunit
@@ -75,4 +132,4 @@ public class HttpWebClient {
   public static String getHtmlPage(String url) {
     return getHtmlPage(url, null);
   }
-}
\ No newline at end of file
+}

Modified: nutch/trunk/src/plugin/protocol-selenium/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/ivy.xml?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/ivy.xml (original)
+++ nutch/trunk/src/plugin/protocol-selenium/ivy.xml Wed Jul 22 04:08:20 2015
@@ -42,6 +42,10 @@
     <dependency org="com.opera" name="operadriver" rev="1.5">
       <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
     </dependency>
+    <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+      <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+    </dependency>
     <!-- end selenium dependencies -->
   </dependencies>
   


Reply via email to