Author: lewismc
Date: Thu Feb 26 18:31:39 2015
New Revision: 1662530

URL: http://svn.apache.org/r1662530
Log:
NUTCH-1933 nutch-selenium plugin

Added:
    nutch/trunk/src/plugin/lib-selenium/
    nutch/trunk/src/plugin/lib-selenium/build.xml
    nutch/trunk/src/plugin/lib-selenium/ivy.xml
    nutch/trunk/src/plugin/lib-selenium/plugin.xml
    nutch/trunk/src/plugin/lib-selenium/src/
    nutch/trunk/src/plugin/lib-selenium/src/java/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
    nutch/trunk/src/plugin/protocol-selenium/
    nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
    nutch/trunk/src/plugin/protocol-selenium/build.xml
    nutch/trunk/src/plugin/protocol-selenium/ivy.xml
    nutch/trunk/src/plugin/protocol-selenium/plugin.xml
    nutch/trunk/src/plugin/protocol-selenium/src/
    nutch/trunk/src/plugin/protocol-selenium/src/java/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
    nutch/trunk/src/plugin/protocol-selenium/src/target/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/build.xml
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 26 18:31:39 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc)
+
 * NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel, lewismc)
 
 * NUTCH-1724 LinkDBReader to support regex output filtering (markus)

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Thu Feb 26 18:31:39 2015
@@ -184,6 +184,7 @@
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
       <packageset dir="${plugins.dir}/lib-http/src/java"/>
+      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
@@ -197,6 +198,7 @@
       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
@@ -591,6 +593,7 @@
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
       <packageset dir="${plugins.dir}/lib-http/src/java"/>
+      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
@@ -604,6 +607,7 @@
       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
@@ -985,6 +989,8 @@
         <source path="${plugins.dir}/language-identifier/src/test/" />
         <source path="${plugins.dir}/lib-http/src/java/" />
         <source path="${plugins.dir}/lib-http/src/test/" />
+        <source path="${plugins.dir}/lib-selenium/src/java/" />
+        <source path="${plugins.dir}/lib-selenium/src/test/" />
         <source path="${plugins.dir}/lib-regex-filter/src/java/" />
         <source path="${plugins.dir}/lib-regex-filter/src/test/" />
         <source path="${plugins.dir}/microformats-reltag/src/java/" />
@@ -1008,6 +1014,8 @@
         <source path="${plugins.dir}/protocol-httpclient/src/test/" />
         <source path="${plugins.dir}/protocol-http/src/java/" />
         <source path="${plugins.dir}/protocol-http/src/test/" />
+        <source path="${plugins.dir}/protocol-selenium/src/java"/>
+        <source path="${plugins.dir}/protocol-selenium/src/test"/>
         <source path="${plugins.dir}/scoring-depth/src/java/" />
         <source path="${plugins.dir}/scoring-link/src/java/" />
         <source path="${plugins.dir}/scoring-opic/src/java/" />

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Thu Feb 26 18:31:39 2015
@@ -23,24 +23,24 @@
                        database etc.
                </description>
        </info>
-
+       
        <configurations>
                <include file="${basedir}/ivy/ivy-configurations.xml" />
        </configurations>
-
+       
        <publications>
                <!--get the artifact from our module name -->
                <artifact conf="master" />
        </publications>
-
+       
        <dependencies>
                <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1"
                        conf="*->master" />
                <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
                        conf="*->master" />
-
+               
                <dependency org="log4j" name="log4j" rev="1.2.15" conf="*->master" />
-
+               
                <dependency org="commons-lang" name="commons-lang" rev="2.6"
                        conf="*->default" />
                <dependency org="commons-collections" name="commons-collections"
@@ -49,7 +49,7 @@
                        rev="3.1" conf="*->master" />
                <dependency org="commons-codec" name="commons-codec" rev="1.3"
                        conf="*->default" />
-
+               
                <dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0"
                        conf="*->default">
                        <exclude org="hsqldb" name="hsqldb" />

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Feb 26 18:31:39 2015
@@ -50,6 +50,8 @@
      <ant dir="protocol-ftp" target="deploy"/>
      <ant dir="protocol-http" target="deploy"/>
      <ant dir="protocol-httpclient" target="deploy"/>
+     <ant dir="lib-selenium" target="deploy"/>
+     <ant dir="protocol-selenium" target="deploy" />
      <ant dir="parse-ext" target="deploy"/>
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
@@ -149,6 +151,8 @@
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-http" target="clean"/>
     <ant dir="protocol-httpclient" target="clean"/>
+    <ant dir="lib-selenium" target="clean"/>
+    <ant dir="protocol-selenium" target="clean" />
     <ant dir="parse-ext" target="clean"/>
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-html" target="clean"/>

Added: nutch/trunk/src/plugin/lib-selenium/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/build.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/build.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">    
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+  </path>
+</project>

Added: nutch/trunk/src/plugin/lib-selenium/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/ivy.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+    
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/lib-selenium/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/plugin.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! A common framework for http protocol implementations
+ !-->
+<plugin
+   id="lib-selenium"
+   name="HTTP Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-selenium.jar">
+        <export name="*"/>
+     </library>       
+   </runtime>
+
+   <requires>
+     <library name="selenium-java-2.4.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="operadriver-1.5.jar">
+       <export name="*"/>
+       <exclude name="selenium-remote-driver" />
+     </library>
+   </requires>
+</plugin>

Added: nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (added)
+++ nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java Thu Feb 26 18:31:39 2015
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.support.ui.WebDriverWait;
+
+import java.lang.String;
+
+public class HttpWebClient {
+
+  private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol");
+
+  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
+
+    @Override
+    protected WebDriver initialValue()
+    {
+      FirefoxProfile profile = new FirefoxProfile();
+      profile.setPreference("permissions.default.stylesheet", 2);
+      profile.setPreference("permissions.default.image", 2);
+      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
+      WebDriver driver = new FirefoxDriver(profile);
+      return driver;
+    };
+  };
+
+  public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = null;
+
+    try {
+      driver = new FirefoxDriver();
+      //} WebDriver driver = threadWebDriver.get();
+      //  if (driver == null) {
+      //    driver = new FirefoxDriver();
+      //  }
+
+      driver.get(url);
+
+      // Wait for the page to load, timeout after 3 seconds
+      new WebDriverWait(driver, 3);
+
+      String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+
+      return innerHtml;
+
+      // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    } finally {
+      if (driver != null) try { driver.quit(); } catch (Exception e) { throw new RuntimeException(e); }
+    }
+  };
+
+  public static String getHtmlPage(String url) {
+    return getHtmlPage(url, null);
+  }
+}
\ No newline at end of file

Added: nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

Added: nutch/trunk/src/plugin/protocol-selenium/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/build.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/build.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-selenium/*.jar" />
+    </fileset>
+  </path>
+
+</project>

Added: nutch/trunk/src/plugin/protocol-selenium/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="default"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+    
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/protocol-selenium/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/plugin.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/plugin.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/plugin.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-selenium"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-selenium.jar">
+         <export name="*"/>
+      </library>
+      <library name="cglib-nodep-2.1_3.jar"/>
+      <library name="commons-codec-1.9.jar"/>
+      <library name="commons-collections-3.2.1.jar"/>
+      <library name="commons-exec-1.1.jar"/>
+      <library name="commons-io-2.4.jar"/>
+      <library name="commons-jxpath-1.3.jar"/>
+      <library name="commons-lang3-3.3.2.jar"/>
+      <library name="commons-logging-1.1.3.jar"/>
+      <library name="cssparser-0.9.14.jar"/>
+      <library name="gson-2.3.jar"/>
+      <library name="guava-18.0.jar"/>
+      <library name="htmlunit-2.15.jar"/>
+      <library name="htmlunit-core-js-2.15.jar"/>
+      <library name="httpclient-4.3.4.jar"/>
+      <library name="httpcore-4.3.2.jar"/>
+      <library name="httpmime-4.3.3.jar"/>
+      <library name="ini4j-0.5.2.jar"/>
+      <library name="jetty-http-8.1.15.v20140411.jar"/>
+      <library name="jetty-io-8.1.15.v20140411.jar"/>
+      <library name="jetty-util-8.1.15.v20140411.jar"/>
+      <library name="jetty-websocket-8.1.15.v20140411.jar"/>
+      <library name="jna-3.4.0.jar"/>
+      <library name="nekohtml-1.9.21.jar"/>
+      <library name="netty-3.5.2.Final.jar"/>
+      <library name="operadriver-1.5.jar"/>
+      <library name="operalaunchers-1.1.jar"/>
+      <library name="platform-3.4.0.jar"/>
+      <library name="protobuf-java-2.4.1.jar"/>
+      <library name="sac-1.3.jar"/>
+      <library name="selenium-api-2.44.0.jar"/>
+      <library name="selenium-chrome-driver-2.44.0.jar"/>
+      <library name="selenium-firefox-driver-2.44.0.jar"/>
+      <library name="selenium-htmlunit-driver-2.44.0.jar"/>
+      <library name="selenium-ie-driver-2.44.0.jar"/>
+      <library name="selenium-java-2.44.0.jar"/>
+      <library name="selenium-remote-driver-2.44.0.jar"/>
+      <library name="selenium-safari-driver-2.44.0.jar"/>
+      <library name="selenium-support-2.44.0.jar"/>
+      <library name="serializer-2.7.1.jar"/>
+      <library name="webbit-0.4.14.jar"/>
+      <library name="xalan-2.7.1.jar"/>
+      <library name="xercesImpl-2.11.0.jar"/>
+      <library name="xml-apis-1.4.01.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+      <import plugin="lib-selenium"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.selenium"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.selenium.Http"
+                      class="org.apache.nutch.protocol.selenium.Http">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java Thu Feb 26 18:31:39 2015
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.protocol.selenium.HttpResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  public Http() {
+    super(LOG);
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}

Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java Thu Feb 26 18:31:39 2015
@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+
+/*
+ * Most of this code was borrowed from protocol-htmlunit, which in turn
+ * borrowed it from protocol-httpclient.
+ */
+
+/**
+ * Response of the protocol-selenium plugin for a single {@code http} URL.
+ *
+ * <p>
+ * The constructor sends an HTTP/1.0 {@code GET} over a plain socket and parses
+ * the status line and the headers. If the Content-Type header names HTML or
+ * XHTML, the body is obtained through
+ * {@code HttpWebClient.getHtmlPage(String, Configuration)}; in every other
+ * case (including a missing Content-Type header) the raw body is read from
+ * the socket.
+ * </p>
+ */
+public class HttpResponse implements Response {
+
+  private final Http http;
+  private final URL url;
+  private byte[] content;
+  private int code;
+  private final Metadata headers = new SpellCheckedMetadata();
+
+  /** The nutch configuration */
+  private final Configuration conf;
+
+  /**
+   * Fetches {@code url} and fills in status code, headers and content.
+   *
+   * @param http
+   *          the protocol instance supplying configuration, timeout, proxy
+   *          and request header settings
+   * @param url
+   *          the URL to fetch; its protocol must be {@code http}
+   * @param datum
+   *          the crawl datum; a positive modified time is sent as
+   *          {@code If-Modified-Since}
+   * @throws ProtocolException
+   *           if the URL is not an {@code http} URL or the status line or a
+   *           header cannot be parsed
+   * @throws IOException
+   *           on socket errors, on a premature end of the response head, or
+   *           when reading the raw body of a {@code 200} response fails
+   */
+  public HttpResponse(Http http, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+
+    if (!"http".equals(url.getProtocol()))
+      throw new HttpException("Not an HTTP url:" + url);
+
+    Http.LOG.trace("fetching {}", url);
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      port = 80;
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect, either directly or to the configured proxy
+      String sockHost = http.useProxy() ? http.getProxyHost() : host;
+      int sockPort = http.useProxy() ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      // make request; the charset is named explicitly so the bytes on the
+      // wire do not depend on the platform default
+      OutputStream req = socket.getOutputStream();
+      req.write(buildRequest(host, portString, datum).getBytes("UTF-8"));
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(new BufferedInputStream(
+              socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE);
+
+      StringBuilder line = new StringBuilder();
+
+      // skip over any "100 Continue" responses
+      do {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        // parse headers
+        parseHeaders(in, line);
+      } while (code == 100);
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with Selenium only if content type is HTML or XHTML; anything
+      // else, including a response without Content-Type, is read from the
+      // socket so that the body is not silently dropped
+      boolean isHtml = contentType != null
+          && (contentType.contains("text/html")
+              || contentType.contains("application/xhtml"));
+      if (isHtml) {
+        readPlainContent(url);
+      } else {
+        readRawContent(in);
+      }
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+  /* ----------------- *
+   * <private helpers> *
+   * ----------------- */
+
+  /** Assembles the complete request head, including the final blank line. */
+  private String buildRequest(String host, String portString,
+      CrawlDatum datum) {
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    StringBuilder reqStr = new StringBuilder("GET ");
+    if (http.useProxy()) {
+      // a proxy needs the absolute URL in the request line
+      reqStr.append(url.getProtocol()).append("://").append(host)
+          .append(portString).append(path);
+    } else {
+      reqStr.append(path);
+    }
+    reqStr.append(" HTTP/1.0\r\n");
+
+    reqStr.append("Host: ").append(host).append(portString).append("\r\n");
+
+    // NOTE(review): compressed encodings are advertised here, but
+    // readRawContent() stores the body bytes as received and never looks at
+    // Content-Encoding - confirm whether decoding happens elsewhere.
+    reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+    String userAgent = http.getUserAgent();
+    if ((userAgent == null) || (userAgent.length() == 0)) {
+      Http.LOG.error("User-agent is not set!");
+    } else {
+      reqStr.append("User-Agent: ").append(userAgent).append("\r\n");
+    }
+
+    reqStr.append("Accept-Language: ").append(http.getAcceptLanguage())
+        .append("\r\n");
+
+    reqStr.append("Accept: ").append(http.getAccept()).append("\r\n");
+
+    if (datum.getModifiedTime() > 0) {
+      reqStr.append("If-Modified-Since: ")
+          .append(HttpDateFormat.toString(datum.getModifiedTime()))
+          .append("\r\n");
+    }
+    reqStr.append("\r\n");
+
+    return reqStr.toString();
+  }
+
+  /**
+   * Reads the response body from the socket stream into {@link #content},
+   * stopping at the smaller of the Content-Length header and the configured
+   * maximum content size (when that is non-negative). The stream is closed
+   * afterwards.
+   */
+  private void readRawContent(PushbackInputStream in) throws IOException {
+    try {
+      int contentLength = Integer.MAX_VALUE;
+      String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+      if (contentLengthString != null) {
+        try {
+          contentLength = Integer.parseInt(contentLengthString.trim());
+        } catch (NumberFormatException ex) {
+          throw new HttpException("bad content length: "
+              + contentLengthString, ex);
+        }
+      }
+
+      if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+        contentLength = http.getMaxContent();
+      }
+
+      byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      // Never ask for more than is still allowed: the final chunk is then
+      // truncated to the limit instead of being discarded as a whole, and a
+      // limit of zero causes no (blocking) read at all.
+      int remaining = contentLength;
+      while (remaining > 0) {
+        int bufferFilled = in.read(buffer, 0,
+            Math.min(buffer.length, remaining));
+        if (bufferFilled == -1) {
+          break;
+        }
+        out.write(buffer, 0, bufferFilled);
+        remaining -= bufferFilled;
+      }
+
+      content = out.toByteArray();
+
+    } catch (Exception e) {
+      if (code == 200)
+        throw new IOException(e.toString(), e);
+      // for codes other than 200 OK, we are fine with empty content
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Obtains the page through {@code HttpWebClient.getHtmlPage} and stores it
+   * UTF-8 encoded in {@link #content}.
+   */
+  private void readPlainContent(URL url) throws IOException {
+    String page = HttpWebClient.getHtmlPage(url.toString(), conf);
+
+    content = page.getBytes("UTF-8");
+  }
+
+  /**
+   * Reads one line and returns the number found between its first and second
+   * space (or the end of the line).
+   */
+  private int parseStatusLine(PushbackInputStream in, StringBuilder line)
+      throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    try {
+      return Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': "
+          + e.getMessage(), e);
+    }
+  }
+
+  /**
+   * Splits a header line at the first colon and stores key and value in
+   * {@link #headers}. A line consisting only of whitespace is ignored; any
+   * other line without a colon is rejected.
+   */
+  private void processHeaderLine(StringBuilder line) throws HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      for (int i = 0; i < line.length(); i++) {
+        if (!Character.isWhitespace(line.charAt(i)))
+          throw new HttpException("No colon in header:" + line);
+      }
+      return;
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      char c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  // Adds headers to our headers Metadata
+  private void parseHeaders(PushbackInputStream in, StringBuilder line)
+      throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1)
+          || ((pos = line.indexOf("<HTML")) != -1)
+          || ((pos = line.indexOf("<html")) != -1)) {
+
+        // readLine() turned every byte into one char, so ISO-8859-1 gives
+        // back exactly the bytes that were read (UTF-8 would expand bytes
+        // >= 0x80 into two)
+        in.unread(line.substring(pos).getBytes("ISO-8859-1"));
+        line.setLength(pos);
+
+        try {
+          //TODO: (CM) We don't know the header names here
+          //since we're just handling them generically. It would
+          //be nice to provide some sort of mapping function here
+          //for the returned header names to the standard metadata
+          //names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  /**
+   * Reads one line (terminated by CR, LF or CRLF) into {@code line}, without
+   * the terminator, and returns its length. With {@code allowContinuedLine}
+   * a non-empty line followed by a space or tab is joined with the next one.
+   *
+   * @throws EOFException
+   *           if the stream ends before a line terminator is seen
+   */
+  private static int readLine(PushbackInputStream in, StringBuilder line,
+      boolean allowContinuedLine) throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read();
+        }
+        // intentional fall-through: CR and CRLF end the line like LF
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte without consuming it, or -1 at end of stream.
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    // unread(-1) would push back the byte 0xFF, so only restore real data
+    if (value != -1) {
+      in.unread(value);
+    }
+    return value;
+  }
+}

Added: 
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html?rev=1662530&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
 (added)
+++ 
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
 Thu Feb 26 18:31:39 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p>
+</body>
+</html>

Added: 
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html?rev=1662530&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
 (added)
+++ 
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
 Thu Feb 26 18:31:39 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via HtmlUnit.</p><p></p>
+</body>
+</html>


Reply via email to