Since I lost all the attachments in my message, I'll try to put the diffs here.
I don't know the right way to include the exception traces and dumps here (about 200 KB).
So...
===
letter_nutch_src_diff:
-----
diff -u -r -B /home/alex/nutch2.x__originl/conf/gora.properties
/home/alex/nutch2.x__patched/conf/gora.properties
--- /home/alex/nutch2.x__originl/conf/gora.properties    2014-10-07
18:17:47.642068629 +0400
+++ /home/alex/nutch2.x__patched/conf/gora.properties    2014-10-07
17:04:18.850031165 +0400
@@ -83,3 +83,7 @@
 #gora.datastore.accumulo.user=root
 #gora.datastore.accumulo.password=secret

+
+
+
+gora.datastore.default=org.apache.gora.hbase.store.HBaseStore
Only in /home/alex/nutch2.x__patched/conf: nutch-site.xml
diff -u -r -B /home/alex/nutch2.x__originl/default.properties
/home/alex/nutch2.x__patched/default.properties
--- /home/alex/nutch2.x__originl/default.properties    2014-10-07
18:17:54.062068684 +0400
+++ /home/alex/nutch2.x__patched/default.properties    2014-10-07
18:23:13.142071395 +0400
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
+hadoop.version=2.4.0
 name=apache-nutch
 version=2.3-SNAPSHOT
 final.name=${name}-${version}
diff -u -r -B /home/alex/nutch2.x__originl/ivy/ivy.xml
/home/alex/nutch2.x__patched/ivy/ivy.xml
--- /home/alex/nutch2.x__originl/ivy/ivy.xml    2014-10-07
18:17:49.314068644 +0400
+++ /home/alex/nutch2.x__patched/ivy/ivy.xml    2014-10-07
21:14:32.000000000 +0400
@@ -46,13 +46,11 @@
     <dependency org="commons-codec" name="commons-codec" rev="1.3"
       conf="*->default" />

-    <dependency org="org.apache.hadoop" name="hadoop-core"
-      rev="1.2.0" conf="*->default">
-      <exclude org="net.sf.kosmosfs" name="kfs" />
-      <exclude org="net.java.dev.jets3t" name="jets3t" />
-      <exclude org="org.eclipse.jdt" name="core" />
-      <exclude org="org.mortbay.jetty" name="jsp-*" />
-    </dependency>
+    <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.4.0"/>
+    <dependency org="org.apache.hadoop"
name="hadoop-mapreduce-client-core" rev="2.4.0"/>
+    <dependency org="org.apache.hadoop"
name="hadoop-mapreduce-client-common" rev="2.4.0"/>
+    <!--dependency org="org.apache.hadoop"
name="hadoop-yarn-server-common" rev="2.4.0"/-->
+    <dependency org="org.apache.hbase" name="hbase" rev="0.94.18" />

     <dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
     <dependency org="org.apache.tika" name="tika-core" rev="1.6" />
@@ -80,12 +78,7 @@
     <!--artifacts needed for testing -->
     <dependency org="junit" name="junit" rev="4.11" conf="*->default" />

-    <dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0"
conf="test->default">
-      <exclude org="net.sf.kosmosfs" name="kfs" />
-      <exclude org="net.java.dev.jets3t" name="jets3t" />
-      <exclude org="org.eclipse.jdt" name="core" />
-      <exclude org="org.mortbay.jetty" name="jsp-*" />
-    </dependency>
+    <dependency org="org.apache.hadoop" name="hadoop-test" rev="2.4.0"
conf="test->default"/>

     <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.26"
conf="test->default" />
     <dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.26"
conf="test->default" />
@@ -103,8 +96,9 @@
     <!-- N.B. To use Gora SNAPSHOT's merely replace the 'ref' value with
the SNAPSHOT version
     and add changing="true" alongside the dependency declaration. An
example has been
     provided for the gora-core dependency as below -->
-    <dependency org="org.apache.gora" name="gora-core" rev="0.4"
conf="*->default"/>
-
+    <dependency org="org.apache.gora" name="gora-shims-distribution"
rev="0.5" conf="*->default" />
+    <dependency org="org.apache.gora" name="gora-core" rev="0.5"
conf="*->default" />
+
     <!-- Uncomment this to use SQL as Gora backend. It should be noted
that the
     gora-sql 0.1.1-incubating artifact is NOT compatable with gora-core
0.3. Users should
     downgrade to gora-core 0.2.1 in order to use SQL as a backend. -->
@@ -116,9 +110,7 @@
     <dependency org="mysql" name="mysql-connector-java" rev="5.1.18"
conf="*->default"/>
     -->
     <!-- Uncomment this to use HBase as Gora backend. -->
-    <!--
-    <dependency org="org.apache.gora" name="gora-hbase" rev="0.4"
conf="*->default" />
-    -->
+    <dependency org="org.apache.gora" name="gora-hbase" rev="0.5"
conf="*->default" />
     <!-- Uncomment this to use Accumulo as Gora backend. -->
     <!--
     <dependency org="org.apache.gora" name="gora-accumulo" rev="0.4"
conf="*->default" />
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
2014-10-07 18:17:52.138068668 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
2014-10-07 18:49:15.574084672 +0400
@@ -55,13 +55,7 @@
   @Override
   public void map(String key, WebPage page, Context context)
   throws IOException, InterruptedException {
-   if(Mark.GENERATE_MARK.checkMark(page) == null) {
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; not
generated yet");
-      }
-      return;
-    }
-
+
     String url = TableUtil.unreverseUrl(key);

     scoreData.clear();
@@ -88,7 +82,8 @@
     urlWithScore.setScore(Float.MAX_VALUE);
     pageWritable.setWebPage(page);
     nutchWritable.set(pageWritable);
-    context.write(urlWithScore, nutchWritable);
+    // TODO AM: Temporary commented. It follows to loosing of main part of
Nutch functionality such as scoring, etc.
+    //context.write(urlWithScore, nutchWritable);

     for (ScoreDatum scoreDatum : scoreData) {
       String reversedOut = TableUtil.reverseUrl(scoreDatum.getUrl());
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/crawl/GeneratorReducer.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/crawl/GeneratorReducer.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/crawl/GeneratorReducer.java
2014-10-07 18:17:52.142068668 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/crawl/GeneratorReducer.java
2014-10-07 18:42:43.594081342 +0400
@@ -76,7 +76,7 @@
         hostCountMap.put(hostordomain, hostCount + 1);
       }

-      Mark.GENERATE_MARK.putMark(page, batchId);
+      Mark.GENERATE_MARK.putMark(page, batchId == null ? new Utf8("1") :
batchId);
       page.setBatchId(batchId);
       try {
         context.write(TableUtil.reverseUrl(key.url), page);
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/fetcher/FetcherReducer.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/fetcher/FetcherReducer.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/fetcher/FetcherReducer.java
2014-10-07 18:17:52.754068673 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/fetcher/FetcherReducer.java
2014-10-07 18:45:16.834082644 +0400
@@ -634,7 +634,7 @@
         fit.page.setContentType(new Utf8(content.getContentType()));
         fit.page.setBaseUrl(new Utf8(content.getBaseUrl()));
       }
-      Mark.FETCH_MARK.putMark(fit.page,
Mark.GENERATE_MARK.checkMark(fit.page));
+      Mark.FETCH_MARK.putMark(fit.page,
Mark.GENERATE_MARK.checkMark(fit.page) == null ? new Utf8("1") :
Mark.GENERATE_MARK.checkMark(fit.page));
       String key = TableUtil.reverseUrl(fit.url);

       if (parse) {
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/parse/ParseUtil.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/parse/ParseUtil.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/parse/ParseUtil.java
2014-10-07 18:17:52.326068669 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/parse/ParseUtil.java
2014-10-02 10:15:08.000000000 +0400
@@ -286,6 +286,8 @@
         if (fetchMark != null) {
           Mark.PARSE_MARK.putMark(page, fetchMark);
         }
+        else // TODO AM temporary decision
+            Mark.PARSE_MARK.putMark(page, new Utf8("1"));
       }
     }
   }
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/Host.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/Host.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/Host.java
2014-10-07 18:17:51.994068666 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/Host.java
2014-10-07 20:21:33.986131736 +0400
@@ -55,6 +55,8 @@
     public String toString() {return name;}
   };

+  public int getFieldsCount() { return Field.values().length; }
+
   public static final String[] _ALL_FIELDS = {
   "__g__dirty",
   "metadata",
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/ParseStatus.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/ParseStatus.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/ParseStatus.java
2014-10-07 18:17:51.994068666 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/ParseStatus.java
2014-10-07 20:21:22.982131643 +0400
@@ -51,6 +51,8 @@
     public String toString() {return name;}
   };

+  public int getFieldsCount() { return Field.values().length; }
+
   public static final String[] _ALL_FIELDS = {
   "__g__dirty",
   "majorCode",
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/ProtocolStatus.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/ProtocolStatus.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/ProtocolStatus.java
2014-10-07 18:17:51.994068666 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/ProtocolStatus.java
2014-10-07 20:21:01.962131464 +0400
@@ -54,6 +54,8 @@
     public String toString() {return name;}
   };

+  public int getFieldsCount() { return Field.values().length; }
+
   public static final String[] _ALL_FIELDS = {
   "__g__dirty",
   "code",
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/WebPage.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/WebPage.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/WebPage.java
2014-10-07 18:17:51.994068666 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/WebPage.java
2014-10-07 20:20:10.846131030 +0400
@@ -72,6 +72,8 @@
     public String toString() {return name;}
   };

+  public int getFieldsCount() { return Field.values().length; }
+
   public static final String[] _ALL_FIELDS = {
   "__g__dirty",
   "baseUrl",
===
letter_avro176_src_diff:
-----
diff -u -r
/home/alex/avro-1.7.6__originl/lang/java/avro/src/main/java/org/apache/avro/io/BinaryDecoder.java
/home/alex/avro-1.7.6__bak12/lang/java/avro/src/main/java/org/apache/avro/io/BinaryDecoder.java
---
/home/alex/avro-1.7.6__originl/lang/java/avro/src/main/java/org/apache/avro/io/BinaryDecoder.java
2014-09-25 12:43:08.000000000 +0400
+++
/home/alex/avro-1.7.6__bak12/lang/java/avro/src/main/java/org/apache/avro/io/BinaryDecoder.java
2014-09-25 17:30:54.000000000 +0400
@@ -50,6 +50,7 @@
   private int minPos = 0;
   private int pos = 0;
   private int limit = 0;
+  public static String log = "";

   byte[] getBuf() { return buf; }
   int getPos() { return pos; }
@@ -149,9 +150,9 @@
       }
     }
     pos += len;
-    if (pos > limit) {
-      throw new EOFException();
-    }
+    //if (pos > limit) {
+    //  throw new EOFException();
+    //}
     return (n >>> 1) ^ -(n & 1); // back to two's-complement
   }

@@ -186,9 +187,9 @@
     } else {
       l = n;
     }
-    if (pos > limit) {
-      throw new EOFException();
-    }
+    //if (pos > limit) {
+    //  throw new EOFException();
+    //}
     return (l >>> 1) ^ -(l & 1); // back to two's-complement
   }

@@ -231,9 +232,9 @@
     int len = 1;
     int n = (buf[pos] & 0xff) | ((buf[pos + len++] & 0xff) << 8)
         | ((buf[pos + len++] & 0xff) << 16) | ((buf[pos + len++] & 0xff)
<< 24);
-    if ((pos + 4) > limit) {
-      throw new EOFException();
-    }
+    //if ((pos + 4) > limit) {
+    //  throw new EOFException();
+    //}
     pos += 4;
     return Float.intBitsToFloat(n);
   }
@@ -332,16 +333,23 @@
    */
   protected void doReadBytes(byte[] bytes, int start, int length)
       throws IOException {
+    log = log + " BinaryDecoder, doReadBytes(): length=" + length + ",
start=" + start + ", pos=" + pos + ", limit=" + limit + ", bytes.length=" +
(bytes == null ? "null" : bytes.length) + "";
     if (length < 0)
       throw new AvroRuntimeException("Malformed data. Length is negative: "
                                      + length);
     int remaining = limit - pos;
+    if (length==0 && start==0 && pos==1 && limit==0 && ((bytes == null) ||
(bytes.length==0)))
+      return;
     if (length <= remaining) {
       System.arraycopy(buf, pos, bytes, start, length);
       pos += length;
     } else {
       // read the rest of the buffer
+      try {
       System.arraycopy(buf, pos, bytes, start, remaining);
+      } catch (java.lang.ArrayIndexOutOfBoundsException ex) {
+        return;
+      }
       start += remaining;
       length -= remaining;
       pos = limit;
@@ -466,11 +474,20 @@
    */
   private void ensureBounds(int num) throws IOException {
     int remaining = limit - pos;
+    log = log + " BinaryDecoder, doReadBytes(): num=" + num + ",
remaining=" + remaining + ", pos=" + pos + ", minPos=" + minPos + ",
limit=" + limit + ", buf.length=" + (buf == null ? "null" : buf.length) +
"";
     if (remaining < num) {
+
+    //if (remaining==-1 && pos==1 && minPos==0 && limit==0 && ((buf ==
null) || (buf.length==0) || (buf.length==8192)))
+    //  return;
+
       // move remaining to front
+      try {
       source.compactAndFill(buf, pos, minPos, remaining);
-      if (pos >= limit)
-        throw new EOFException();
+      } catch (java.lang.ArrayIndexOutOfBoundsException ex) {
+         return;
+      }
+      //if (pos >= limit)
+      //  throw new EOFException();
     }
   }

diff -u -r
/home/alex/avro-1.7.6__originl/lang/java/avro/src/main/java/org/apache/avro/util/ByteBufferInputStream.java
/home/alex/avro-1.7.6__bak12/lang/java/avro/src/main/java/org/apache/avro/util/ByteBufferInputStream.java
---
/home/alex/avro-1.7.6__originl/lang/java/avro/src/main/java/org/apache/avro/util/ByteBufferInputStream.java
2014-09-25 12:43:08.000000000 +0400
+++
/home/alex/avro-1.7.6__bak12/lang/java/avro/src/main/java/org/apache/avro/util/ByteBufferInputStream.java
2014-09-22 17:48:23.000000000 +0400
@@ -18,7 +18,7 @@

 package org.apache.avro.util;

-import java.io.EOFException;
+//import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
@@ -37,7 +37,11 @@
    * @throws EOFException if EOF is reached. */
   @Override
   public int read() throws IOException {
-    return getBuffer().get() & 0xff;
+    //return getBuffer().get() & 0xff;
+    ByteBuffer buffer = getBuffer();
+    if (buffer == null)
+      return -1;
+     return buffer.get() & 0xff;
   }

   /** @see InputStream#read(byte[], int, int)
@@ -46,6 +50,10 @@
   public int read(byte[] b, int off, int len) throws IOException {
     if (len == 0) return 0;
     ByteBuffer buffer = getBuffer();
+
+     if (buffer == null)
+       return -1;
+
     int remaining = buffer.remaining();
     if (len > remaining) {
       buffer.get(b, off, remaining);
@@ -61,6 +69,10 @@
   public ByteBuffer readBuffer(int length) throws IOException {
     if (length == 0) return ByteBuffer.allocate(0);
     ByteBuffer buffer = getBuffer();
+
+    if (buffer == null)
+      return ByteBuffer.allocate(0);
+
     if (buffer.remaining() == length) {           // can return current
as-is?
       current++;
       return buffer;                              // return w/o copying
@@ -83,7 +95,8 @@
         return buffer;
       current++;
     }
-    throw new EOFException();
+    //throw new EOFException();
+    return null;
   }
 }


On Fri, Oct 3, 2014 at 2:44 PM, Alex Median <[email protected]> wrote:

>
> Hi,
>
> For about a month I have been in the process of installing Nutch 2.3 in this
> configuration (see subject).
> Nutch 2 (initially with Hadoop 1) was chosen a few months ago, and some of the
> coding is already done.
> We chose Amazon AWS Elastic MapReduce (EMR) as a platform.
> Unfortunately, the EMR Hadoop 1 version, which runs on an old Debian, does not suit us.
> Therefore, we need to set up Nutch 2 in exactly the above
> configuration:
> Hadoop 2.4.0 + HBase 0.94.18 (Amazon Linux: AMI version:3.2.1, Hadoop
> distribution:Amazon 2.4.0, Applications:HBase 0.94.18)
>
> But info about other experiences with Nutch 2 + Hadoop 2 would also be welcome.
>
> What has been done in the last iteration of the installation on a local
> computer:
>
> 1. Nutch 2.x
> 1.1 svn current 2.x version
> 1.2. prepared scripts:
> 1.2.1 ivy:
> <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.4.0">..
> <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core"
> rev="2.4.0">..
> <dependency org="org.apache.gora" name="gora" rev="0.5" conf="*->default"
> />
> <dependency org="org.apache.gora" name="gora-hbase" rev="0.5"
> conf="*->default" />
> etc.
> 1.2.2 default.properties:
> hadoop.version=2.4.0
> version=2.3-SNAPSHOT
> etc.
> 1.3. added public int getFieldsCount() { return Field.values().length; }
> to ProtocolStatus.java, ParseStatus.java, Host.java, WebPage.java.
>
> 2. HBase
> 2.1 svn HBase 0.94.18
> 2.2 prepared for Protobuf 2.5.0 [1], also thanks to Dobromyslov [5]
> 2.3 also generated hbase-0.94.18-hadoop-2.4.0.jar
>
> 3. Gora 0.5 (also was tested for versions 0.4, 0.6-SNAPSHOT, and 0.5.3
> from com.argonio.gora)
>
> 4. Avro 1.7.6 (also played with versions 1.7.4, 1.7.7)
> 4.1 svn
> 4.2 patched for AVRO-813[2]
> 4.3 patched for AVRO-882[3] and rollbacked
> 4.4 patched as mentioned in [4] - commented throwing EOFException against
> org.apache.avro.io.BinaryDecoder.ensureBounds(BinaryDecoder.java:473), etc.
>
> After investigating numerous exceptions over many weeks, a number of changes
> were made in the Nutch 2.x and Avro 1.7.6 code to suppress exceptions and get
> a little further. We had some success — Nutch appears to run a little — but it
> is unstable and incorrect. All of the stages we need pass in a cycle (inject,
> generate, fetch, parse, updatedb), but some functionality is broken or
> ignored.
> It seems that, because of our limited Nutch/Hadoop/HBase experience, we broke
> the normal data exchange between Nutch and HBase (and with Gora and Avro).
> Perhaps some of the fields (and/or some of the data formats) are read and
> written incorrectly. For example, many markers are lost and are temporarily
> emulated in the code in order to pass through the steps; data in the batchId
> field is lost; scoring is broken as well.
>
> Please help us! Perhaps the necessary working builds and/or scripts and
> patches exist somewhere. Maybe someone has positive experience with this.
> I'm ready to publish all my diffs and exception traces.
> Also, I would be very grateful if someone could tell me when we can expect a
> new Nutch 2.3 release; it seems that it will be Hadoop 2-compatible.
>
> [1] http://hbase.apache.org/book/configuration.html
> [2] https://issues.apache.org/jira/browse/AVRO-813
> [3] https://issues.apache.org/jira/browse/AVRO-882
> http://mail-archives.apache.org/mod_mbox/avro-user/201108.mbox/%3ccaanh3_9_cqqbmt4vqyzg8-ikfo4nnlpcuzbbwd4kqoavpek...@mail.gmail.com%3E
> [4]
> http://mail-archives.apache.org/mod_mbox/nutch-user/201409.mbox/%3cCAEmTxX9HrRM00SxerFAdRdZy=wVAd9xCchDTuLaxPQ=wi0q...@mail.gmail.com%3e
> [5]
> http://stackoverflow.com/questions/13946725/configuring-hbase-standalone-mode-with-apache-nutch-java-lang-illegalargumente
> https://github.com/dobromyslov
>
> BR,
> Alex Median
>

Reply via email to