As I lost all attachments in my message, I'll try to put the diffs here.
I don't know the right way to post the exception traces and dumps here (200 KB).
So...
===
letter_nutch_src_diff:
-----
diff -u -r -B /home/alex/nutch2.x__originl/conf/gora.properties
/home/alex/nutch2.x__patched/conf/gora.properties
--- /home/alex/nutch2.x__originl/conf/gora.properties 2014-10-07
18:17:47.642068629 +0400
+++ /home/alex/nutch2.x__patched/conf/gora.properties 2014-10-07
17:04:18.850031165 +0400
@@ -83,3 +83,7 @@
#gora.datastore.accumulo.user=root
#gora.datastore.accumulo.password=secret
+
+
+
+gora.datastore.default=org.apache.gora.hbase.store.HBaseStore
Only in /home/alex/nutch2.x__patched/conf: nutch-site.xml
diff -u -r -B /home/alex/nutch2.x__originl/default.properties
/home/alex/nutch2.x__patched/default.properties
--- /home/alex/nutch2.x__originl/default.properties 2014-10-07
18:17:54.062068684 +0400
+++ /home/alex/nutch2.x__patched/default.properties 2014-10-07
18:23:13.142071395 +0400
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-
+hadoop.version=2.4.0
name=apache-nutch
version=2.3-SNAPSHOT
final.name=${name}-${version}
diff -u -r -B /home/alex/nutch2.x__originl/ivy/ivy.xml
/home/alex/nutch2.x__patched/ivy/ivy.xml
--- /home/alex/nutch2.x__originl/ivy/ivy.xml 2014-10-07
18:17:49.314068644 +0400
+++ /home/alex/nutch2.x__patched/ivy/ivy.xml 2014-10-07
21:14:32.000000000 +0400
@@ -46,13 +46,11 @@
<dependency org="commons-codec" name="commons-codec" rev="1.3"
conf="*->default" />
- <dependency org="org.apache.hadoop" name="hadoop-core"
- rev="1.2.0" conf="*->default">
- <exclude org="net.sf.kosmosfs" name="kfs" />
- <exclude org="net.java.dev.jets3t" name="jets3t" />
- <exclude org="org.eclipse.jdt" name="core" />
- <exclude org="org.mortbay.jetty" name="jsp-*" />
- </dependency>
+ <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.4.0"/>
+ <dependency org="org.apache.hadoop"
name="hadoop-mapreduce-client-core" rev="2.4.0"/>
+ <dependency org="org.apache.hadoop"
name="hadoop-mapreduce-client-common" rev="2.4.0"/>
+ <!--dependency org="org.apache.hadoop"
name="hadoop-yarn-server-common" rev="2.4.0"/-->
+ <dependency org="org.apache.hbase" name="hbase" rev="0.94.18" />
<dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
<dependency org="org.apache.tika" name="tika-core" rev="1.6" />
@@ -80,12 +78,7 @@
<!--artifacts needed for testing -->
<dependency org="junit" name="junit" rev="4.11" conf="*->default" />
- <dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0"
conf="test->default">
- <exclude org="net.sf.kosmosfs" name="kfs" />
- <exclude org="net.java.dev.jets3t" name="jets3t" />
- <exclude org="org.eclipse.jdt" name="core" />
- <exclude org="org.mortbay.jetty" name="jsp-*" />
- </dependency>
+ <dependency org="org.apache.hadoop" name="hadoop-test" rev="2.4.0"
conf="test->default"/>
<dependency org="org.mortbay.jetty" name="jetty" rev="6.1.26"
conf="test->default" />
<dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.26"
conf="test->default" />
@@ -103,8 +96,9 @@
<!-- N.B. To use Gora SNAPSHOT's merely replace the 'ref' value with
the SNAPSHOT version
and add changing="true" alongside the dependency declaration. An
example has been
provided for the gora-core dependency as below -->
- <dependency org="org.apache.gora" name="gora-core" rev="0.4"
conf="*->default"/>
-
+ <dependency org="org.apache.gora" name="gora-shims-distribution"
rev="0.5" conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-core" rev="0.5"
conf="*->default" />
+
<!-- Uncomment this to use SQL as Gora backend. It should be noted
that the
gora-sql 0.1.1-incubating artifact is NOT compatable with gora-core
0.3. Users should
downgrade to gora-core 0.2.1 in order to use SQL as a backend. -->
@@ -116,9 +110,7 @@
<dependency org="mysql" name="mysql-connector-java" rev="5.1.18"
conf="*->default"/>
-->
<!-- Uncomment this to use HBase as Gora backend. -->
- <!--
- <dependency org="org.apache.gora" name="gora-hbase" rev="0.4"
conf="*->default" />
- -->
+ <dependency org="org.apache.gora" name="gora-hbase" rev="0.5"
conf="*->default" />
<!-- Uncomment this to use Accumulo as Gora backend. -->
<!--
<dependency org="org.apache.gora" name="gora-accumulo" rev="0.4"
conf="*->default" />
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
2014-10-07 18:17:52.138068668 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
2014-10-07 18:49:15.574084672 +0400
@@ -55,13 +55,7 @@
@Override
public void map(String key, WebPage page, Context context)
throws IOException, InterruptedException {
- if(Mark.GENERATE_MARK.checkMark(page) == null) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; not
generated yet");
- }
- return;
- }
-
+
String url = TableUtil.unreverseUrl(key);
scoreData.clear();
@@ -88,7 +82,8 @@
urlWithScore.setScore(Float.MAX_VALUE);
pageWritable.setWebPage(page);
nutchWritable.set(pageWritable);
- context.write(urlWithScore, nutchWritable);
+ // TODO AM: Temporary commented. It follows to loosing of main part of
Nutch functionality such as scoring, etc.
+ //context.write(urlWithScore, nutchWritable);
for (ScoreDatum scoreDatum : scoreData) {
String reversedOut = TableUtil.reverseUrl(scoreDatum.getUrl());
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/crawl/GeneratorReducer.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/crawl/GeneratorReducer.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/crawl/GeneratorReducer.java
2014-10-07 18:17:52.142068668 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/crawl/GeneratorReducer.java
2014-10-07 18:42:43.594081342 +0400
@@ -76,7 +76,7 @@
hostCountMap.put(hostordomain, hostCount + 1);
}
- Mark.GENERATE_MARK.putMark(page, batchId);
+ Mark.GENERATE_MARK.putMark(page, batchId == null ? new Utf8("1") :
batchId);
page.setBatchId(batchId);
try {
context.write(TableUtil.reverseUrl(key.url), page);
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/fetcher/FetcherReducer.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/fetcher/FetcherReducer.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/fetcher/FetcherReducer.java
2014-10-07 18:17:52.754068673 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/fetcher/FetcherReducer.java
2014-10-07 18:45:16.834082644 +0400
@@ -634,7 +634,7 @@
fit.page.setContentType(new Utf8(content.getContentType()));
fit.page.setBaseUrl(new Utf8(content.getBaseUrl()));
}
- Mark.FETCH_MARK.putMark(fit.page,
Mark.GENERATE_MARK.checkMark(fit.page));
+ Mark.FETCH_MARK.putMark(fit.page,
Mark.GENERATE_MARK.checkMark(fit.page) == null ? new Utf8("1") :
Mark.GENERATE_MARK.checkMark(fit.page));
String key = TableUtil.reverseUrl(fit.url);
if (parse) {
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/parse/ParseUtil.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/parse/ParseUtil.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/parse/ParseUtil.java
2014-10-07 18:17:52.326068669 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/parse/ParseUtil.java
2014-10-02 10:15:08.000000000 +0400
@@ -286,6 +286,8 @@
if (fetchMark != null) {
Mark.PARSE_MARK.putMark(page, fetchMark);
}
+ else // TODO AM temporary decision
+ Mark.PARSE_MARK.putMark(page, new Utf8("1"));
}
}
}
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/Host.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/Host.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/Host.java
2014-10-07 18:17:51.994068666 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/Host.java
2014-10-07 20:21:33.986131736 +0400
@@ -55,6 +55,8 @@
public String toString() {return name;}
};
+ public int getFieldsCount() { return Field.values().length; }
+
public static final String[] _ALL_FIELDS = {
"__g__dirty",
"metadata",
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/ParseStatus.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/ParseStatus.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/ParseStatus.java
2014-10-07 18:17:51.994068666 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/ParseStatus.java
2014-10-07 20:21:22.982131643 +0400
@@ -51,6 +51,8 @@
public String toString() {return name;}
};
+ public int getFieldsCount() { return Field.values().length; }
+
public static final String[] _ALL_FIELDS = {
"__g__dirty",
"majorCode",
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/ProtocolStatus.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/ProtocolStatus.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/ProtocolStatus.java
2014-10-07 18:17:51.994068666 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/ProtocolStatus.java
2014-10-07 20:21:01.962131464 +0400
@@ -54,6 +54,8 @@
public String toString() {return name;}
};
+ public int getFieldsCount() { return Field.values().length; }
+
public static final String[] _ALL_FIELDS = {
"__g__dirty",
"code",
diff -u -r -B
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/WebPage.java
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/WebPage.java
---
/home/alex/nutch2.x__originl/src/java/org/apache/nutch/storage/WebPage.java
2014-10-07 18:17:51.994068666 +0400
+++
/home/alex/nutch2.x__patched/src/java/org/apache/nutch/storage/WebPage.java
2014-10-07 20:20:10.846131030 +0400
@@ -72,6 +72,8 @@
public String toString() {return name;}
};
+ public int getFieldsCount() { return Field.values().length; }
+
public static final String[] _ALL_FIELDS = {
"__g__dirty",
"baseUrl",
===
letter_avro176_src_diff:
-----
diff -u -r
/home/alex/avro-1.7.6__originl/lang/java/avro/src/main/java/org/apache/avro/io/BinaryDecoder.java
/home/alex/avro-1.7.6__bak12/lang/java/avro/src/main/java/org/apache/avro/io/BinaryDecoder.java
---
/home/alex/avro-1.7.6__originl/lang/java/avro/src/main/java/org/apache/avro/io/BinaryDecoder.java
2014-09-25 12:43:08.000000000 +0400
+++
/home/alex/avro-1.7.6__bak12/lang/java/avro/src/main/java/org/apache/avro/io/BinaryDecoder.java
2014-09-25 17:30:54.000000000 +0400
@@ -50,6 +50,7 @@
private int minPos = 0;
private int pos = 0;
private int limit = 0;
+ public static String log = "";
byte[] getBuf() { return buf; }
int getPos() { return pos; }
@@ -149,9 +150,9 @@
}
}
pos += len;
- if (pos > limit) {
- throw new EOFException();
- }
+ //if (pos > limit) {
+ // throw new EOFException();
+ //}
return (n >>> 1) ^ -(n & 1); // back to two's-complement
}
@@ -186,9 +187,9 @@
} else {
l = n;
}
- if (pos > limit) {
- throw new EOFException();
- }
+ //if (pos > limit) {
+ // throw new EOFException();
+ //}
return (l >>> 1) ^ -(l & 1); // back to two's-complement
}
@@ -231,9 +232,9 @@
int len = 1;
int n = (buf[pos] & 0xff) | ((buf[pos + len++] & 0xff) << 8)
| ((buf[pos + len++] & 0xff) << 16) | ((buf[pos + len++] & 0xff)
<< 24);
- if ((pos + 4) > limit) {
- throw new EOFException();
- }
+ //if ((pos + 4) > limit) {
+ // throw new EOFException();
+ //}
pos += 4;
return Float.intBitsToFloat(n);
}
@@ -332,16 +333,23 @@
*/
protected void doReadBytes(byte[] bytes, int start, int length)
throws IOException {
+ log = log + " BinaryDecoder, doReadBytes(): length=" + length + ",
start=" + start + ", pos=" + pos + ", limit=" + limit + ", bytes.length=" +
(bytes == null ? "null" : bytes.length) + "";
if (length < 0)
throw new AvroRuntimeException("Malformed data. Length is negative: "
+ length);
int remaining = limit - pos;
+ if (length==0 && start==0 && pos==1 && limit==0 && ((bytes == null) ||
(bytes.length==0)))
+ return;
if (length <= remaining) {
System.arraycopy(buf, pos, bytes, start, length);
pos += length;
} else {
// read the rest of the buffer
+ try {
System.arraycopy(buf, pos, bytes, start, remaining);
+ } catch (java.lang.ArrayIndexOutOfBoundsException ex) {
+ return;
+ }
start += remaining;
length -= remaining;
pos = limit;
@@ -466,11 +474,20 @@
*/
private void ensureBounds(int num) throws IOException {
int remaining = limit - pos;
+ log = log + " BinaryDecoder, doReadBytes(): num=" + num + ",
remaining=" + remaining + ", pos=" + pos + ", minPos=" + minPos + ",
limit=" + limit + ", buf.length=" + (buf == null ? "null" : buf.length) +
"";
if (remaining < num) {
+
+ //if (remaining==-1 && pos==1 && minPos==0 && limit==0 && ((buf ==
null) || (buf.length==0) || (buf.length==8192)))
+ // return;
+
// move remaining to front
+ try {
source.compactAndFill(buf, pos, minPos, remaining);
- if (pos >= limit)
- throw new EOFException();
+ } catch (java.lang.ArrayIndexOutOfBoundsException ex) {
+ return;
+ }
+ //if (pos >= limit)
+ // throw new EOFException();
}
}
diff -u -r
/home/alex/avro-1.7.6__originl/lang/java/avro/src/main/java/org/apache/avro/util/ByteBufferInputStream.java
/home/alex/avro-1.7.6__bak12/lang/java/avro/src/main/java/org/apache/avro/util/ByteBufferInputStream.java
---
/home/alex/avro-1.7.6__originl/lang/java/avro/src/main/java/org/apache/avro/util/ByteBufferInputStream.java
2014-09-25 12:43:08.000000000 +0400
+++
/home/alex/avro-1.7.6__bak12/lang/java/avro/src/main/java/org/apache/avro/util/ByteBufferInputStream.java
2014-09-22 17:48:23.000000000 +0400
@@ -18,7 +18,7 @@
package org.apache.avro.util;
-import java.io.EOFException;
+//import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
@@ -37,7 +37,11 @@
* @throws EOFException if EOF is reached. */
@Override
public int read() throws IOException {
- return getBuffer().get() & 0xff;
+ //return getBuffer().get() & 0xff;
+ ByteBuffer buffer = getBuffer();
+ if (buffer == null)
+ return -1;
+ return buffer.get() & 0xff;
}
/** @see InputStream#read(byte[], int, int)
@@ -46,6 +50,10 @@
public int read(byte[] b, int off, int len) throws IOException {
if (len == 0) return 0;
ByteBuffer buffer = getBuffer();
+
+ if (buffer == null)
+ return -1;
+
int remaining = buffer.remaining();
if (len > remaining) {
buffer.get(b, off, remaining);
@@ -61,6 +69,10 @@
public ByteBuffer readBuffer(int length) throws IOException {
if (length == 0) return ByteBuffer.allocate(0);
ByteBuffer buffer = getBuffer();
+
+ if (buffer == null)
+ return ByteBuffer.allocate(0);
+
if (buffer.remaining() == length) { // can return current
as-is?
current++;
return buffer; // return w/o copying
@@ -83,7 +95,8 @@
return buffer;
current++;
}
- throw new EOFException();
+ //throw new EOFException();
+ return null;
}
}
On Fri, Oct 3, 2014 at 2:44 PM, Alex Median <[email protected]> wrote:
>
> Hi,
>
> For about a month now I have been in the process of installing Nutch 2.3 in
> this configuration (see subject).
> Nutch 2 initially with Hadoop 1 was chosen a few months ago, some of the
> coding is already done.
> We chose Amazon AWS Elastic MapReduce (EMR) as a platform.
> Unfortunately, the EMR Hadoop 1 version on an old Debian does not suit us.
> Therefore, we need to set up Nutch 2 in exactly the above
> configuration:
> Hadoop 2.4.0 + HBase 0.94.18 (Amazon Linux: AMI version:3.2.1, Hadoop
> distribution:Amazon 2.4.0, Applications:HBase 0.94.18)
>
> But info about other experiences with Nutch2+hadoop2 would also be good.
>
> What has been done for the last iteration of the installation on local
> computer:
>
> 1. Nutch 2.x
> 1.1 svn current 2.x version
> 1.2. prepared scripts:
> 1.2.1 ivy:
> <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.4.0">..
> <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core"
> rev="2.4.0">..
> <dependency org="org.apache.gora" name="gora" rev="0.5" conf="*->default"
> />
> <dependency org="org.apache.gora" name="gora-hbase" rev="0.5"
> conf="*->default" />
> etc.
> 1.2.2 default.properties:
> hadoop.version=2.4.0
> version=2.3-SNAPSHOT
> etc.
> 1.3. added public int getFieldsCount() { return Field.values().length; }
> to ProtocolStatus.java, ParseStatus.java, Host.java, WebPage.java.
>
> 2. HBase
> 2.1 svn HBase 0.94.18
> 2.2 prepared for Protobuf 2.5.0 [1], also thanks to Dobromyslov [5]
> 2.3 also generated hbase-0.94.18-hadoop-2.4.0.jar
>
> 3. Gora 0.5 (also was tested for versions 0.4, 0.6-SNAPSHOT, and 0.5.3
> from com.argonio.gora)
>
> 4. Avro 1.7.6 (also played with versions 1.7.4, 1.7.7)
> 4.1 svn
> 4.2 patched for AVRO-813[2]
> 4.3 patched for AVRO-882[3] and rollbacked
> 4.4 patched as mentioned in [4] - commented throwing EOFException against
> org.apache.avro.io.BinaryDecoder.ensureBounds(BinaryDecoder.java:473), etc.
>
> After investigating numerous exceptions over many weeks, a number of changes
> have been made to the Nutch 2.x and Avro 1.7.6 code to suppress the
> exceptions and get a little further. We had some success — Nutch appears to
> run to some extent, but it is unstable and incorrect. All of the stages we
> need pass in a cycle (inject, generate, fetch, parse, updatedb), but some
> functionality is broken or ignored.
> It seems that, because of our limited Nutch/Hadoop/HBase experience, we broke
> the normal data exchange between Nutch and HBase (also with Gora and Avro).
> Perhaps some of the fields (and/or some of the data formats) read and
> write incorrectly. For example, many markers are lost and temporary
> emulated in code to pass through the steps; data in batchId field are lost;
> scoring is broken also.
>
> Please help us! Perhaps there are somewhere the necessary working
> assemblies and/or scripts and patches. Maybe someone has a positive
> experience in this. I'm ready to publish all my diffs and exception traces.
> Also, I would be very grateful if someone could tell me when we can expect a
> new Nutch 2.3 release; it seems that it will be Hadoop2-compatible.
>
> [1] http://hbase.apache.org/book/configuration.html
> [2] https://issues.apache.org/jira/browse/AVRO-813
> [3] https://issues.apache.org/jira/browse/AVRO-882
> http://mail-archives.apache.org/mod_mbox/avro-user/201108.mbox/%3ccaanh3_9_cqqbmt4vqyzg8-ikfo4nnlpcuzbbwd4kqoavpek...@mail.gmail.com%3E
> [4]
> http://mail-archives.apache.org/mod_mbox/nutch-user/201409.mbox/%3cCAEmTxX9HrRM00SxerFAdRdZy=wVAd9xCchDTuLaxPQ=wi0q...@mail.gmail.com%3e
> [5]
> http://stackoverflow.com/questions/13946725/configuring-hbase-standalone-mode-with-apache-nutch-java-lang-illegalargumente
> https://github.com/dobromyslov
>
> BR,
> Alex Median
>