The parser hung there and waited... this is my conf ...
<value>language-identifier|protocol-http|urlfilter-(suffix|regex)|parse-(rss|swf|tika)|index-(basic|anchor|more)|query-(basic|more|custom|site|url)|response-(json|xml)|summary-(basic|lucene)|scoring-opic|urlnormalizer-(pass|regex|basic)</value> this is debug log 2010-07-14 15:10:18,703 DEBUG tika.TikaParser - Using Tika parser org.apache.tika.parser.pdf.PDFParser for mime-type application/pdf 2010-07-14 15:10:18,771 ERROR tika.TikaParser - Error parsing http://www.amway.com.tr/_fileserver/item/11781 java.io.IOException: expected='endstream' actual='' org.apache.pdfbox.io.pushbackinputstr...@5f790458 at org.apache.pdfbox.pdfparser.BaseParser.parseCOSStream(BaseParser.java:380) at org.apache.pdfbox.pdfparser.PDFParser.parseObject(PDFParser.java:528) at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:179) at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:847) at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:814) at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:63) at org.apache.nutch.parse.tika.TikaParser.getParse(TikaParser.java:95) at org.apache.nutch.parse.ParseUtil.parse(ParseUtil.java:82) at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:85) at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:41) at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50) at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:358) at org.apache.hadoop.mapred.MapTask.run(MapTask.java:307) at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:177) 2010-07-14 15:10:18,772 WARN parse.Parser - Error parsing: http://www.amway.com.tr/_fileserver/item/11781: failed(2,0): expected='endstream' actual='' org.apache.pdfbox.io.pushbackinputstr...@5f790458 2010-07-14 15:10:18,773 DEBUG parse.ParseUtil - Parsing [http://www.amway.com.tr/_fileserver/item/11934] with [org.apache.nutch.parse.tika.tikapar...@557485ac] 2010-07-14 15:10:18,773 DEBUG tika.TikaParser - Using Tika parser 
org.apache.tika.parser.pdf.PDFParser for mime-type application/pdf 2010-07-14 15:10:18,833 ERROR tika.TikaParser - Error parsing http://www.amway.com.tr/_fileserver/item/11934 java.io.IOException: expected='endstream' actual='' org.apache.pdfbox.io.pushbackinputstr...@7a07f1ac at org.apache.pdfbox.pdfparser.BaseParser.parseCOSStream(BaseParser.java:380) at org.apache.pdfbox.pdfparser.PDFParser.parseObject(PDFParser.java:528) at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:179) at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:847) at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:814) at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:63) at org.apache.nutch.parse.tika.TikaParser.getParse(TikaParser.java:95) at org.apache.nutch.parse.ParseUtil.parse(ParseUtil.java:82) at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:85) at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:41) at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50) at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:358) at org.apache.hadoop.mapred.MapTask.run(MapTask.java:307) at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:177) 2010-07-14 15:10:18,833 WARN parse.Parser - Error parsing: http://www.amway.com.tr/_fileserver/item/11934: failed(2,0): expected='endstream' actual='' org.apache.pdfbox.io.pushbackinputstr...@7a07f1ac 2010-07-14 15:10:18,834 DEBUG parse.ParseUtil - Parsing [http://www.amway.com.tr/_fileserver/item/11935] with [org.apache.nutch.parse.tika.tikapar...@557485ac] 2010-07-14 15:10:18,835 DEBUG tika.TikaParser - Using Tika parser org.apache.tika.parser.pdf.PDFParser for mime-type application/pdf 2010-07-14 15:10:18,856 ERROR tika.TikaParser - Error parsing http://www.amway.com.tr/_fileserver/item/11935 java.io.IOException: expected='endstream' actual='' org.apache.pdfbox.io.pushbackinputstr...@410b69b at org.apache.pdfbox.pdfparser.BaseParser.parseCOSStream(BaseParser.java:380) 
at org.apache.pdfbox.pdfparser.PDFParser.parseObject(PDFParser.java:528) at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:179) at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:847) at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:814) at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:63) at org.apache.nutch.parse.tika.TikaParser.getParse(TikaParser.java:95) at org.apache.nutch.parse.ParseUtil.parse(ParseUtil.java:82) at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:85) at org.apache.nutch.parse.ParseSegment.map(ParseSegment.java:41) at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50) at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:358) at org.apache.hadoop.mapred.MapTask.run(MapTask.java:307) at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:177) 2010-07-14 15:10:18,857 WARN parse.Parser - Error parsing: http://www.amway.com.tr/_fileserver/item/11935: failed(2,0): expected='endstream' actual='' org.apache.pdfbox.io.pushbackinputstr...@410b69b 2010-07-14 15:10:18,859 WARN parse.ParserFactory - ParserFactory: Plugin: org.apache.nutch.parse.zip.ZipParser mapped to contentType application/zip via parse-plugins.xml, but not enabled via plugin.includes in nutch-default.xml 2010-07-14 15:10:18,859 INFO parse.ParserFactory - The parsing plugins: [org.apache.nutch.parse.tika.Parser] are enabled via the plugin.includes system property, and all claim to support the content type *, but they are not mapped to it in the parse-plugins.xml file 2010-07-14 15:10:18,859 DEBUG parse.ParseUtil - Parsing [http://www.amway.com.tr/_fileserver/item/12037] with [org.apache.nutch.parse.tika.tikapar...@557485ac] 2010-07-14 15:10:18,859 DEBUG tika.TikaParser - Using Tika parser org.apache.tika.parser.pkg.PackageParser for mime-type application/zip -- View this message in context: http://lucene.472066.n3.nabble.com/parse-step-hangs-tp961720p966289.html Sent from the Nutch - User mailing list 
archive at Nabble.com.