I am trying to extract text from
https://hal.archives-ouvertes.fr/pastel-00003992/document
using PDFTextStripper (PDFBox v1.8.8).
I can read this document (263 pages) in Adobe Reader on Mac OS X, but PDFBox
gives the following output:
495 [main] INFO org.apache.pdfbox.pdfparser.PDFParser - Document is
encrypted
656 [main] ERROR org.apache.pdfbox.filter.FlateFilter - FlateFilter: stop
reading corrupt stream due to a DataFormatException
[8 repeats snipped]
656 [main] ERROR org.apache.pdfbox.filter.FlateFilter - FlateFilter: stop
reading corrupt stream due to a DataFormatException
java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.xmlcml.args.DefaultArgProcessor.runRunMethod(DefaultArgProcessor.java:597)
at
org.xmlcml.args.DefaultArgProcessor.runRunMethodsOnChosenArgOptions(DefaultArgProcessor.java:463)
at
org.xmlcml.args.DefaultArgProcessor.runAndOutput(DefaultArgProcessor.java:663)
at org.xmlcml.norma.Norma.run(Norma.java:28)
at org.xmlcml.norma.Prototypes.runHalThesis1(Prototypes.java:11)
at org.xmlcml.norma.Prototypes.main(Prototypes.java:6)
Caused by: java.lang.RuntimeException: Cannot transform PDF
examples/theses/HalThesis1/fulltext.pdf
at
org.xmlcml.norma.NormaTransformer.applyPDF2TXTToCMLDir(NormaTransformer.java:87)
at org.xmlcml.norma.NormaTransformer.transform(NormaTransformer.java:69)
at
org.xmlcml.norma.NormaArgProcessor.transform(NormaArgProcessor.java:172)
... 10 more
Caused by: java.io.IOException
at org.apache.pdfbox.filter.FlateFilter.decode(FlateFilter.java:109)
at org.apache.pdfbox.cos.COSStream.doDecode(COSStream.java:379)
at org.apache.pdfbox.cos.COSStream.doDecode(COSStream.java:291)
at
org.apache.pdfbox.cos.COSStream.getUnfilteredStream(COSStream.java:225)
at
org.apache.pdfbox.pdmodel.common.COSStreamArray.getUnfilteredStream(COSStreamArray.java:197)
at
org.apache.pdfbox.pdfparser.PDFStreamParser.<init>(PDFStreamParser.java:117)
at
org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:251)
at
org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235)
at
org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215)
at
org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460)
at
org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385)
at
org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344)
at
org.apache.pdfbox.util.PDFTextStripper.getText(PDFTextStripper.java:275)
at
org.xmlcml.norma.input.pdf.PDF2TXTConverter.readPDF(PDF2TXTConverter.java:19)
at
org.xmlcml.norma.NormaTransformer.applyPDF2TXTToCMLDir(NormaTransformer.java:85)
... 12 more
Caused by: java.util.zip.DataFormatException: unknown compression method
at java.util.zip.Inflater.inflateBytes(Native Method)
at java.util.zip.Inflater.inflate(Inflater.java:259)
at java.util.zip.Inflater.inflate(Inflater.java:280)
at org.apache.pdfbox.filter.FlateFilter.decompress(FlateFilter.java:128)
at org.apache.pdfbox.filter.FlateFilter.decode(FlateFilter.java:101)
... 26 more
Exception in thread "main" java.lang.RuntimeException: cannot process
argument: --xsl (DataFormatException: unknown compression method)
at
org.xmlcml.args.DefaultArgProcessor.runRunMethodsOnChosenArgOptions(DefaultArgProcessor.java:466)
at
org.xmlcml.args.DefaultArgProcessor.runAndOutput(DefaultArgProcessor.java:663)
at org.xmlcml.norma.Norma.run(Norma.java:28)
at org.xmlcml.norma.Prototypes.runHalThesis1(Prototypes.java:11)
at org.xmlcml.norma.Prototypes.main(Prototypes.java:6)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.xmlcml.args.DefaultArgProcessor.runRunMethod(DefaultArgProcessor.java:597)
at
org.xmlcml.args.DefaultArgProcessor.runRunMethodsOnChosenArgOptions(DefaultArgProcessor.java:463)
... 4 more
Caused by: java.lang.RuntimeException: Cannot transform PDF
examples/theses/HalThesis1/fulltext.pdf
at
org.xmlcml.norma.NormaTransformer.applyPDF2TXTToCMLDir(NormaTransformer.java:87)
at org.xmlcml.norma.NormaTransformer.transform(NormaTransformer.java:69)
at
org.xmlcml.norma.NormaArgProcessor.transform(NormaArgProcessor.java:172)
... 10 more
Caused by: java.io.IOException
at org.apache.pdfbox.filter.FlateFilter.decode(FlateFilter.java:109)
at org.apache.pdfbox.cos.COSStream.doDecode(COSStream.java:379)
at org.apache.pdfbox.cos.COSStream.doDecode(COSStream.java:291)
at
org.apache.pdfbox.cos.COSStream.getUnfilteredStream(COSStream.java:225)
at
org.apache.pdfbox.pdmodel.common.COSStreamArray.getUnfilteredStream(COSStreamArray.java:197)
at
org.apache.pdfbox.pdfparser.PDFStreamParser.<init>(PDFStreamParser.java:117)
at
org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:251)
at
org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235)
at
org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215)
at
org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460)
at
org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385)
at
org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344)
at
org.apache.pdfbox.util.PDFTextStripper.getText(PDFTextStripper.java:275)
at
org.xmlcml.norma.input.pdf.PDF2TXTConverter.readPDF(PDF2TXTConverter.java:19)
at
org.xmlcml.norma.NormaTransformer.applyPDF2TXTToCMLDir(NormaTransformer.java:85)
... 12 more
Caused by: java.util.zip.DataFormatException: unknown compression method
at java.util.zip.Inflater.inflateBytes(Native Method)
at java.util.zip.Inflater.inflate(Inflater.java:259)
at java.util.zip.Inflater.inflate(Inflater.java:280)
at org.apache.pdfbox.filter.FlateFilter.decompress(FlateFilter.java:128)
at org.apache.pdfbox.filter.FlateFilter.decode(FlateFilter.java:101)
... 26 more
Is this a problem with encryption, a broken PDF that Adobe can somehow
read, or some other problem?
Many thanks
--
Peter Murray-Rust
Reader in Molecular Informatics
Unilever Centre, Dep. Of Chemistry
University of Cambridge
CB2 1EW, UK
+44-1223-763069