[ https://issues.apache.org/jira/browse/TIKA-1098?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14181630#comment-14181630 ]
Andreas Lehmkühler commented on TIKA-1098: ------------------------------------------ I've finally solved PDFBOX-1273. The fix will be part of the upcoming versions 1.8.8 and 2.0.0. Thanks for your patience :-) > not able to parse pdfs/docs/ppts using 1.1 tika parser > -------------------------------------------------------- > > Key: TIKA-1098 > URL: https://issues.apache.org/jira/browse/TIKA-1098 > Project: Tika > Issue Type: Bug > Components: parser > Affects Versions: 1.1 > Environment: linux redhat > Reporter: Qian Diao > Attachments: url_1763_approx-alg-notes.pdf > > > Hi, > I got some parsing problems when using Tika 1.1 for the attached pdf file. > my code (Test.java): > import java.io.File; > import java.io.InputStream; > import java.io.FileInputStream; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.parser.AutoDetectParser; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.Parser; > import org.apache.tika.parser.html.BoilerpipeContentHandler; > import org.apache.tika.sax.BodyContentHandler; > import org.apache.tika.parser.html.HtmlParser; > import de.l3s.boilerpipe.extractors.ArticleExtractor; > public class Test { > private static final String validBoilerpipeFilenameRegEx = > ".*(\\.)(htm|html|shtml|php|asp|aspx)$"; > public String parseFile(File inFile) { > if (inFile == null || !inFile.isFile() || !inFile.canRead()) > return null; > > InputStream is = null; > String outputText = ""; > try { > // Open input stream > is = new FileInputStream(inFile); > // Prepare parser > BodyContentHandler contenthandler = new > BodyContentHandler(-1); > Metadata metadata = new Metadata(); > metadata.set(Metadata.RESOURCE_NAME_KEY, inFile.getName()); > ParseContext pc = new ParseContext(); > // Call parse with boilerpipe if valid boilerpipe extension; > otherwise, call regular parse. 
> if (!inFile.getName().matches(validBoilerpipeFilenameRegEx)) { > Parser parser = new AutoDetectParser(); > parser.parse(is, contenthandler, metadata, pc); > } > else { > Parser parser = new HtmlParser(); > BoilerpipeContentHandler bh = new > BoilerpipeContentHandler(contenthandler, new ArticleExtractor()); > parser.parse(is, bh, metadata, pc); > } > // Prepare text for write > outputText = contenthandler.toString(); > } catch (Exception e) { > System.out.println(e); > return null; > } finally { > try { > if (is != null) > is.close(); > } catch (Exception e) {} > } > > return outputText; > } > =====output==== > org.apache.tika.exception.TikaException: Unable to extract PDF content > url_1763_approx-alg-notes.pdf -- This message was sent by Atlassian JIRA (v6.3.4#6332)