Author: jukka
Date: Sun Sep 14 12:02:32 2008
New Revision: 695266
URL: http://svn.apache.org/viewvc?rev=695266&view=rev
Log:
TIKA-114: PDFParser : Getting content of the document using "writer.ToString ()" , some words are stuck together
Patch by Dave Meikle. Modified: incubator/tika/trunk/CHANGES.txt incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Modified: incubator/tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=695266&r1=695265&r2=695266&view=diff ============================================================================== --- incubator/tika/trunk/CHANGES.txt (original) +++ incubator/tika/trunk/CHANGES.txt Sun Sep 14 12:02:32 2008 @@ -81,6 +81,10 @@ 34. TIKA-54 - Outlook msg parser (Rida Benjelloun, Dave Meikle & Jukka Zitting) +35. TIKA-114 - PDFParser : Getting content of the document using + "writer.ToString ()" , some words are stuck together + (Dave Meikle) + Release 0.1-incubating - 12/27/2007 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann) Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=695266&r1=695265&r2=695266&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sun Sep 14 12:02:32 2008 @@ -39,7 +39,7 @@ /** * Converts the given PDF document (and related metadata) to a stream * of XHTML SAX events sent to the given content handler. - * + * * @param document PDF document * @param handler SAX content handler * @param metadata PDF metadata @@ -124,21 +124,45 @@ } } - protected void processLineSeparator(TextPosition p) throws IOException { - try { - handler.characters("\n"); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to write a newline", e); + // Two methods added to work around lack of support for processWordSeparator + // and processLineSeparator in PDFBox-0.7.3. 
This is fixed in CVS Head (PDFBox-0.7.4) + public String getWordSeparator() + { + try + { + handler.characters(" "); + } catch(SAXException e) { + } + return super.getWordSeparator(); //To change body of overridden methods use File | Settings | File Templates. } - protected void processWordSeparator(TextPosition a, TextPosition b) - throws IOException { - try { - handler.characters(" "); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to write a space", e); + public String getLineSeparator() + { + try + { + handler.characters("\n"); + } catch(SAXException e) { + } + return super.getLineSeparator(); } +// protected void processLineSeparator(TextPosition p) throws IOException { +// try { +// handler.characters("\n"); +// } catch (SAXException e) { +// throw new IOExceptionWithCause("Unable to write a newline", e); +// } +// } +// +// protected void processWordSeparator(TextPosition a, TextPosition b) +// throws IOException { +// try { +// handler.characters(" "); +// } catch (SAXException e) { +// throw new IOExceptionWithCause("Unable to write a space", e); +// } +// } + }