Author: lehmi Date: Wed Apr 20 15:58:55 2016 New Revision: 1740161 URL: http://svn.apache.org/viewvc?rev=1740161&view=rev Log: PDFBOX-3281: ignore encoding parameter when writing html output
Modified: pdfbox/branches/2.0/ (props changed) pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java Propchange: pdfbox/branches/2.0/ ------------------------------------------------------------------------------ --- svn:mergeinfo (original) +++ svn:mergeinfo Wed Apr 20 15:58:55 2016 @@ -1,3 +1,3 @@ /pdfbox/branches/no-awt:1618517-1621410 /pdfbox/no-awt:1618514-1618516 -/pdfbox/trunk:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755 +/pdfbox/trunk:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160 Modified: pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java?rev=1740161&r1=1740160&r2=1740161&view=diff ============================================================================== --- pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java (original) +++ pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Wed Apr 20 15:58:55 2016 @@ -49,8 +49,9 @@ public final class ExtractText private static final String SORT = "-sort"; private static final String IGNORE_BEADS = "-ignoreBeads"; private static final String DEBUG = "-debug"; - // jjb - added simple HTML output private static final String HTML = "-html"; + + private static final String STD_ENCODING = "UTF-8"; /* * debug flag @@ -93,7 +94,7 @@ public final class ExtractText boolean sort = false; boolean separateBeads = true; String password = ""; - String encoding = "UTF-8"; + String encoding = STD_ENCODING; String pdfFile = null; String outputFile = null; // Defaults to text files @@ -204,6 +205,11 @@ public final class ExtractText } else { + if (toHTML && !STD_ENCODING.equals(encoding)) + { + encoding = STD_ENCODING; + System.out.println("The encoding parameter is ignored when writing html output."); + } output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding ); } Modified: pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java?rev=1740161&r1=1740160&r2=1740161&view=diff ============================================================================== --- pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java (original) +++ pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java Wed Apr 20 15:58:55 2016 @@ -39,7 +39,6 @@ public class PDFText2HTML extends PDFTex { private static final int INITIAL_PDF_TO_HTML_BYTES = 8192; - private boolean onFirstPage = true; private final FontState fontState = new FontState(); /** @@ -64,34 +63,26 @@ public class PDFText2HTML extends PDFTex * * @throws IOException * If there is a problem writing out the header to the document. + * @deprecated deprecated, use {@link #startDocument(PDDocument)} */ protected void writeHeader() throws IOException { + } + + @Override + protected void startDocument(PDDocument document) throws IOException + { StringBuilder buf = new StringBuilder(INITIAL_PDF_TO_HTML_BYTES); buf.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" + "\n" + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"); buf.append("<html><head>"); buf.append("<title>").append(escape(getTitle())).append("</title>\n"); - buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=\"UTF-16\">\n"); + buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=\"UTF-8\">\n"); buf.append("</head>\n"); buf.append("<body>\n"); super.writeString(buf.toString()); } - - /** - * {@inheritDoc} - */ - @Override - protected void writePage() throws IOException - { - if (onFirstPage) - { - writeHeader(); - onFirstPage = false; - } - super.writePage(); - } - + /** * {@inheritDoc} */