Author: lehmi
Date: Fri May  5 16:00:21 2017
New Revision: 1794069

URL: http://svn.apache.org/viewvc?rev=1794069&view=rev
Log:
PDFBOX-3347: fallback to ISO-8859-1 for names with invalid UTF-8

Modified:
    pdfbox/branches/2.0/   (props changed)
    pdfbox/branches/2.0/pdfbox/   (props changed)
    
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java

Propchange: pdfbox/branches/2.0/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Fri May  5 16:00:21 2017
@@ -1,3 +1,3 @@
 /pdfbox/branches/no-awt:1618517-1621410
 /pdfbox/no-awt:1618514-1618516
-/pdfbox/trunk:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160,1742437,1742442,1745595,1745606,1745772,1745774,1745776,1745779,1746032,1746151,1749162,1749165,1749432,1766088,1766213,1768061,1770985,1770988,1772528,1778172,1782679,1786586,1786603,1787546,1790745
+/pdfbox/trunk:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160,1742437,1742442,1743248,1745595,1745606,1745772,1745774,1745776,1745779,1746032,1746151,1749162,1749165,1749432,1766088,1766213,1768061,1770985,1770988,1772528,1778172,1782679,1786586,1786603,1787546,1790745

Propchange: pdfbox/branches/2.0/pdfbox/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Fri May  5 16:00:21 2017
@@ -1,3 +1,3 @@
 /pdfbox/branches/no-awt/pdfbox:1618517-1621410
 /pdfbox/no-awt/pdfbox:1618514-1618516
-/pdfbox/trunk/pdfbox:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160,1742437,1742442,1745595,1745606,1745772,1745774,1745776,1745779,1746032,1746151,1749162,1749165,1749432,1757165,1758817,1770988,1772528,1778172,1782679,1786586,1786603,1787546,1790745
+/pdfbox/trunk/pdfbox:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160,1742437,1742442,1743248,1745595,1745606,1745772,1745774,1745776,1745779,1746032,1746151,1749162,1749165,1749432,1757165,1758817,1770988,1772528,1778172,1782679,1786586,1786603,1787546,1790745

Modified: 
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1794069&r1=1794068&r2=1794069&view=diff
==============================================================================
--- 
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
 (original)
+++ 
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
 Fri May  5 16:00:21 2017
@@ -18,6 +18,9 @@ package org.apache.pdfbox.pdfparser;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CharsetDecoder;
 import java.util.Arrays;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -779,11 +782,39 @@ public abstract class BaseParser
         {
             seqSource.unread(c);
         }
-        String string = new String(buffer.toByteArray(), Charsets.UTF_8);
+        
+        byte[] bytes = buffer.toByteArray();
+        String string;
+        if (isValidUTF8(bytes))
+        {
+            string = new String(buffer.toByteArray(), Charsets.UTF_8);
+        }
+        else
+        {
+            // some malformed PDFs don't use UTF-8 see PDFBOX-3347
+            string = new String(buffer.toByteArray(), Charsets.ISO_8859_1);
+        }
         return COSName.getPDFName(string);
     }
 
     /**
+     * Returns true if a byte sequence is valid UTF-8.
+     */
+    private boolean isValidUTF8(byte[] input)
+    {
+        CharsetDecoder cs = Charsets.UTF_8.newDecoder();
+        try
+        {
+            cs.decode(ByteBuffer.wrap(input));
+            return true;
+        }
+        catch (CharacterCodingException e)
+        {
+            return false;
+        }
+    }
+    
+    /**
      * This will parse a boolean object from the stream.
      *
      * @return The parsed boolean object.


Reply via email to