Author: lehmi
Date: Sun Jan 21 13:15:31 2024
New Revision: 1915351

URL: http://svn.apache.org/viewvc?rev=1915351&view=rev
Log:
PDFBOX-5704: detect and fix a wrong CID font subtype; inspired by the 
implementation in pdf.js, as proposed by Mike Li on dev@

Modified:
    
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java

Modified: 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java?rev=1915351&r1=1915350&r2=1915351&view=diff
==============================================================================
--- 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java
 (original)
+++ 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java
 Sun Jan 21 13:15:31 2024
@@ -17,9 +17,16 @@
 package org.apache.pdfbox.pdmodel.font;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
 
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.io.RandomAccessRead;
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.LogManager;
 import org.apache.pdfbox.pdmodel.ResourceCache;
@@ -32,6 +39,12 @@ public final class PDFontFactory
 {
     private static final Logger LOG = 
LogManager.getLogger(PDFontFactory.class);
 
+    // name used for CFF based font programs; FontType maps it to CIDFontType0
+    private static final String FONT_TYPE1C = "Type1C";
+    // four character tags which are compared with the first four bytes of a font program
+    private static final String FONT_OPEN_TYPE = "OTTO";
+    private static final String FONT_TTF_COLLECTION = "ttcf";
+    private static final String FONT_TRUE_TYPE = "true";
+    // binary signature 00 01 00 00 which isTrueTypeFile accepts in addition to the "true" tag
+    private static final byte[] TTF_HEADER = new byte[] { 0, 1, 0, 0 };
+
     private PDFontFactory()
     {
     }
@@ -48,6 +61,58 @@ public final class PDFontFactory
         return createFont(dictionary, null);
     }
 
+    /**
+     * Holds a font type together with the CID font subtype which corresponds to the kind of
+     * the embedded font program.
+     */
+    private static class FontType
+    {
+        // names of font programs which are mapped to CIDFontType0
+        private static final List<String> cidType0Types //
+                = Arrays.asList(COSName.TYPE1.getName(), FONT_TYPE1C);
+        // names of font programs which are mapped to CIDFontType2
+        private static final List<String> cidType2Types //
+                = Arrays.asList(COSName.TRUE_TYPE.getName(), COSName.OPEN_TYPE.getName());
+
+        private final COSName type;
+        private final COSName subtype;
+
+        /**
+         * Creates a font type whose subtype is derived from the name of the font program.
+         *
+         * @param type the font type
+         * @param subtypeString the name of the kind of font program; unknown names lead to a
+         * null subtype
+         */
+        public FontType(COSName type, String subtypeString)
+        {
+            this(type, toCidSubtype(subtypeString));
+        }
+
+        /**
+         * Creates a font type with the given subtype.
+         *
+         * @param type the font type
+         * @param subtype the subtype, may be null
+         */
+        public FontType(COSName type, COSName subtype)
+        {
+            this.type = type;
+            this.subtype = subtype;
+        }
+
+        /**
+         * Creates a font type without a subtype.
+         *
+         * @param type the font type
+         */
+        public FontType(COSName type)
+        {
+            this(type, (COSName) null);
+        }
+
+        /**
+         * @return the subtype, or null if there isn't any
+         */
+        public COSName getSubtype()
+        {
+            return subtype;
+        }
+
+        /**
+         * Checks whether this is a Type0 font with the given CID subtype.
+         *
+         * @param cidSubtype the CID subtype to compare with
+         * @return true if the type is Type0 and the subtype is present and equal to the given one
+         */
+        public boolean isCIDSubtype(COSName cidSubtype)
+        {
+            return COSName.TYPE0.equals(type) && subtype != null && subtype.equals(cidSubtype);
+        }
+
+        // maps the name of a kind of font program to the CID font subtype, null if unknown
+        private static COSName toCidSubtype(String subtypeString)
+        {
+            if (cidType0Types.contains(subtypeString))
+            {
+                return COSName.CID_FONT_TYPE0;
+            }
+            if (cidType2Types.contains(subtypeString))
+            {
+                return COSName.CID_FONT_TYPE2;
+            }
+            return null;
+        }
+    }
+
     /**
      * Creates a new PDFont instance with the appropriate subclass.
      *
@@ -93,6 +158,18 @@ public final class PDFontFactory
         }
         else if (COSName.TYPE0.equals(subType))
         {
+            COSDictionary fontDescriptor = getFontDescriptor(dictionary);
+            FontType fontTypeFromFont = getFontTypeFromFont(fontDescriptor, 
subType);
+            if (fontTypeFromFont != null)
+            {
+                COSDictionary descendantFont = getDescendantFont(dictionary);
+                COSName descFontType = descendantFont != null
+                        ? descendantFont.getCOSName(COSName.SUBTYPE) : null;
+                if (descFontType != null && 
!fontTypeFromFont.isCIDSubtype(descFontType))
+                {
+                    fixType0Subtype(descendantFont, fontDescriptor, 
fontTypeFromFont.getSubtype());
+                }
+            }
             return new PDType0Font(dictionary);
         }
         else if (COSName.CID_FONT_TYPE0.equals(subType))
@@ -112,6 +189,171 @@ public final class PDFontFactory
         }
     }
 
+    /**
+     * Sets the subtype of the descendant font to the given value. If the embedded font program
+     * is stored under the key of the other CID font type (FontFile2 vs. FontFile3) and the
+     * target key is unused, the entry is moved to the target key.
+     *
+     * @param descendantFont the descendant font dictionary whose subtype is replaced
+     * @param fontDescriptor the font descriptor holding the embedded font program
+     * @param newSubType the subtype to be set
+     */
+    private static void fixType0Subtype(COSDictionary descendantFont,
+            COSDictionary fontDescriptor, COSName newSubType)
+    {
+        LOG.warn("Try to fix different descendant font types for font {}",
+                fontDescriptor.getNameAsString(COSName.FONT_NAME));
+        boolean hasFontFile2 = fontDescriptor.containsKey(COSName.FONT_FILE2);
+        boolean hasFontFile3 = fontDescriptor.containsKey(COSName.FONT_FILE3);
+        if (COSName.CID_FONT_TYPE0.equals(newSubType))
+        {
+            if (hasFontFile2 && !hasFontFile3)
+            {
+                // move FontFile2 to FontFile3
+                COSBase fontFile = fontDescriptor.getItem(COSName.FONT_FILE2);
+                fontDescriptor.setItem(COSName.FONT_FILE3, fontFile);
+                fontDescriptor.removeItem(COSName.FONT_FILE2);
+            }
+        }
+        else if (COSName.CID_FONT_TYPE2.equals(newSubType))
+        {
+            if (hasFontFile3 && !hasFontFile2)
+            {
+                // move FontFile3 to FontFile2
+                COSBase fontFile = fontDescriptor.getItem(COSName.FONT_FILE3);
+                fontDescriptor.setItem(COSName.FONT_FILE2, fontFile);
+                fontDescriptor.removeItem(COSName.FONT_FILE3);
+            }
+        }
+        descendantFont.setItem(COSName.SUBTYPE, newSubType);
+    }
+
+    /**
+     * Determines the font type by inspecting the first bytes of the embedded font program.
+     *
+     * @param fontDescriptor the font descriptor holding the embedded font program, may be null
+     * @param fontType the subtype of the font dictionary, may be null
+     * @return the detected font type, or null if there is no readable font program or if its
+     * header isn't recognized
+     * @throws IOException if the font program can't be read
+     */
+    private static FontType getFontTypeFromFont(COSDictionary fontDescriptor, COSName fontType)
+            throws IOException
+    {
+        byte[] fontHeader = getFontHeader(fontDescriptor);
+        if (fontHeader == null)
+        {
+            return null;
+        }
+        // both comparisons start with the constant so that a null font type can't trigger a NPE
+        boolean isComposite = COSName.TYPE0.equals(fontType);
+        boolean isMultipleMaster = COSName.MM_TYPE1.equals(fontType);
+        if (isTrueTypeFile(fontHeader) || isTrueTypeCollectionFile(fontHeader))
+        {
+            return isComposite //
+                    ? new FontType(COSName.TYPE0, COSName.TRUE_TYPE.getName())
+                    : new FontType(COSName.TRUE_TYPE);
+        }
+        if (isOpenTypeFile(fontHeader))
+        {
+            return isComposite //
+                    ? new FontType(COSName.TYPE0, COSName.OPEN_TYPE.getName())
+                    : new FontType(COSName.OPEN_TYPE);
+        }
+        if (isType1File(fontHeader) || isPfbFile(fontHeader))
+        {
+            if (isComposite)
+            {
+                return new FontType(COSName.TYPE0, COSName.TYPE1.getName());
+            }
+            return isMultipleMaster //
+                    ? new FontType(COSName.MM_TYPE1, COSName.TYPE1.getName())
+                    : new FontType(COSName.TYPE1);
+        }
+        // CFF fonts have a more or less variable header so that the check should be done
+        // after all others to avoid wrong classifications
+        if (isCFFFile(fontHeader))
+        {
+            if (isComposite)
+            {
+                return new FontType(COSName.TYPE0, FONT_TYPE1C);
+            }
+            return isMultipleMaster //
+                    ? new FontType(COSName.MM_TYPE1, FONT_TYPE1C)
+                    : new FontType(COSName.TYPE1, FONT_TYPE1C);
+        }
+        return null;
+    }
+
+    /**
+     * Checks whether the given header belongs to a TrueType font.
+     *
+     * @param header the first bytes of the font program
+     * @return true if the header equals the binary signature 00 01 00 00 or the tag "true"
+     */
+    private static boolean isTrueTypeFile(byte[] header)
+    {
+        if (Arrays.equals(TTF_HEADER, header))
+        {
+            return true;
+        }
+        String tag = new String(header, StandardCharsets.US_ASCII);
+        return FONT_TRUE_TYPE.equals(tag);
+    }
+
+    /**
+     * Checks whether the given header belongs to a TrueType collection.
+     *
+     * @param header the first bytes of the font program
+     * @return true if the header read as US-ASCII equals the tag "ttcf"
+     */
+    private static boolean isTrueTypeCollectionFile(byte[] header)
+    {
+        String tag = new String(header, StandardCharsets.US_ASCII);
+        return FONT_TTF_COLLECTION.equals(tag);
+    }
+
+    /**
+     * Checks whether the given header belongs to an OpenType font.
+     *
+     * @param header the first bytes of the font program
+     * @return true if the header read as US-ASCII equals the tag "OTTO"
+     */
+    private static boolean isOpenTypeFile(byte[] header)
+    {
+        String tag = new String(header, StandardCharsets.US_ASCII);
+        return FONT_OPEN_TYPE.equals(tag);
+    }
+
+    /**
+     * Checks whether the given header belongs to a Type1 font program.
+     *
+     * @param header the first bytes of the font program, at least two bytes are expected
+     * @return true if the header starts with the comment marker '%!'
+     */
+    private static boolean isType1File(byte[] header)
+    {
+        // All Type1 font programs must begin with the comment '%!' (0x25 + 0x21).
+        if (header[0] != '%')
+        {
+            return false;
+        }
+        return header[1] == '!';
+    }
+
+    /**
+     * Checks whether the given header belongs to a PFB font.
+     *
+     * @param header the first bytes of the font program, at least two bytes are expected
+     * @return true if the header starts with 0x80 followed by either 0x01 or 0x02
+     */
+    private static boolean isPfbFile(byte[] header)
+    {
+        // all PFB fonts start with 0x80 followed by either 0x01 or 0x02
+        // bytes are signed, the first one has to be masked as 0x80 is stored as -128 and
+        // an unmasked comparison with the int value 0x80 would never succeed
+        return (header[0] & 0xFF) == 0x80 && (header[1] == 0x01 || header[1] == 0x02);
+    }
+
+    /**
+     * Checks whether the given header may belong to a CFF font.
+     *
+     * @param header the first bytes of the font program, at least four bytes are expected
+     * @return true if the first value is at least 1 and the fourth value is between 1 and 4
+     */
+    private static boolean isCFFFile(byte[] header)
+    {
+        // the header consists of 4 values
+        // major version, minor version, header size, offset size
+        int majorVersion = header[0];
+        int offsetSize = header[3];
+        // the major version must be >= 1 and the offset size >= 1 and <= 4
+        if (majorVersion < 1)
+        {
+            return false;
+        }
+        return offsetSize >= 1 && offsetSize <= 4;
+    }
+
+    /**
+     * Returns the font descriptor of the given font dictionary. If the dictionary itself has
+     * none, the descriptor of the first descendant font is used instead.
+     *
+     * @param dictionary the font dictionary
+     * @return the font descriptor, or null if there isn't any
+     */
+    private static COSDictionary getFontDescriptor(COSDictionary dictionary)
+    {
+        COSDictionary ownDescriptor = dictionary.getCOSDictionary(COSName.FONT_DESC);
+        if (ownDescriptor != null)
+        {
+            return ownDescriptor;
+        }
+        COSDictionary descendantFont = getDescendantFont(dictionary);
+        if (descendantFont == null)
+        {
+            return null;
+        }
+        return descendantFont.getCOSDictionary(COSName.FONT_DESC);
+    }
+
+    /**
+     * Returns the first entry of the DescendantFonts array of the given font dictionary.
+     *
+     * @param dictionary the font dictionary
+     * @return the first descendant font, or null if the array is missing or empty or if its
+     * first entry isn't a dictionary
+     */
+    private static COSDictionary getDescendantFont(COSDictionary dictionary)
+    {
+        COSArray fonts = dictionary.getCOSArray(COSName.DESCENDANT_FONTS);
+        if (fonts == null || fonts.size() <= 0)
+        {
+            return null;
+        }
+        COSBase firstEntry = fonts.getObject(0);
+        return firstEntry instanceof COSDictionary ? (COSDictionary) firstEntry : null;
+    }
+
+    /**
+     * Reads the first four bytes of the embedded font program. The keys FontFile, FontFile2
+     * and FontFile3 of the font descriptor are tried in that order and the first stream found
+     * is used.
+     *
+     * @param fontDescriptor the font descriptor, may be null
+     * @return the four header bytes, or null if there is no font descriptor, no embedded font
+     * program or if the font program has less than four bytes
+     * @throws IOException if the font program can't be read
+     */
+    private static byte[] getFontHeader(COSDictionary fontDescriptor) throws IOException
+    {
+        if (fontDescriptor == null)
+        {
+            return null;
+        }
+        COSStream fontFile = fontDescriptor.getCOSStream(COSName.FONT_FILE);
+        if (fontFile == null)
+        {
+            fontFile = fontDescriptor.getCOSStream(COSName.FONT_FILE2);
+        }
+        if (fontFile == null)
+        {
+            fontFile = fontDescriptor.getCOSStream(COSName.FONT_FILE3);
+        }
+        if (fontFile == null)
+        {
+            return null;
+        }
+        int headerLength = 4;
+        byte[] header = new byte[headerLength];
+        int bytesRead = 0;
+        try (RandomAccessRead fontView = fontFile.createView())
+        {
+            // a single call may deliver less bytes than requested
+            while (bytesRead < headerLength)
+            {
+                int amountRead = fontView.read(header, bytesRead, headerLength - bytesRead);
+                if (amountRead <= 0)
+                {
+                    break;
+                }
+                bytesRead += amountRead;
+            }
+        }
+        // an incomplete header would be padded with zeros and could be mistaken for a valid
+        // signature, e.g. the two bytes 00 01 would look like the TrueType header 00 01 00 00
+        return bytesRead == headerLength ? header : null;
+    }
     /**
      * Creates a new PDCIDFont instance with the appropriate subclass.
      *
@@ -127,19 +369,15 @@ public final class PDFontFactory
         {
             throw new IOException("Expected 'Font' dictionary but found '" + 
type.getName() + "'");
         }
-
         COSName subType = dictionary.getCOSName(COSName.SUBTYPE);
         if (COSName.CID_FONT_TYPE0.equals(subType))
         {
             return new PDCIDFontType0(dictionary, parent);
         }
-        else if (COSName.CID_FONT_TYPE2.equals(subType))
+        if (COSName.CID_FONT_TYPE2.equals(subType))
         {
             return new PDCIDFontType2(dictionary, parent);
         }
-        else
-        {
-            throw new IOException("Invalid font type: " + type);
-        }
+        throw new IOException("Invalid font type: " + type);
     }
 }


Reply via email to