Author: lehmi Date: Sun Jan 21 13:15:31 2024 New Revision: 1915351 URL: http://svn.apache.org/viewvc?rev=1915351&view=rev Log: PDFBOX-5704: detect and fix wrong CID font subtype inspired by the implementation of pdf.js as proposed by Mike Li on dev@
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java?rev=1915351&r1=1915350&r2=1915351&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFontFactory.java Sun Jan 21 13:15:31 2024
@@ -17,9 +17,16 @@
 package org.apache.pdfbox.pdmodel.font;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
 
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.io.RandomAccessRead;
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.LogManager;
 import org.apache.pdfbox.pdmodel.ResourceCache;
@@ -32,6 +39,12 @@ public final class PDFontFactory
 {
     private static final Logger LOG = LogManager.getLogger(PDFontFactory.class);
 
+    private static final String FONT_TYPE1C = "Type1C";
+    private static final String FONT_OPEN_TYPE = "OTTO";
+    private static final String FONT_TTF_COLLECTION = "ttcf";
+    private static final String FONT_TRUE_TYPE = "true";
+    private static final byte[] TTF_HEADER = new byte[] { 0, 1, 0, 0 };
+
     private PDFontFactory()
     {
     }
@@ -48,6 +61,58 @@ public final class PDFontFactory
         return createFont(dictionary, null);
     }
 
+    private static class FontType
+    {
+        private static final List<String> cidType0Types //
+                = Arrays.asList(COSName.TYPE1.getName(), FONT_TYPE1C);
+        private static final List<String> cidType2Types //
+                = Arrays.asList(COSName.TRUE_TYPE.getName(), COSName.OPEN_TYPE.getName());
+        private final COSName type;
+        private final COSName subtype;
+
+        public FontType(COSName type, String subtypeString)
+        {
+            this.type = type;
+            if (cidType0Types.contains(subtypeString))
+            {
+                subtype = COSName.CID_FONT_TYPE0;
+            }
+            else if (cidType2Types.contains(subtypeString))
+            {
+                subtype = COSName.CID_FONT_TYPE2;
+            }
+            else
+            {
+                subtype = null;
+            }
+        }
+
+        public FontType(COSName type, COSName subtype)
+        {
+            this.type = type;
+            this.subtype = subtype;
+        }
+
+        public FontType(COSName type)
+        {
+            this(type, (COSName) null);
+        }
+
+        public COSName getSubtype()
+        {
+            return subtype;
+        }
+
+        public boolean isCIDSubtype(COSName cidSubtype)
+        {
+            if (!COSName.TYPE0.equals(type))
+            {
+                return false;
+            }
+            return subtype != null && subtype.equals(cidSubtype);
+        }
+    }
+
     /**
      * Creates a new PDFont instance with the appropriate subclass.
      *
@@ -93,6 +158,18 @@ public final class PDFontFactory
         }
         else if (COSName.TYPE0.equals(subType))
         {
+            COSDictionary fontDescriptor = getFontDescriptor(dictionary);
+            FontType fontTypeFromFont = getFontTypeFromFont(fontDescriptor, subType);
+            if (fontTypeFromFont != null)
+            {
+                COSDictionary descendantFont = getDescendantFont(dictionary);
+                COSName descFontType = descendantFont != null
+                        ? descendantFont.getCOSName(COSName.SUBTYPE) : null;
+                if (descFontType != null && !fontTypeFromFont.isCIDSubtype(descFontType))
+                {
+                    fixType0Subtype(descendantFont, fontDescriptor, fontTypeFromFont.getSubtype());
+                }
+            }
             return new PDType0Font(dictionary);
         }
         else if (COSName.CID_FONT_TYPE0.equals(subType))
@@ -112,6 +189,171 @@ public final class PDFontFactory
         }
     }
 
+    private static void fixType0Subtype(COSDictionary descendantFont, COSDictionary fontDescriptor,
+            COSName newSubType)
+    {
+        LOG.warn("Try to fix different descendant font types for font {}",
+                fontDescriptor.getNameAsString(COSName.FONT_NAME));
+        if (COSName.CID_FONT_TYPE0.equals(newSubType)
+                && !fontDescriptor.containsKey(COSName.FONT_FILE3)
+                && fontDescriptor.containsKey(COSName.FONT_FILE2))
+        {
+            fontDescriptor.setItem(COSName.FONT_FILE3, fontDescriptor.getItem(COSName.FONT_FILE2));
+            fontDescriptor.removeItem(COSName.FONT_FILE2);
+        }
+        if (COSName.CID_FONT_TYPE2.equals(newSubType)
+                && fontDescriptor.containsKey(COSName.FONT_FILE3)
+                && !fontDescriptor.containsKey(COSName.FONT_FILE2))
+        {
+            fontDescriptor.setItem(COSName.FONT_FILE2, fontDescriptor.getItem(COSName.FONT_FILE3));
+            fontDescriptor.removeItem(COSName.FONT_FILE3);
+        }
+        descendantFont.setItem(COSName.SUBTYPE, newSubType);
+    }
+
+    private static FontType getFontTypeFromFont(COSDictionary fontDescriptor, COSName fontType)
+            throws IOException
+    {
+        byte[] fontHeader = getFontHeader(fontDescriptor);
+        if (fontHeader == null)
+        {
+            return null;
+        }
+        boolean isComposite = COSName.TYPE0.equals(fontType);
+        if (isTrueTypeFile(fontHeader) || isTrueTypeCollectionFile(fontHeader))
+        {
+            return isComposite //
+                    ? new FontType(COSName.TYPE0, COSName.TRUE_TYPE.getName())
+                    : new FontType(COSName.TRUE_TYPE);
+        }
+        if (isOpenTypeFile(fontHeader))
+        {
+            return isComposite //
+                    ? new FontType(COSName.TYPE0, COSName.OPEN_TYPE.getName())
+                    : new FontType(COSName.OPEN_TYPE);
+        }
+        if (isType1File(fontHeader) || isPfbFile(fontHeader))
+        {
+            if (isComposite)
+            {
+                return new FontType(COSName.TYPE0, COSName.TYPE1.getName());
+            }
+            return fontType.equals(COSName.MM_TYPE1)
+                    ? new FontType(COSName.MM_TYPE1, COSName.TYPE1.getName())
+                    : new FontType(COSName.TYPE1);
+        }
+        // CFF fonts have a more or less variable header so that the check should be done
+        // after all others to avoid wrong classifications
+        if (isCFFFile(fontHeader))
+        {
+            if (isComposite)
+            {
+                return new FontType(COSName.TYPE0, FONT_TYPE1C);
+            }
+            return fontType.equals(COSName.MM_TYPE1) //
+                    ? new FontType(COSName.MM_TYPE1, FONT_TYPE1C)
+                    : new FontType(COSName.TYPE1, FONT_TYPE1C);
+        }
+        return null;
+    }
+
+    private static boolean isTrueTypeFile(byte[] header)
+    {
+        return Arrays.equals(TTF_HEADER, header)
+                || FONT_TRUE_TYPE.equals(new String(header, StandardCharsets.US_ASCII));
+    }
+
+    private static boolean isTrueTypeCollectionFile(byte[] header)
+    {
+        return FONT_TTF_COLLECTION.equals(new String(header, StandardCharsets.US_ASCII));
+    }
+
+    private static boolean isOpenTypeFile(byte[] header)
+    {
+        return FONT_OPEN_TYPE.equals(new String(header, StandardCharsets.US_ASCII));
+    }
+
+    private static boolean isType1File(byte[] header)
+    {
+        // All Type1 font programs must begin with the comment '%!' (0x25 + 0x21).
+        return header[0] == 0x25 && header[1] == 0x21;
+    }
+
+    private static boolean isPfbFile(byte[] header)
+    {
+        // PFB fonts start with 0x80 followed by 0x01 or 0x02; mask as Java bytes are signed
+        return (header[0] & 0xff) == 0x80 && (header[1] == 0x01 || header[1] == 0x02);
+    }
+
+    private static boolean isCFFFile(byte[] header)
+    {
+        // the header consists of 4 values
+        // major version, minor version, header size, offset size
+        // the major version must be >= 1 and the offset size >= 1 and <= 4
+        return header[0] >= 1 && header[3] >= 1 && header[3] <= 4;
+    }
+
+    private static COSDictionary getFontDescriptor(COSDictionary dictionary)
+    {
+        COSDictionary fontDescriptor = dictionary.getCOSDictionary(COSName.FONT_DESC);
+        if (fontDescriptor == null)
+        {
+            COSDictionary descendantFont = getDescendantFont(dictionary);
+            if (descendantFont != null)
+            {
+                fontDescriptor = descendantFont.getCOSDictionary(COSName.FONT_DESC);
+            }
+        }
+        return fontDescriptor;
+    }
+
+    private static COSDictionary getDescendantFont(COSDictionary dictionary)
+    {
+        COSArray descendantFonts = dictionary.getCOSArray(COSName.DESCENDANT_FONTS);
+        if (descendantFonts != null && descendantFonts.size() > 0)
+        {
+            COSBase descendantFontDictBase = descendantFonts.getObject(0);
+            if (descendantFontDictBase instanceof COSDictionary)
+            {
+                return (COSDictionary) descendantFontDictBase;
+            }
+        }
+        return null;
+    }
+
+    private static byte[] getFontHeader(COSDictionary fontDescriptor) throws IOException
+    {
+        if (fontDescriptor == null)
+        {
+            return null;
+        }
+        COSStream fontFile = fontDescriptor.getCOSStream(COSName.FONT_FILE);
+        if (fontFile == null)
+        {
+            fontFile = fontDescriptor.getCOSStream(COSName.FONT_FILE2);
+        }
+        if (fontFile == null)
+        {
+            fontFile = fontDescriptor.getCOSStream(COSName.FONT_FILE3);
+        }
+        byte[] header = null;
+        if (fontFile != null)
+        {
+            try (RandomAccessRead fontView = fontFile.createView())
+            {
+                int headerLength = 4;
+                header = new byte[headerLength];
+                int remainingBytes = headerLength;
+                int amountRead;
+                while ((amountRead = fontView.read(header, headerLength - remainingBytes,
+                        remainingBytes)) > 0)
+                {
+                    remainingBytes -= amountRead;
+                }
+            }
+        }
+        return header;
+    }
     /**
      * Creates a new PDCIDFont instance with the appropriate subclass.
      *
@@ -127,19 +369,15 @@ public final class PDFontFactory
         {
             throw new IOException("Expected 'Font' dictionary but found '" + type.getName() + "'");
         }
-
         COSName subType = dictionary.getCOSName(COSName.SUBTYPE);
         if (COSName.CID_FONT_TYPE0.equals(subType))
         {
             return new PDCIDFontType0(dictionary, parent);
         }
-        else if (COSName.CID_FONT_TYPE2.equals(subType))
+        if (COSName.CID_FONT_TYPE2.equals(subType))
         {
             return new PDCIDFontType2(dictionary, parent);
         }
-        else
-        {
-            throw new IOException("Invalid font type: " + type);
-        }
+        throw new IOException("Invalid font type: " + type);
     }
 }