ma hei pang created PDFBOX-5708:
-----------------------------------

             Summary: Remove duplicate Embedded Subsets of font in PDF
                 Key: PDFBOX-5708
                 URL: https://issues.apache.org/jira/browse/PDFBOX-5708
             Project: PDFBox
          Issue Type: Wish
          Components: PDModel
    Affects Versions: 2.0.24
            Reporter: ma hei pang
         Attachments: 1.pdf, 2.pdf, picture1.PNG

I am using pdfbox-2.0.24 to merge multiple PDFs. However, the merged file is 
too large because it contains many duplicate embedded font subsets. For 
example, when I merge 1.pdf and 2.pdf, the font list of the result contains 
repeated entries, as shown in picture1.PNG. 

 

I tried the source code provided by mkl : [How to reduce the size of merged 
PDF/A-1b files with pdfbox or other java library - Stack 
Overflow|https://stackoverflow.com/questions/53420344/how-to-reduce-the-size-of-merged-pdf-a-1b-files-with-pdfbox-or-other-java-librar/53544377#53544377]
{code:java}
// code placeholder
{code}
import org.apache.pdfbox.cos.*; import org.apache.pdfbox.pdmodel.PDDocument; 
import java.io.File; import java.io.IOException; import java.io.InputStream; 
import java.security.MessageDigest; import 
java.security.NoSuchAlgorithmException; import java.util.*; public class 
PdfOptimizer \{ public PdfOptimizer() {} public void optimize(PDDocument 
pdDocument) throws IOException \{ Map<COSBase, Collection<Reference>> 
complexObjects = findComplexObjects(pdDocument); for (int pass = 0; ; pass++) { 
int merges = mergeDuplicates(complexObjects); if (merges <= 0) { 
System.out.printf("Pass %d - No merged objects\n\n", pass); break; } 
System.out.printf("Pass %d - Merged objects: %d\n\n", pass, merges); } } 
Map<COSBase, Collection<Reference>> findComplexObjects(PDDocument pdDocument) 
\{ // Implementation of findComplexObjects method COSDictionary 
catalogDictionary = pdDocument.getDocumentCatalog().getCOSObject(); 
Map<COSBase, Collection<Reference>> incomingReferences = new HashMap<>(); 
incomingReferences.put(catalogDictionary, new ArrayList<>()); Set<COSBase> 
lastPass = Collections.<COSBase>singleton(catalogDictionary); Set<COSBase> 
thisPass = new HashSet<>(); while(!lastPass.isEmpty()) { for (COSBase object : 
lastPass) { if (object instanceof COSArray) { COSArray array = (COSArray) 
object; for (int i = 0; i < array.size(); i++) { addTarget(new 
ArrayReference(array, i), incomingReferences, thisPass); } } else if (object 
instanceof COSDictionary) \{ COSDictionary dictionary = (COSDictionary) object; 
for (COSName key : dictionary.keySet()) { addTarget(new 
DictionaryReference(dictionary, key), incomingReferences, thisPass); } } } 
lastPass = thisPass; thisPass = new HashSet<>(); } return incomingReferences; } 
void addTarget(Reference reference, Map<COSBase, Collection<Reference>> 
incomingReferences, Set<COSBase> thisPass) \{ // Implementation of addTarget 
method COSBase object = reference.getTo(); if (object instanceof COSArray || 
object instanceof COSDictionary) { Collection<Reference> incoming = 
incomingReferences.get(object); if (incoming == null) { incoming = new 
ArrayList<>(); incomingReferences.put(object, incoming); thisPass.add(object); 
} incoming.add(reference); } } int mergeDuplicates(Map<COSBase, 
Collection<Reference>> complexObjects) throws IOException \{ // Implementation 
of mergeDuplicates method List<HashOfCOSBase> hashes = new 
ArrayList<>(complexObjects.size()); for (COSBase object : 
complexObjects.keySet()) { hashes.add(new HashOfCOSBase(object)); } 
Collections.sort(hashes); int removedDuplicates = 0; if (!hashes.isEmpty()) \{ 
int runStart = 0; int runHash = hashes.get(0).hash; for (int i = 1; i < 
hashes.size(); i++) { int hash = hashes.get(i).hash; if (hash != runHash) { int 
runSize = i - runStart; if (runSize != 1) { System.out.printf("Equal hash %d 
for %d elements.\n", runHash, runSize); removedDuplicates += 
mergeRun(complexObjects, hashes.subList(runStart, i)); } runHash = hash; 
runStart = i; } } int runSize = hashes.size() - runStart; if (runSize != 1) \{ 
System.out.printf("Equal hash %d for %d elements.\n", runHash, runSize); 
removedDuplicates += mergeRun(complexObjects, hashes.subList(runStart, 
hashes.size())); } } return removedDuplicates; } int mergeRun(Map<COSBase, 
Collection<Reference>> complexObjects, List<HashOfCOSBase> run) \{ // 
Implementation of mergeRun method int removedDuplicates = 0; 
List<List<COSBase>> duplicateSets = new ArrayList<>(); for (HashOfCOSBase entry 
: run) { COSBase element = entry.object; for (List<COSBase> duplicateSet : 
duplicateSets) { if (equals(element, duplicateSet.get(0))) { 
duplicateSet.add(element); element = null; break; } } if (element != null) \{ 
List<COSBase> duplicateSet = new ArrayList<>(); duplicateSet.add(element); 
duplicateSets.add(duplicateSet); } } System.out.printf("Identified %d set(s) of 
identical objects in run.\n", duplicateSets.size()); for (List<COSBase> 
duplicateSet : duplicateSets) \{ if (duplicateSet.size() > 1) { COSBase 
surviver = duplicateSet.remove(0); Collection<Reference> surviverReferences = 
complexObjects.get(surviver); for (COSBase object : duplicateSet) { 
Collection<Reference> references = complexObjects.get(object); for (Reference 
reference : references) { reference.setTo(surviver); 
surviverReferences.add(reference); } complexObjects.remove(object); 
removedDuplicates++; } surviver.setDirect(false); } } return removedDuplicates; 
} boolean equals(COSBase a, COSBase b) \{ // Implementation of equals method if 
(a instanceof COSArray) { if (b instanceof COSArray) { COSArray aArray = 
(COSArray) a; COSArray bArray = (COSArray) b; if (aArray.size() == 
bArray.size()) { for (int i=0; i < aArray.size(); i++) { if 
(!resolve(aArray.get(i)).equals(resolve(bArray.get(i)))) return false; } return 
true; } } } else if (a instanceof COSDictionary) \{ if (b instanceof 
COSDictionary) { COSDictionary aDict = (COSDictionary) a; COSDictionary bDict = 
(COSDictionary) b; Set<COSName> keys = aDict.keySet(); if 
(keys.equals(bDict.keySet())) { for (COSName key : keys) { if 
(!resolve(aDict.getItem(key)).equals(bDict.getItem(key))) return false; } // In 
case of COSStreams we strictly speaking should // also compare the stream 
contents here. But apparently // their hashes coincide well enough for the 
original // hashing equality, so let's just assume... return true; } } } return 
false; } static COSBase resolve(COSBase object) \{ // Implementation of resolve 
method while (object instanceof COSObject) object = 
((COSObject)object).getObject(); return object; } interface Reference \{ public 
COSBase getFrom(); public COSBase getTo(); public void setTo(COSBase to); } 
static class ArrayReference implements Reference \{ public 
ArrayReference(COSArray array, int index) { this.from = array; this.index = 
index; } @Override public COSBase getFrom() \{ return from; } @Override public 
COSBase getTo() \{ return resolve(from.get(index)); } @Override public void 
setTo(COSBase to) \{ from.set(index, to); } final COSArray from; final int 
index; } static class DictionaryReference implements Reference \{ public 
DictionaryReference(COSDictionary dictionary, COSName key) { this.from = 
dictionary; this.key = key; } @Override public COSBase getFrom() \{ return 
from; } @Override public COSBase getTo() \{ return 
resolve(from.getDictionaryObject(key)); } @Override public void setTo(COSBase 
to) \{ from.setItem(key, to); } final COSDictionary from; final COSName key; } 
static class HashOfCOSBase implements Comparable<HashOfCOSBase> \{ public 
HashOfCOSBase(COSBase object) throws IOException { this.object = object; 
this.hash = calculateHash(object); } int calculateHash(COSBase object) throws 
IOException \{ if (object instanceof COSArray) { int result = 1; for (COSBase 
member : (COSArray)object) result = 31 * result + member.hashCode(); return 
result; } else if (object instanceof COSDictionary) \{ int result = 3; for 
(Map.Entry<COSName, COSBase> entry : ((COSDictionary)object).entrySet()) result 
+= entry.hashCode(); if (object instanceof COSStream) { try ( InputStream data 
= ((COSStream)object).createRawInputStream() ) { MessageDigest md = 
MessageDigest.getInstance("MD5"); byte[] buffer = new byte[8192]; int bytesRead 
= 0; while((bytesRead = data.read(buffer)) >= 0) md.update(buffer, 0, 
bytesRead); result = 31 * result + Arrays.hashCode(md.digest()); } catch 
(NoSuchAlgorithmException e) \{ throw new IOException(e); } } return result; } 
else \{ throw new IllegalArgumentException(String.format("Unknown complex 
COSBase type %s", object.getClass().getName())); } } final COSBase object; 
final int hash; @Override public int compareTo(HashOfCOSBase o) \{ int result = 
Integer.compare(hash, o.hash); if (result == 0) result = 
Integer.compare(hashCode(), o.hashCode()); return result; } } }

 

However, it is not working for me. Does anyone have an idea how to remove the 
duplicate embedded font subsets when merging PDFs?

 

 

 



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to