ma hei pang created PDFBOX-5708:
-----------------------------------
Summary: Remove duplicate Embedded Subsets of font in PDF
Key: PDFBOX-5708
URL: https://issues.apache.org/jira/browse/PDFBOX-5708
Project: PDFBox
Issue Type: Wish
Components: PDModel
Affects Versions: 2.0.24
Reporter: ma hei pang
Attachments: 1.pdf, 2.pdf, picture1.PNG
I am using pdfbox-2.0.24 to merge multiple PDFs. However, the merged file is
too large because it contains many duplicate embedded font subsets. For
example, when I merge 1.pdf and 2.pdf, the result contains a repeated font
list, as shown in picture1.PNG.
I tried the source code provided by mkl : [How to reduce the size of merged
PDF/A-1b files with pdfbox or other java library - Stack
Overflow|https://stackoverflow.com/questions/53420344/how-to-reduce-the-size-of-merged-pdf-a-1b-files-with-pdfbox-or-other-java-librar/53544377#53544377]
{code:java}
// code placeholder
{code}
import org.apache.pdfbox.cos.*; import org.apache.pdfbox.pdmodel.PDDocument;
import java.io.File; import java.io.IOException; import java.io.InputStream;
import java.security.MessageDigest; import
java.security.NoSuchAlgorithmException; import java.util.*; public class
PdfOptimizer \{ public PdfOptimizer() {} public void optimize(PDDocument
pdDocument) throws IOException \{ Map<COSBase, Collection<Reference>>
complexObjects = findComplexObjects(pdDocument); for (int pass = 0; ; pass++) {
int merges = mergeDuplicates(complexObjects); if (merges <= 0) {
System.out.printf("Pass %d - No merged objects\n\n", pass); break; }
System.out.printf("Pass %d - Merged objects: %d\n\n", pass, merges); } }
Map<COSBase, Collection<Reference>> findComplexObjects(PDDocument pdDocument)
\{ // Implementation of findComplexObjects method COSDictionary
catalogDictionary = pdDocument.getDocumentCatalog().getCOSObject();
Map<COSBase, Collection<Reference>> incomingReferences = new HashMap<>();
incomingReferences.put(catalogDictionary, new ArrayList<>()); Set<COSBase>
lastPass = Collections.<COSBase>singleton(catalogDictionary); Set<COSBase>
thisPass = new HashSet<>(); while(!lastPass.isEmpty()) { for (COSBase object :
lastPass) { if (object instanceof COSArray) { COSArray array = (COSArray)
object; for (int i = 0; i < array.size(); i++) { addTarget(new
ArrayReference(array, i), incomingReferences, thisPass); } } else if (object
instanceof COSDictionary) \{ COSDictionary dictionary = (COSDictionary) object;
for (COSName key : dictionary.keySet()) { addTarget(new
DictionaryReference(dictionary, key), incomingReferences, thisPass); } } }
lastPass = thisPass; thisPass = new HashSet<>(); } return incomingReferences; }
void addTarget(Reference reference, Map<COSBase, Collection<Reference>>
incomingReferences, Set<COSBase> thisPass) \{ // Implementation of addTarget
method COSBase object = reference.getTo(); if (object instanceof COSArray ||
object instanceof COSDictionary) { Collection<Reference> incoming =
incomingReferences.get(object); if (incoming == null) { incoming = new
ArrayList<>(); incomingReferences.put(object, incoming); thisPass.add(object);
} incoming.add(reference); } } int mergeDuplicates(Map<COSBase,
Collection<Reference>> complexObjects) throws IOException \{ // Implementation
of mergeDuplicates method List<HashOfCOSBase> hashes = new
ArrayList<>(complexObjects.size()); for (COSBase object :
complexObjects.keySet()) { hashes.add(new HashOfCOSBase(object)); }
Collections.sort(hashes); int removedDuplicates = 0; if (!hashes.isEmpty()) \{
int runStart = 0; int runHash = hashes.get(0).hash; for (int i = 1; i <
hashes.size(); i++) { int hash = hashes.get(i).hash; if (hash != runHash) { int
runSize = i - runStart; if (runSize != 1) { System.out.printf("Equal hash %d
for %d elements.\n", runHash, runSize); removedDuplicates +=
mergeRun(complexObjects, hashes.subList(runStart, i)); } runHash = hash;
runStart = i; } } int runSize = hashes.size() - runStart; if (runSize != 1) \{
System.out.printf("Equal hash %d for %d elements.\n", runHash, runSize);
removedDuplicates += mergeRun(complexObjects, hashes.subList(runStart,
hashes.size())); } } return removedDuplicates; } int mergeRun(Map<COSBase,
Collection<Reference>> complexObjects, List<HashOfCOSBase> run) \{ //
Implementation of mergeRun method int removedDuplicates = 0;
List<List<COSBase>> duplicateSets = new ArrayList<>(); for (HashOfCOSBase entry
: run) { COSBase element = entry.object; for (List<COSBase> duplicateSet :
duplicateSets) { if (equals(element, duplicateSet.get(0))) {
duplicateSet.add(element); element = null; break; } } if (element != null) \{
List<COSBase> duplicateSet = new ArrayList<>(); duplicateSet.add(element);
duplicateSets.add(duplicateSet); } } System.out.printf("Identified %d set(s) of
identical objects in run.\n", duplicateSets.size()); for (List<COSBase>
duplicateSet : duplicateSets) \{ if (duplicateSet.size() > 1) { COSBase
surviver = duplicateSet.remove(0); Collection<Reference> surviverReferences =
complexObjects.get(surviver); for (COSBase object : duplicateSet) {
Collection<Reference> references = complexObjects.get(object); for (Reference
reference : references) { reference.setTo(surviver);
surviverReferences.add(reference); } complexObjects.remove(object);
removedDuplicates++; } surviver.setDirect(false); } } return removedDuplicates;
} boolean equals(COSBase a, COSBase b) \{ // Implementation of equals method if
(a instanceof COSArray) { if (b instanceof COSArray) { COSArray aArray =
(COSArray) a; COSArray bArray = (COSArray) b; if (aArray.size() ==
bArray.size()) { for (int i=0; i < aArray.size(); i++) { if
(!resolve(aArray.get(i)).equals(resolve(bArray.get(i)))) return false; } return
true; } } } else if (a instanceof COSDictionary) \{ if (b instanceof
COSDictionary) { COSDictionary aDict = (COSDictionary) a; COSDictionary bDict =
(COSDictionary) b; Set<COSName> keys = aDict.keySet(); if
(keys.equals(bDict.keySet())) { for (COSName key : keys) { if
(!resolve(aDict.getItem(key)).equals(bDict.getItem(key))) return false; } // In
case of COSStreams we strictly speaking should // also compare the stream
contents here. But apparently // their hashes coincide well enough for the
original // hashing equality, so let's just assume... return true; } } } return
false; } static COSBase resolve(COSBase object) \{ // Implementation of resolve
method while (object instanceof COSObject) object =
((COSObject)object).getObject(); return object; } interface Reference \{ public
COSBase getFrom(); public COSBase getTo(); public void setTo(COSBase to); }
static class ArrayReference implements Reference \{ public
ArrayReference(COSArray array, int index) { this.from = array; this.index =
index; } @Override public COSBase getFrom() \{ return from; } @Override public
COSBase getTo() \{ return resolve(from.get(index)); } @Override public void
setTo(COSBase to) \{ from.set(index, to); } final COSArray from; final int
index; } static class DictionaryReference implements Reference \{ public
DictionaryReference(COSDictionary dictionary, COSName key) { this.from =
dictionary; this.key = key; } @Override public COSBase getFrom() \{ return
from; } @Override public COSBase getTo() \{ return
resolve(from.getDictionaryObject(key)); } @Override public void setTo(COSBase
to) \{ from.setItem(key, to); } final COSDictionary from; final COSName key; }
static class HashOfCOSBase implements Comparable<HashOfCOSBase> \{ public
HashOfCOSBase(COSBase object) throws IOException { this.object = object;
this.hash = calculateHash(object); } int calculateHash(COSBase object) throws
IOException \{ if (object instanceof COSArray) { int result = 1; for (COSBase
member : (COSArray)object) result = 31 * result + member.hashCode(); return
result; } else if (object instanceof COSDictionary) \{ int result = 3; for
(Map.Entry<COSName, COSBase> entry : ((COSDictionary)object).entrySet()) result
+= entry.hashCode(); if (object instanceof COSStream) { try ( InputStream data
= ((COSStream)object).createRawInputStream() ) { MessageDigest md =
MessageDigest.getInstance("MD5"); byte[] buffer = new byte[8192]; int bytesRead
= 0; while((bytesRead = data.read(buffer)) >= 0) md.update(buffer, 0,
bytesRead); result = 31 * result + Arrays.hashCode(md.digest()); } catch
(NoSuchAlgorithmException e) \{ throw new IOException(e); } } return result; }
else \{ throw new IllegalArgumentException(String.format("Unknown complex
COSBase type %s", object.getClass().getName())); } } final COSBase object;
final int hash; @Override public int compareTo(HashOfCOSBase o) \{ int result =
Integer.compare(hash, o.hash); if (result == 0) result =
Integer.compare(hashCode(), o.hashCode()); return result; } } }
However, it is not working for me. Does anyone have an idea how to remove the
duplicate embedded font subsets when merging PDFs?
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]