Author: msahyoun
Date: Thu Mar 19 16:39:50 2026
New Revision: 1932392
Log:
PDFBOX-6178, PDFBOX-4076: use byte[] instead of String for internal storage in
COSName
Modified:
pdfbox/trunk/pdfbox/pom.xml
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSName.java
Modified: pdfbox/trunk/pdfbox/pom.xml
==============================================================================
--- pdfbox/trunk/pdfbox/pom.xml Thu Mar 19 12:45:02 2026 (r1932391)
+++ pdfbox/trunk/pdfbox/pom.xml Thu Mar 19 16:39:50 2026 (r1932392)
@@ -920,10 +920,22 @@
<sha512>60e7b46f11a655083a57f4a627edf75bee477cd1ebfb06fbaefeb8ba16a01be37039cfd51c7c181a3ea368b53d22e327d480a7b0699c0edb2e06b4faaae7790f</sha512>
</configuration>
</execution>
+ <execution>
+ <id>PDFBOX-6178</id>
+ <phase>generate-test-resources</phase>
+ <goals>
+ <goal>wget</goal>
+ </goals>
+ <configuration>
+
<url>https://issues.apache.org/jira/secure/attachment/13081297/form_empty.pdf</url>
+
<outputDirectory>${project.build.directory}/pdfs</outputDirectory>
+ <outputFileName>PDFBOX-6178.pdf</outputFileName>
+
<sha512>d39486af0614bd099167a6adaab833aed41a0ebec7b85b13b382a2fdb6fddbcaaea9ab26ed0a81b72822258c8dd66dd535fa5c76afd1e5a8b1bff7d81e890274</sha512>
+ </configuration>
+ </execution>
</executions>
</plugin>
</plugins>
</build>
</project>
-
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java
Thu Mar 19 12:45:02 2026 (r1932391)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java
Thu Mar 19 16:39:50 2026 (r1932392)
@@ -20,7 +20,9 @@ import java.io.IOException;
import java.io.OutputStream;
import java.lang.ref.Cleaner;
import java.lang.ref.WeakReference;
+import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
@@ -34,7 +36,7 @@ import org.apache.pdfbox.util.Hex;
public final class COSName extends COSBase implements Comparable<COSName>
{
// using ConcurrentHashMap because this can be accessed by multiple threads
- private static final Map<String, WeakReference<COSName>> NAME_MAP = //
+ private static final Map<ByteBuffer, WeakReference<COSName>> NAME_MAP = //
new ConcurrentHashMap<>(8192);
private static final Cleaner CLEANER = Cleaner.create();
@@ -673,18 +675,59 @@ public final class COSName extends COSBa
public static final COSName ZA_DB = getPDFName("ZaDb");
// fields
- private final String name;
/**
- * This will get a COSName object with that name.
- *
- * @param aName The name of the object.
- *
- * @return A COSName with the specified name.
+ * <p>Per PDF 32000-1:2008 §7.3.5: Beginning with PDF 1.2 a name object is
an atomic symbol
+ * uniquely defined by a sequence of any characters (8-bit values) except
null
+ * (character code 0).</p>
+ */
+ private final byte[] nameBytes;
+
+
+ /**
+ * Returns a {@code COSName} whose byte sequence is the UTF-8 encoding of
{@code aName}.
+ *
+ * <p>This is the standard factory for names defined in Java source code
(e.g. the static
+ * constants above). All well-formed PDF names defined by the spec are
ASCII, so the UTF-8
+ * encoding is a transparent identity transform for those cases.</p>
+ *
+ * @param aName the name string; must not be {@code null}
+ * @return a canonicalised {@code COSName} instance
*/
public static COSName getPDFName(String aName)
{
- WeakReference<COSName> weakRef = NAME_MAP.get(aName);
+ return getPDFName(aName.getBytes(StandardCharsets.UTF_8));
+ }
+
+ /**
+ * Returns a {@code COSName} whose byte sequence is exactly {@code bytes}.
+ *
+ * <p>This is the preferred factory when constructing a name directly from
a PDF byte stream
+ * (i.e. after the parser has stripped the leading {@code /} and expanded
all {@code #XX}
+ * escape sequences). Using this method preserves the spec-correct,
byte-level identity of the
+ * name even when the bytes are not valid UTF-8.</p>
+ *
+ * <p>Null bytes (0x00) are rejected; the spec explicitly excludes
them.</p>
+ *
+ * @param bytes the raw decoded byte sequence; must not be {@code null}
and must not contain 0x00
+ * @return a canonicalised {@code COSName} instance
+ * @throws IllegalArgumentException if {@code bytes} contains a null byte
+ */
+ public static COSName getPDFName(byte[] bytes)
+ {
+ for (byte b : bytes)
+ {
+ if (b == 0)
+ {
+ throw new IllegalArgumentException(
+ "PDF name bytes must not contain null (0x00)
characters");
+ }
+ }
+
+ // Wrap for lookup only to avoid unnecessary copying of the byte array
for the key.
+ ByteBuffer lookupKey = ByteBuffer.wrap(bytes);
+
+ WeakReference<COSName> weakRef = NAME_MAP.get(lookupKey);
COSName name = weakRef != null ? weakRef.get() : null;
if (name == null)
@@ -694,13 +737,19 @@ public final class COSName extends COSBa
// Use double checked locking to make the code thread safe.
synchronized (NAME_MAP)
{
- weakRef = NAME_MAP.get(aName);
+ weakRef = NAME_MAP.get(lookupKey);
name = weakRef != null ? weakRef.get() : null;
if (name == null)
{
- name = new COSName(aName);
- CLEANER.register(name, () -> NAME_MAP.remove(aName));
- NAME_MAP.put(aName, new WeakReference<>(name));
+ // Defensive copy which is OK to share as the key and the
nameBytes
+ // of the COSName are immutable.
+ byte[] storedBytes = Arrays.copyOf(bytes, bytes.length);
+ ByteBuffer storedKey = ByteBuffer.wrap(storedBytes);
+
+ name = new COSName(storedBytes);
+
+ CLEANER.register(name, () -> NAME_MAP.remove(storedKey));
+ NAME_MAP.put(storedKey, new WeakReference<>(name));
}
}
}
@@ -713,43 +762,78 @@ public final class COSName extends COSBa
*
* @param aName The name of the COSName object.
*/
- private COSName(String aName)
+ private COSName(byte[] storedBytes)
+ {
+ this.nameBytes = storedBytes;
+ }
+
+
+ /**
+ * Returns the raw byte sequence that defines this name.
+ *
+ * <p>This is the atomic content/identity of the name. Prefer this over
+ * {@link #getName()} whenever you need to write name bytes to an output
stream, compare names
+ * parsed from a PDF, or otherwise operate at the byte level.</p>
+ *
+ * @return a defensive copy of the internal byte array; never {@code null}
+ */
+ public byte[] getBytes()
{
- this.name = aName;
+ return Arrays.copyOf(nameBytes, nameBytes.length);
}
/**
- * This will get the name of this COSName object.
+ * Returns the name decoded as a UTF-8 {@code String}.
+ *
+ * <p>This method exists primarily for backward compatibility and for
cases where the
+ * readable value needs to be stored.</p>
+ *
+ * <p>Per PDF 32000-1:2008 §7.3.5, ... However, occasionally the need
arises to treat a name object
+ * as text, such as one that represents a font ... </p>
+ *
+ * <p>... In such situations, the sequence of bytes (after expansion of
NUMBER SIGN sequences, if any)
+ * should be interpreted according to UTF-8... </p>
+ *
+ * <p>Use {@link #getBytes()} when byte-level fidelity is required.</p>
*
* @return The name of the object.
*/
public String getName()
{
- return name;
+ String utf8String = new String(nameBytes, StandardCharsets.UTF_8);
+
+ //check for lossy decoding, which can happen if the name contains
+ // bytes that are not valid UTF-8
+ if (utf8String.indexOf('\uFFFD') >= 0) {
+ // fall back to ISO-8859-1, which is a single-byte encoding that
can decode any
+ // byte sequence without loss
+ return new String(nameBytes, StandardCharsets.ISO_8859_1);
+ }
+ return utf8String;
}
@Override
public String toString()
{
- return "COSName{" + name + "}";
+ return "COSName{" + getName() + "}";
}
@Override
public boolean equals(Object object)
{
- return object instanceof COSName && name.equals(((COSName)
object).name);
+ return object instanceof COSName && Arrays.equals(nameBytes,
((COSName) object).nameBytes);
}
@Override
public int hashCode()
{
- return name.hashCode();
+ return Arrays.hashCode(nameBytes);
}
@Override
public int compareTo(COSName other)
{
- return name.compareTo(other.name);
+ return Arrays.compare(nameBytes, other.nameBytes);
}
/**
@@ -758,7 +842,7 @@ public final class COSName extends COSBa
*/
public boolean isEmpty()
{
- return name.isEmpty();
+ return nameBytes.length == 0;
}
@Override
@@ -776,7 +860,7 @@ public final class COSName extends COSBa
public void writePDF(OutputStream output) throws IOException
{
output.write('/');
- byte[] bytes = getName().getBytes(StandardCharsets.UTF_8);
+ byte[] bytes = getBytes();
for (byte b : bytes)
{
int current = b & 0xFF;
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
Thu Mar 19 12:45:02 2026 (r1932391)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
Thu Mar 19 16:39:50 2026 (r1932392)
@@ -1840,6 +1840,7 @@ public class COSParser extends BaseParse
{
int ch1 = source.read();
int ch2 = source.read();
+
// Prior to PDF v1.2, the # was not a special character. Also,
// it has been observed that various PDF tools do not follow
the
// spec with respect to the # escape, even though they report
@@ -1884,7 +1885,7 @@ public class COSParser extends BaseParse
source.rewind(1);
}
- return COSName.getPDFName(decodeBuffer(buffer));
+ return COSName.getPDFName(buffer.toByteArray());
}
private static boolean isHexDigit(char ch)
Modified:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSName.java
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSName.java
Thu Mar 19 12:45:02 2026 (r1932391)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/cos/TestCOSName.java
Thu Mar 19 16:39:50 2026 (r1932392)
@@ -20,15 +20,19 @@ import static org.junit.jupiter.api.Asse
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.junit.jupiter.api.Test;
class TestCOSName
{
+ private static final File TARGETPDFDIR = new File("target/pdfs");
+
/**
* PDFBOX-4076: Check that characters outside of US_ASCII are not replaced
with "?".
*
@@ -56,4 +60,36 @@ class TestCOSName
}
}
+ /**
+ * PDFBOX-6178: Check that a name byte outside of US_ASCII (0xE4) is written
+ * as the hex escape "#E4".
+ *
+ * @throws IOException
+ */
+ @Test
+ void PDFBox6178() throws IOException
+ {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+
+ try (PDDocument document = Loader.loadPDF(new
File(TARGETPDFDIR,"PDFBOX-6178.pdf"))) {
+ PDField field = document.getDocumentCatalog()
+ .getAcroForm(null)
+ .getField("Geschlecht");
+
+ field.setValue("männlich");
+
+ field.getWidgets()
+ .get(0).getAppearance().getNormalAppearance().getCOSObject()
+ .keySet().forEach(k -> {
+ try {
+ k.writePDF(baos);
+ } catch (IOException e) {
+ // ignored
+ }
+ });
+
+ String writtenKeys = new String(baos.toByteArray(), "UTF-8");
+ assertTrue(writtenKeys.contains("/m#E4nnlich"), "Output should be
/m#e4nnlich (with 0xE4 as hex escape)");
+ System.out.println(writtenKeys);
+ }
+ }
}