Author: tilman
Date: Sun Jan 26 13:43:43 2025
New Revision: 1923374
URL: http://svn.apache.org/viewvc?rev=1923374&view=rev
Log:
PDFBOX-5747: fix combining diacritics, by Richard Eckart de Castilho
Added:
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf
(with props)
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
(with props)
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
(with props)
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java?rev=1923374&r1=1923373&r2=1923374&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
(original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
Sun Jan 26 13:43:43 2025
@@ -759,16 +759,26 @@ public final class TextPosition
float[] widths2 = new float[widths.length + 1];
System.arraycopy(widths, 0, widths2, 0, i);
+ // First we add a zero-width entry for the diacritic in the widths array
+ widths2[i] = widths[i];
+ widths2[i + 1] = 0;
+ System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);
+
// Unicode combining diacritics always go after the base character, regardless of whether
// the string is in presentation order or logical order
sb.append(unicode.charAt(i));
- widths2[i] = widths[i];
+
+ // If a surrogate starts at the current position, make sure we preserve it
+ if (i < unicode.length() - 1 &&
Character.isSurrogatePair(unicode.charAt(i), unicode.charAt(i + 1)))
+ {
+ sb.append(unicode.charAt(i + 1));
+ i++;
+ }
+
sb.append(combineDiacritic(diacritic.getUnicode()));
- widths2[i + 1] = 0;
// get the rest of the string
sb.append(unicode.substring(i + 1));
- System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);
unicode = sb.toString();
widths = widths2;
Added:
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf?rev=1923374&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf
Added:
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt?rev=1923374&view=auto
==============================================================================
---
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
(added)
+++
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
Sun Jan 26 13:43:43 2025
@@ -0,0 +1 @@
+ðÌ
Propchange:
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added:
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt?rev=1923374&view=auto
==============================================================================
---
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
(added)
+++
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
Sun Jan 26 13:43:43 2025
@@ -0,0 +1 @@
+ðÌ
Propchange:
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
------------------------------------------------------------------------------
svn:eol-style = native