Author: tilman
Date: Sun Jan 26 13:43:43 2025
New Revision: 1923374

URL: http://svn.apache.org/viewvc?rev=1923374&view=rev
Log:
PDFBOX-5747: fix combining diacritics, by Richard Eckart de Castilho

Added:
    
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf
   (with props)
    
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
   (with props)
    
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
   (with props)
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java

Modified: 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java?rev=1923374&r1=1923373&r2=1923374&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java Sun Jan 26 13:43:43 2025
@@ -759,16 +759,26 @@ public final class TextPosition
         float[] widths2 = new float[widths.length + 1];
         System.arraycopy(widths, 0, widths2, 0, i);
 
+        // First we add a zero-width entry for the diacritic in the widths array
+        widths2[i] = widths[i];
+        widths2[i + 1] = 0;
+        System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);
+
         // Unicode combining diacritics always go after the base character, regardless of whether
         // the string is in presentation order or logical order
         sb.append(unicode.charAt(i));
-        widths2[i] = widths[i];
+
+        // If a surrogate starts at the current position, make sure we preserve it
+        if (i < unicode.length() - 1 && Character.isSurrogatePair(unicode.charAt(i), unicode.charAt(i + 1)))
+        {
+            sb.append(unicode.charAt(i + 1));
+            i++;
+        }
+
         sb.append(combineDiacritic(diacritic.getUnicode()));
-        widths2[i + 1] = 0;
 
         // get the rest of the string
         sb.append(unicode.substring(i + 1));
-        System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);
 
         unicode = sb.toString();
         widths = widths2;

Added: 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf?rev=1923374&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/pdf

Added: 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt?rev=1923374&view=auto
==============================================================================
--- 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
 (added)
+++ 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
 Sun Jan 26 13:43:43 2025
@@ -0,0 +1 @@
+𝑋̂

Propchange: 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf-sorted.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt?rev=1923374&view=auto
==============================================================================
--- 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
 (added)
+++ 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
 Sun Jan 26 13:43:43 2025
@@ -0,0 +1 @@
+𝑋̂

Propchange: 
pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic-reduced.pdf.txt
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to