[plc4x] branch develop updated: Fix reading UTF-8 strings (from OPC UA nodes) (#832)

sruehl Mon, 20 Mar 2023 06:43:23 -0700

This is an automated email from the ASF dual-hosted git repository.

sruehl pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/plc4x.git



The following commit(s) were added to refs/heads/develop by this push:
     new 93341ca76c Fix reading UTF-8 strings (from OPC UA nodes) (#832)
93341ca76c is described below

commit 93341ca76c503705c737ecc86675db39ac1db26e
Author: Marc Aurel Fritz <[email protected]>
AuthorDate: Mon Mar 20 14:43:10 2023 +0100

    Fix reading UTF-8 strings (from OPC UA nodes) (#832)
    
    * plc4j/spi: Fix readString() on non-ascii encodings
    
    The previous logic could result in a `StringIndexOutOfBoundsException`
    upon calling `substring()`.
    The reason is that the calculated `realLength` is counted in bytes,
    whereas the length of the created string is counted in characters.
    In UTF encodings, several bytes can encode a single character, so
    `realLength` can exceed the length of the string and cause the
    `substring()` call to fail.
    
    Fix this by applying `realLength` to the byte-array instead of the
    string. This is done by utilizing a different String constructor.
    
    Note that in the UTF-16 cases `realLength` needs to be incremented
    by 2 to account for the two bytes added.
    
    This commit contains a regression test that fails without these
    adaptations.
    
    Signed-off-by: Marc Aurel Fritz <[email protected]>
    
    * plc4j/spi: Always convert encoding to upper case at readString()
    
    Previously, the inner switch statement in the UTF-16 cases did not
    convert the encoding to upper case, which prevented matching from
    working properly.
    Convert the encoding once, directly at the beginning of
    readString(), in order to prevent future bugs.
    
    Signed-off-by: Marc Aurel Fritz <[email protected]>
    
    * plc4j/spi: Merge ASCII and UTF-8 cases on readString()
    
    Prevent having the same logic twice.
    
    Signed-off-by: Marc Aurel Fritz <[email protected]>
    
    ---------
    
    Signed-off-by: Marc Aurel Fritz <[email protected]>
---
 .../java/spi/generation/ReadBufferByteBased.java   | 37 ++++++++--------------
 .../plc4x/java/spi/generation/ReadBufferTest.java  | 13 ++++++++
 2 files changed, 27 insertions(+), 23 deletions(-)

diff --git 
a/plc4j/spi/src/main/java/org/apache/plc4x/java/spi/generation/ReadBufferByteBased.java
 
b/plc4j/spi/src/main/java/org/apache/plc4x/java/spi/generation/ReadBufferByteBased.java
index 2904705ceb..011d905061 100644
--- 
a/plc4j/spi/src/main/java/org/apache/plc4x/java/spi/generation/ReadBufferByteBased.java
+++ 
b/plc4j/spi/src/main/java/org/apache/plc4x/java/spi/generation/ReadBufferByteBased.java
@@ -473,26 +473,9 @@ public class ReadBufferByteBased implements ReadBuffer, 
BufferCommons {
     public String readString(String logicalName, int bitLength, 
WithReaderArgs... readerArgs) throws ParseException {
         String encoding = extractEncoding(readerArgs).orElse("UTF-8");
         encoding = encoding.replaceAll("[^a-zA-Z0-9]", "");
-        switch (encoding.toUpperCase()) {
-            case "ASCII": {
-                byte[] strBytes = new byte[bitLength / 8];
-                int realLength = 0;
-                boolean finishedReading = false;
-                for (int i = 0; (i < (bitLength / 8)) && hasMore(8); i++) {
-                    try {
-                        byte b = readByte(logicalName);
-                        if (!disable0Termination() && (b == 0x00)) {
-                            finishedReading = true;
-                        } else if (!finishedReading) {
-                            strBytes[i] = b;
-                            realLength++;
-                        }
-                    } catch (Exception e) {
-                        throw new PlcRuntimeException(e);
-                    }
-                }
-                return new String(strBytes, 
StandardCharsets.US_ASCII).substring(0, realLength);
-            }
+        encoding = encoding.toUpperCase();
+        switch (encoding) {
+            case "ASCII":
             case "UTF8": {
                 byte[] strBytes = new byte[bitLength / 8];
                 int realLength = 0;
@@ -510,7 +493,15 @@ public class ReadBufferByteBased implements ReadBuffer, 
BufferCommons {
                         throw new PlcRuntimeException(e);
                     }
                 }
-                return new String(strBytes, 
StandardCharsets.UTF_8).substring(0, realLength);
+                Charset charset;
+                switch (encoding) {
+                    case "UTF8":
+                        charset = StandardCharsets.UTF_8;
+                        break;
+                    default:
+                        charset = StandardCharsets.US_ASCII;
+                }
+                return new String(strBytes, 0, realLength, charset);
             }
             case "UTF16":
             case "UTF16LE":
@@ -527,7 +518,7 @@ public class ReadBufferByteBased implements ReadBuffer, 
BufferCommons {
                         } else if (!finishedReading) {
                             strBytes[(i * 2)] = b1;
                             strBytes[(i * 2) + 1] = b2;
-                            realLength++;
+                            realLength += 2;
                         }
                     } catch (Exception e) {
                         throw new PlcRuntimeException(e);
@@ -544,7 +535,7 @@ public class ReadBufferByteBased implements ReadBuffer, 
BufferCommons {
                     default:
                         charset = StandardCharsets.UTF_16;
                 }
-                return new String(strBytes, charset).substring(0, realLength);
+                return new String(strBytes, 0, realLength, charset);
             }
             default:
                 throw new ParseException("Unsupported encoding: " + encoding);
diff --git 
a/plc4j/spi/src/test/java/org/apache/plc4x/java/spi/generation/ReadBufferTest.java
 
b/plc4j/spi/src/test/java/org/apache/plc4x/java/spi/generation/ReadBufferTest.java
index 798f2ee220..841e67cee4 100644
--- 
a/plc4j/spi/src/test/java/org/apache/plc4x/java/spi/generation/ReadBufferTest.java
+++ 
b/plc4j/spi/src/test/java/org/apache/plc4x/java/spi/generation/ReadBufferTest.java
@@ -38,4 +38,17 @@ class ReadBufferTest {
 
         assertEquals(value, answer);
     }
+
+    /**
+     * Test which makes sure that UTF8 encoding with multi-byte characters 
works
+     */
+    @Test
+    void readStringUtf8() throws ParseException {
+        String value = "molybdän";
+        final var serialized = value.getBytes(StandardCharsets.UTF_8);
+        final ReadBuffer buffer = new ReadBufferByteBased(serialized);
+        String answer = buffer.readString("", serialized.length * 8, 
WithOption.WithEncoding(StandardCharsets.UTF_8.name()));
+
+        assertEquals(value, answer);
+    }
 }
\ No newline at end of file

[plc4x] branch develop updated: Fix reading UTF-8 strings (from OPC UA nodes) (#832)

Reply via email to