This is an automated email from the ASF dual-hosted git repository.

zhaoqingran pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hertzbeat.git


The following commit(s) were added to refs/heads/master by this push:
     new 899e92882 [refactor]Refactoring prometheus label value utf8 support 
(#3810)
899e92882 is described below

commit 899e92882d97a17b5cdff0b1e6a776fe6495bf7b
Author: Duansg <[email protected]>
AuthorDate: Sat Oct 11 11:31:09 2025 +0800

    [refactor]Refactoring prometheus label value utf8 support (#3810)
---
 .../collect/prometheus/parser/OnlineParser.java    | 163 ++++++++++++++-------
 .../prometheus/parser/OnlineParserSingleTest.java  | 130 ++++++++++++++++
 2 files changed, 242 insertions(+), 51 deletions(-)

diff --git 
a/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParser.java
 
b/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParser.java
index bed4eb299..de48f5965 100644
--- 
a/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParser.java
+++ 
b/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParser.java
@@ -23,11 +23,13 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.math.BigDecimal;
 import java.nio.charset.StandardCharsets;
+import java.util.Optional;
 import java.util.concurrent.ConcurrentHashMap;
 
 
@@ -41,6 +43,7 @@ public class OnlineParser {
 
     private static final char RIGHT_BRACKET = '}';
     private static final char LEFT_BRACKET = '{';
+    private static final char UTF8_REPLACEMENT_CHARACTER = '\uFFFD';
 
     static {
         escapeMap.put((int) 'n', (int) '\n');
@@ -362,37 +365,123 @@ public class OnlineParser {
         return new CharChecker(i);
     }
 
+    /**
+     * Handles multi-byte UTF-8 character parsing from input stream.
+     * Reads additional bytes based on the first byte and validates the UTF-8 
sequence.
+     * Appends the decoded character to the string builder or replacement 
character if invalid.
+     *
+     * @param firstByte the first byte of the UTF-8 character sequence
+     * @param inputStream the input stream to read additional bytes from
+     * @param stringBuilder the string builder to append the decoded character 
to
+     * @throws IOException if an I/O error occurs while reading from the input 
stream
+     */
     private static void handleUtf8Character(int firstByte, InputStream 
inputStream, StringBuilder stringBuilder) throws IOException {
-        List<Integer> bytes = new ArrayList<>();
-        bytes.add(firstByte);
-
-        int additionalBytes = getUtf8AdditionalByteCount(firstByte);
-
-        for (int j = 0; j < additionalBytes; j++) {
-            int nextByte = inputStream.read();
-            if (nextByte == -1) break;
-            bytes.add(nextByte);
+        byte[] byteArray = new byte[4];
+        byteArray[0] = (byte) firstByte;
+        int additionalBytes = calculateUtf8ContinuationBytes(firstByte);
+        if (additionalBytes == -1) {
+            appendInvalidCharacters(stringBuilder);
+            return;
         }
+        int totalBytes = 1;
 
-        byte[] byteArray = new byte[bytes.size()];
-        for (int j = 0; j < bytes.size(); j++) {
-            byteArray[j] = (byte) bytes.get(j).intValue();
+        for (int i = 0; i < additionalBytes; i++) {
+            int nextByte = inputStream.read();
+            if (nextByte == -1) {
+                appendInvalidCharacters(stringBuilder);
+                return;
+            }
+            // Verify that the continuation byte matches the 10xxxxxx pattern
+            if ((nextByte & 0xC0) != 0x80) {
+                appendInvalidCharacters(stringBuilder);
+                return;
+            }
+            byteArray[i + 1] = (byte) nextByte;
+            totalBytes++;
         }
-
         try {
-            String utf8Chars = new String(byteArray, StandardCharsets.UTF_8);
+            // todo: If stricter UTF-8 semantic validation is 
required, strengthen the boundary checks here.
+            String utf8Chars = new String(byteArray, 0, totalBytes, 
StandardCharsets.UTF_8);
             stringBuilder.append(utf8Chars);
         } catch (Exception e) {
-            stringBuilder.append((char) firstByte);
+            log.debug("Invalid UTF-8 sequence detected at firstByte: {}", 
Integer.toHexString(firstByte));
+            appendInvalidCharacters(stringBuilder);
         }
     }
 
-    private static int getUtf8AdditionalByteCount(int firstByte) {
-        if ((firstByte & 0x80) == 0) return 0; // 0xxxxxxx - ASCII (shouldn't 
reach here)
-        if ((firstByte & 0xE0) == 0xC0) return 1; // 110xxxxx - 2 bytes total, 
1 additional
-        if ((firstByte & 0xF0) == 0xE0) return 2; // 1110xxxx - 3 bytes total, 
2 additional
-        if ((firstByte & 0xF8) == 0xF0) return 3; // 11110xxx - 4 bytes total, 
3 additional
-        return 0;
+    /**
+     * Appends the UTF-8 replacement character (\uFFFD) to the StringBuilder.
+     * This method is used to append the replacement character to the string 
builder
+     * when invalid UTF-8 byte sequences are encountered during parsing.
+     * The replacement character (U+FFFD) is a special Unicode character used 
to
+     * represent characters that cannot be decoded properly.
+     *
+     * @param stringBuilder the string builder to append to, no operation if 
null
+     */
+    private static void appendInvalidCharacters(StringBuilder stringBuilder) {
+        Optional.ofNullable(stringBuilder).ifPresent(t -> 
t.append(UTF8_REPLACEMENT_CHARACTER));
+    }
+
+    /**
+     * Checks if a label value contains invalid characters.
+     * This method is used to validate the validity of Prometheus label values.
+     * If the label value contains the UTF-8 replacement character (\uFFFD),
+     * it is considered invalid because the replacement character indicates 
that
+     * byte sequences that could not be properly decoded were encountered 
during parsing.
+     * 
+     * Because U+FFFD is this parser's marker for undecodable bytes, a label
+     * value that genuinely contains U+FFFD in valid UTF-8 form is reported 
as invalid too.
+     *
+     * @param labelValue the label value to check
+     * @return true if the label value is not blank and contains UTF-8 
replacement character, false otherwise
+     */
+    private static boolean isInvalidLabelValue(String labelValue) {
+        return StringUtils.isNotBlank(labelValue) && 
labelValue.contains(String.valueOf(UTF8_REPLACEMENT_CHARACTER));
+    }
+
+    /**
+     * Calculates the number of continuation bytes needed for a UTF-8 
character.
+     * This method analyzes the first byte of a UTF-8 encoding to determine 
how many
+     * continuation bytes (10xxxxxx pattern) are required to complete the 
character.
+     * 
+     * UTF-8 encoding rules:
+     * - 1-byte character: 0xxxxxxx (ASCII characters, this method won't be 
called)
+     * - 2-byte character: 110xxxxx 10xxxxxx (returns 1 continuation byte)
+     * - 3-byte character: 1110xxxx 10xxxxxx 10xxxxxx (returns 2 continuation 
bytes)
+     * - 4-byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (returns 3 
continuation bytes)
+     * 
+     * Also rejects invalid first bytes (returns -1):
+     * - 0x80-0xBF (stray continuation byte), 0xC0 and 0xC1 (overlong encoding)
+     * - 0xF5-0xFF: out of Unicode range, invalid
+     *
+     * Note: A basic Overlong Encoding check has been added here.
+     * If stricter and more comprehensive validation is required later,
+     * this method should be separated out to serve solely as a check for the 
first byte.
+     *
+     * @param firstByte the first byte of the UTF-8 character sequence
+     * @return the number of continuation bytes needed, or -1 if the first 
byte is invalid
+     *         - returns 1: 2-byte character (needs 1 continuation byte)
+     *         - returns 2: 3-byte character (needs 2 continuation bytes)
+     *         - returns 3: 4-byte character (needs 3 continuation bytes)
+     *         - returns -1: invalid first byte
+     */
+    private static int calculateUtf8ContinuationBytes(int firstByte) {
+        if ((firstByte & 0xE0) == 0xC0) {
+            if (firstByte <= 0xC1) {
+                return -1;
+            }
+            return 1;
+        }
+        if ((firstByte & 0xF0) == 0xE0) {
+            return 2;
+        }
+        if ((firstByte & 0xF8) == 0xF0) {
+            if (firstByte >= 0xF5) {
+                return -1;
+            }
+            return 3;
+        }
+        return -1;
     }
 
     private static CharChecker skipSpaces(InputStream inputStream) throws 
IOException, FormatException {
@@ -451,10 +540,10 @@ public class OnlineParser {
         skipSpaces(inputStream).maybeQuotationMark().noElse();
         parseLabelValue(inputStream, 
stringBuilder).maybeQuotationMark().noElse();
         String labelValue = stringBuilder.toString();
-        if (!isValidLabelValue(labelValue)) {
+        if (isInvalidLabelValue(labelValue)) {
+            log.error("Invalid UTF-8 sequence detected at labelValue.");
             throw new FormatException();
         }
-
         label.setValue(labelValue);
         stringBuilder.delete(0, stringBuilder.length());
         labelList.add(label);
@@ -527,32 +616,4 @@ public class OnlineParser {
         metricFamily.getMetricList().add(metric);
         return new CharChecker(i);
     }
-
-    private static boolean isValidLabelValue(String labelValue) {
-        if (labelValue == null) {
-            return false;
-        }
-
-        //Check if all characters are ASCII (0-127)
-        boolean isAscii = true;
-        for (int i = 0; i < labelValue.length(); i++) {
-            char c = labelValue.charAt(i);
-            if (c > 127) {
-                isAscii = false;
-                break;
-            }
-        }
-
-        if (isAscii) {
-            return true;
-        }
-
-        try {
-            byte[] bytes = labelValue.getBytes(StandardCharsets.UTF_8);
-            String reconstructed = new String(bytes, StandardCharsets.UTF_8);
-            return labelValue.equals(reconstructed);
-        } catch (Exception e) {
-            return false;
-        }
-    }
 }
diff --git 
a/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParserSingleTest.java
 
b/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParserSingleTest.java
index 3cc57bacc..a563ffa28 100644
--- 
a/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParserSingleTest.java
+++ 
b/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParserSingleTest.java
@@ -29,6 +29,7 @@ import java.util.Map;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
 
 /**
  * Parse a Single metric in prometheus test
@@ -468,4 +469,133 @@ public class OnlineParserSingleTest {
     private Map<String, MetricFamily> parseMetrics(InputStream inputStream, 
String metric) throws IOException {
         return OnlineParser.parseMetrics(inputStream, metric);
     }
+
+    @Test
+    void testHandleUtf8Character_ValidTwoByteSequence() throws Exception {
+        // Test valid 2-byte UTF-8 sequence: é (U+00E9) = 0xC3 0xA9
+        String str = "test_metric{label=\"caf\u00e9\"} 1\n";
+        InputStream inputStream = new 
ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8));
+        Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream, 
"test_metric");
+
+        assertNotNull(metricFamilyMap);
+        MetricFamily metricFamily = metricFamilyMap.get("test_metric");
+        assertNotNull(metricFamily);
+        assertEquals(1, metricFamily.getMetricList().size());
+        
+        MetricFamily.Label label = 
metricFamily.getMetricList().get(0).getLabels().get(0);
+        assertEquals("label", label.getName());
+        assertEquals("café", label.getValue());
+    }
+
+    @Test
+    void testHandleUtf8Character_ValidThreeByteSequence() throws Exception {
+        // Test valid 3-byte UTF-8 sequence: Chinese character (U+4E2D) = 0xE4 
0xB8 0xAD
+        String str = "test_metric{label=\"\u4e2d\u6587\"} 1\n";
+        InputStream inputStream = new 
ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8));
+        Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream, 
"test_metric");
+
+        assertNotNull(metricFamilyMap);
+        MetricFamily metricFamily = metricFamilyMap.get("test_metric");
+        assertNotNull(metricFamily);
+        assertEquals(1, metricFamily.getMetricList().size());
+        
+        MetricFamily.Label label = 
metricFamily.getMetricList().get(0).getLabels().get(0);
+        assertEquals("label", label.getName());
+        assertEquals("\u4e2d\u6587", label.getValue());
+    }
+
+    @Test
+    void testHandleUtf8Character_ValidFourByteSequence() throws Exception {
+        // Test valid 4-byte UTF-8 sequence: 𝕳 (U+1D573) = 0xF0 0x9D 0x95 0xB3
+        String str = "test_metric{label=\"\uD835\uDD73ello\"} 1\n";
+        InputStream inputStream = new 
ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8));
+        Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream, 
"test_metric");
+
+        assertNotNull(metricFamilyMap);
+        MetricFamily metricFamily = metricFamilyMap.get("test_metric");
+        assertNotNull(metricFamily);
+        assertEquals(1, metricFamily.getMetricList().size());
+        
+        MetricFamily.Label label = 
metricFamily.getMetricList().get(0).getLabels().get(0);
+        assertEquals("label", label.getName());
+        assertEquals("𝕳ello", label.getValue());
+    }
+
+    @Test
+    void testHandleUtf8Character_InvalidFirstByte_0xC0() throws Exception {
+        // Test invalid first byte 0xC0 (overlong encoding) - should cause 
parsing failure
+        byte[] invalidBytes = {
+            't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l', 
'a', 'b', 'e', 'l', '=', '"',
+            (byte) 0xC0, (byte) 0x80,
+            '"', '}', ' ', '1', '\n'
+        };
+        InputStream inputStream = new ByteArrayInputStream(invalidBytes);
+        Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream, 
"test_metric");
+        assertNull(metricFamilyMap);
+    }
+
+    @Test
+    void testHandleUtf8Character_InvalidFirstByte_0xC1() throws Exception {
+        // Test invalid first byte 0xC1 (overlong encoding) - should cause 
parsing failure
+        byte[] invalidBytes = {
+            't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l', 
'a', 'b', 'e', 'l', '=', '"',
+            (byte) 0xC1, (byte) 0x80,
+            '"', '}', ' ', '1', '\n'
+        };
+        InputStream inputStream = new ByteArrayInputStream(invalidBytes);
+        Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream, 
"test_metric");
+        assertNull(metricFamilyMap);
+    }
+
+    @Test
+    void testHandleUtf8Character_InvalidFirstByte_0xF5() throws Exception {
+        // Test invalid first byte 0xF5 (out of range) - should cause parsing 
failure
+        byte[] invalidBytes = {
+            't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l', 
'a', 'b', 'e', 'l', '=', '"',
+            (byte) 0xF5, (byte) 0x80, (byte) 0x80, (byte) 0x80,
+            '"', '}', ' ', '1', '\n'
+        };
+        InputStream inputStream = new ByteArrayInputStream(invalidBytes);
+        Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream, 
"test_metric");
+        assertNull(metricFamilyMap);
+    }
+
+    @Test
+    void testHandleUtf8Character_InvalidContinuationByte() throws Exception {
+        // Test invalid continuation byte (should be 10xxxxxx but is 11xxxxxx) 
- should cause parsing failure
+        byte[] invalidBytes = {
+            't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l', 
'a', 'b', 'e', 'l', '=', '"',
+            (byte) 0xC3, (byte) 0xC0,
+            '"', '}', ' ', '1', '\n'
+        };
+        InputStream inputStream = new ByteArrayInputStream(invalidBytes);
+        Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream, 
"test_metric");
+        assertNull(metricFamilyMap);
+    }
+
+    @Test
+    void testHandleUtf8Character_MixedValidInvalid() throws Exception {
+        // Test a mix of valid 2-, 3- and 4-byte UTF-8 sequences (invalid 
sequences are omitted because they cause parsing to fail)
+        byte[] mixedBytes = {
+            't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l', 
'a', 'b', 'e', 'l', '=', '"',
+            'H', 'e', 'l', 'l', 'o',
+            (byte) 0xC3, (byte) 0xA9,
+            (byte) 0xE4, (byte) 0xB8, (byte) 0xAD,
+            (byte) 0xF0, (byte) 0x9F, (byte) 0x98, (byte) 0x80,
+            '"', '}', ' ', '1', '\n'
+        };
+        InputStream inputStream = new ByteArrayInputStream(mixedBytes);
+        Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream, 
"test_metric");
+
+        assertNotNull(metricFamilyMap);
+        MetricFamily metricFamily = metricFamilyMap.get("test_metric");
+        assertNotNull(metricFamily);
+        assertEquals(1, metricFamily.getMetricList().size());
+        
+        MetricFamily.Label label = 
metricFamily.getMetricList().get(0).getLabels().get(0);
+        assertEquals("label", label.getName());
+        String value = label.getValue();
+        assertNotNull(value);
+        assertEquals("Hello\u00e9\u4e2d\ud83d\ude00", value);
+    }
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to