This is an automated email from the ASF dual-hosted git repository.
zhaoqingran pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hertzbeat.git
The following commit(s) were added to refs/heads/master by this push:
new 899e92882 [refactor]Refactoring prometheus label value utf8 support
(#3810)
899e92882 is described below
commit 899e92882d97a17b5cdff0b1e6a776fe6495bf7b
Author: Duansg <[email protected]>
AuthorDate: Sat Oct 11 11:31:09 2025 +0800
[refactor]Refactoring prometheus label value utf8 support (#3810)
---
.../collect/prometheus/parser/OnlineParser.java | 163 ++++++++++++++-------
.../prometheus/parser/OnlineParserSingleTest.java | 130 ++++++++++++++++
2 files changed, 242 insertions(+), 51 deletions(-)
diff --git
a/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParser.java
b/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParser.java
index bed4eb299..de48f5965 100644
---
a/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParser.java
+++
b/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParser.java
@@ -23,11 +23,13 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
+import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
@@ -41,6 +43,7 @@ public class OnlineParser {
private static final char RIGHT_BRACKET = '}';
private static final char LEFT_BRACKET = '{';
+ private static final char UTF8_REPLACEMENT_CHARACTER = '\uFFFD';
static {
escapeMap.put((int) 'n', (int) '\n');
@@ -362,37 +365,123 @@ public class OnlineParser {
return new CharChecker(i);
}
+ /**
+ * Handles multi-byte UTF-8 character parsing from input stream.
+ * Reads additional bytes based on the first byte and validates the UTF-8
sequence.
+ * Appends the decoded character to the string builder or replacement
character if invalid.
+ *
+ * @param firstByte the first byte of the UTF-8 character sequence
+ * @param inputStream the input stream to read additional bytes from
+ * @param stringBuilder the string builder to append the decoded character
to
+ * @throws IOException if an I/O error occurs while reading from the input
stream
+ */
private static void handleUtf8Character(int firstByte, InputStream
inputStream, StringBuilder stringBuilder) throws IOException {
- List<Integer> bytes = new ArrayList<>();
- bytes.add(firstByte);
-
- int additionalBytes = getUtf8AdditionalByteCount(firstByte);
-
- for (int j = 0; j < additionalBytes; j++) {
- int nextByte = inputStream.read();
- if (nextByte == -1) break;
- bytes.add(nextByte);
+ byte[] byteArray = new byte[4];
+ byteArray[0] = (byte) firstByte;
+ int additionalBytes = calculateUtf8ContinuationBytes(firstByte);
+ if (additionalBytes == -1) {
+ appendInvalidCharacters(stringBuilder);
+ return;
}
+ int totalBytes = 1;
- byte[] byteArray = new byte[bytes.size()];
- for (int j = 0; j < bytes.size(); j++) {
- byteArray[j] = (byte) bytes.get(j).intValue();
+ for (int i = 0; i < additionalBytes; i++) {
+ int nextByte = inputStream.read();
+ if (nextByte == -1) {
+ appendInvalidCharacters(stringBuilder);
+ return;
+ }
+ // Verify that each continuation byte has the form 10xxxxxx
+ if ((nextByte & 0xC0) != 0x80) {
+ appendInvalidCharacters(stringBuilder);
+ return;
+ }
+ byteArray[i + 1] = (byte) nextByte;
+ totalBytes++;
}
-
try {
- String utf8Chars = new String(byteArray, StandardCharsets.UTF_8);
+ // TODO: if stricter UTF-8 semantic validation is required, strengthen the
boundary checks here.
+ String utf8Chars = new String(byteArray, 0, totalBytes,
StandardCharsets.UTF_8);
stringBuilder.append(utf8Chars);
} catch (Exception e) {
- stringBuilder.append((char) firstByte);
+ log.debug("Invalid UTF-8 sequence detected at firstByte: {}",
Integer.toHexString(firstByte));
+ appendInvalidCharacters(stringBuilder);
}
}
- private static int getUtf8AdditionalByteCount(int firstByte) {
- if ((firstByte & 0x80) == 0) return 0; // 0xxxxxxx - ASCII (shouldn't
reach here)
- if ((firstByte & 0xE0) == 0xC0) return 1; // 110xxxxx - 2 bytes total,
1 additional
- if ((firstByte & 0xF0) == 0xE0) return 2; // 1110xxxx - 3 bytes total,
2 additional
- if ((firstByte & 0xF8) == 0xF0) return 3; // 11110xxx - 4 bytes total,
3 additional
- return 0;
+ /**
+ * Appends the UTF-8 replacement character (\uFFFD) to the StringBuilder.
+ * This method is used to append the replacement character to the string
builder
+ * when invalid UTF-8 byte sequences are encountered during parsing.
+ * The replacement character (U+FFFD) is a special Unicode character used
to
+ * represent characters that cannot be decoded properly.
+ *
+ * @param stringBuilder the string builder to append to, no operation if
null
+ */
+ private static void appendInvalidCharacters(StringBuilder stringBuilder) {
+ Optional.ofNullable(stringBuilder).ifPresent(t ->
t.append(UTF8_REPLACEMENT_CHARACTER));
+ }
+
+ /**
+ * Checks if a label value contains invalid characters.
+ * This method is used to validate the validity of Prometheus label values.
+ * If the label value contains the UTF-8 replacement character (\uFFFD),
+ * it is considered invalid because the replacement character indicates
that
+ * byte sequences that could not be properly decoded were encountered
during parsing.
+ *
+ * According to Prometheus specifications, label values should not contain
+ * replacement characters as this would cause data inconsistency and query
issues.
+ *
+ * @param labelValue the label value to check
+ * @return true if the label value is not blank and contains UTF-8
replacement character, false otherwise
+ */
+ private static boolean isInvalidLabelValue(String labelValue) {
+ return StringUtils.isNotBlank(labelValue) &&
labelValue.contains(String.valueOf(UTF8_REPLACEMENT_CHARACTER));
+ }
+
+ /**
+ * Calculates the number of continuation bytes needed for a UTF-8
character.
+ * This method analyzes the first byte of a UTF-8 encoding to determine
how many
+ * continuation bytes (10xxxxxx pattern) are required to complete the
character.
+ *
+ * UTF-8 encoding rules:
+ * - 1-byte character: 0xxxxxxx (ASCII characters, this method won't be
called)
+ * - 2-byte character: 110xxxxx 10xxxxxx (returns 1 continuation byte)
+ * - 3-byte character: 1110xxxx 10xxxxxx 10xxxxxx (returns 2 continuation
bytes)
+ * - 4-byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (returns 3
continuation bytes)
+ *
+ * Also validates byte sequences:
+ * - 0xC0 and 0xC1: overlong encoding, invalid
+ * - 0xF5-0xFF: out of Unicode range, invalid
+ *
+ * Note: A basic Overlong Encoding check has been added here.
+ * If stricter and more comprehensive validation is required later,
+ * this method should be separated out to serve solely as a check for the
first character.
+ *
+ * @param firstByte the first byte of the UTF-8 character sequence
+ * @return the number of continuation bytes needed, or -1 if the first
byte is invalid
+ * - returns 1: 2-byte character (needs 1 continuation byte)
+ * - returns 2: 3-byte character (needs 2 continuation bytes)
+ * - returns 3: 4-byte character (needs 3 continuation bytes)
+ * - returns -1: invalid first byte
+ */
+ private static int calculateUtf8ContinuationBytes(int firstByte) {
+ if ((firstByte & 0xE0) == 0xC0) {
+ if (firstByte <= 0xC1) {
+ return -1;
+ }
+ return 1;
+ }
+ if ((firstByte & 0xF0) == 0xE0) {
+ return 2;
+ }
+ if ((firstByte & 0xF8) == 0xF0) {
+ if (firstByte >= 0xF5) {
+ return -1;
+ }
+ return 3;
+ }
+ return -1;
}
private static CharChecker skipSpaces(InputStream inputStream) throws
IOException, FormatException {
@@ -451,10 +540,10 @@ public class OnlineParser {
skipSpaces(inputStream).maybeQuotationMark().noElse();
parseLabelValue(inputStream,
stringBuilder).maybeQuotationMark().noElse();
String labelValue = stringBuilder.toString();
- if (!isValidLabelValue(labelValue)) {
+ if (isInvalidLabelValue(labelValue)) {
+ log.error("Invalid UTF-8 sequence detected at labelValue.");
throw new FormatException();
}
-
label.setValue(labelValue);
stringBuilder.delete(0, stringBuilder.length());
labelList.add(label);
@@ -527,32 +616,4 @@ public class OnlineParser {
metricFamily.getMetricList().add(metric);
return new CharChecker(i);
}
-
- private static boolean isValidLabelValue(String labelValue) {
- if (labelValue == null) {
- return false;
- }
-
- //Check if all characters are ASCII (0-127)
- boolean isAscii = true;
- for (int i = 0; i < labelValue.length(); i++) {
- char c = labelValue.charAt(i);
- if (c > 127) {
- isAscii = false;
- break;
- }
- }
-
- if (isAscii) {
- return true;
- }
-
- try {
- byte[] bytes = labelValue.getBytes(StandardCharsets.UTF_8);
- String reconstructed = new String(bytes, StandardCharsets.UTF_8);
- return labelValue.equals(reconstructed);
- } catch (Exception e) {
- return false;
- }
- }
}
diff --git
a/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParserSingleTest.java
b/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParserSingleTest.java
index 3cc57bacc..a563ffa28 100644
---
a/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParserSingleTest.java
+++
b/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParserSingleTest.java
@@ -29,6 +29,7 @@ import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
/**
* Parse a Single metric in prometheus test
@@ -468,4 +469,133 @@ public class OnlineParserSingleTest {
private Map<String, MetricFamily> parseMetrics(InputStream inputStream,
String metric) throws IOException {
return OnlineParser.parseMetrics(inputStream, metric);
}
+
+ @Test
+ void testHandleUtf8Character_ValidTwoByteSequence() throws Exception {
+ // Test valid 2-byte UTF-8 sequence: é (U+00E9) = 0xC3 0xA9
+ String str = "test_metric{label=\"caf\u00e9\"} 1\n";
+ InputStream inputStream = new
ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8));
+ Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream,
"test_metric");
+
+ assertNotNull(metricFamilyMap);
+ MetricFamily metricFamily = metricFamilyMap.get("test_metric");
+ assertNotNull(metricFamily);
+ assertEquals(1, metricFamily.getMetricList().size());
+
+ MetricFamily.Label label =
metricFamily.getMetricList().get(0).getLabels().get(0);
+ assertEquals("label", label.getName());
+ assertEquals("café", label.getValue());
+ }
+
+ @Test
+ void testHandleUtf8Character_ValidThreeByteSequence() throws Exception {
+ // Test valid 3-byte UTF-8 sequence: Chinese character (U+4E2D) = 0xE4
0xB8 0xAD
+ String str = "test_metric{label=\"\u4e2d\u6587\"} 1\n";
+ InputStream inputStream = new
ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8));
+ Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream,
"test_metric");
+
+ assertNotNull(metricFamilyMap);
+ MetricFamily metricFamily = metricFamilyMap.get("test_metric");
+ assertNotNull(metricFamily);
+ assertEquals(1, metricFamily.getMetricList().size());
+
+ MetricFamily.Label label =
metricFamily.getMetricList().get(0).getLabels().get(0);
+ assertEquals("label", label.getName());
+ assertEquals("\u4e2d\u6587", label.getValue());
+ }
+
+ @Test
+ void testHandleUtf8Character_ValidFourByteSequence() throws Exception {
+ // Test valid 4-byte UTF-8 sequence: 𝕳 (U+1D573) = 0xF0 0x9D 0x95 0xB3
+ String str = "test_metric{label=\"\uD835\uDD73ello\"} 1\n";
+ InputStream inputStream = new
ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8));
+ Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream,
"test_metric");
+
+ assertNotNull(metricFamilyMap);
+ MetricFamily metricFamily = metricFamilyMap.get("test_metric");
+ assertNotNull(metricFamily);
+ assertEquals(1, metricFamily.getMetricList().size());
+
+ MetricFamily.Label label =
metricFamily.getMetricList().get(0).getLabels().get(0);
+ assertEquals("label", label.getName());
+ assertEquals("𝕳ello", label.getValue());
+ }
+
+ @Test
+ void testHandleUtf8Character_InvalidFirstByte_0xC0() throws Exception {
+ // Test invalid first byte 0xC0 (overlong encoding) - should cause
parsing failure
+ byte[] invalidBytes = {
+ 't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l',
'a', 'b', 'e', 'l', '=', '"',
+ (byte) 0xC0, (byte) 0x80,
+ '"', '}', ' ', '1', '\n'
+ };
+ InputStream inputStream = new ByteArrayInputStream(invalidBytes);
+ Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream,
"test_metric");
+ assertNull(metricFamilyMap);
+ }
+
+ @Test
+ void testHandleUtf8Character_InvalidFirstByte_0xC1() throws Exception {
+ // Test invalid first byte 0xC1 (overlong encoding) - should cause
parsing failure
+ byte[] invalidBytes = {
+ 't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l',
'a', 'b', 'e', 'l', '=', '"',
+ (byte) 0xC1, (byte) 0x80,
+ '"', '}', ' ', '1', '\n'
+ };
+ InputStream inputStream = new ByteArrayInputStream(invalidBytes);
+ Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream,
"test_metric");
+ assertNull(metricFamilyMap);
+ }
+
+ @Test
+ void testHandleUtf8Character_InvalidFirstByte_0xF5() throws Exception {
+ // Test invalid first byte 0xF5 (out of range) - should cause parsing
failure
+ byte[] invalidBytes = {
+ 't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l',
'a', 'b', 'e', 'l', '=', '"',
+ (byte) 0xF5, (byte) 0x80, (byte) 0x80, (byte) 0x80,
+ '"', '}', ' ', '1', '\n'
+ };
+ InputStream inputStream = new ByteArrayInputStream(invalidBytes);
+ Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream,
"test_metric");
+ assertNull(metricFamilyMap);
+ }
+
+ @Test
+ void testHandleUtf8Character_InvalidContinuationByte() throws Exception {
+ // Test invalid continuation byte (should be 10xxxxxx but is 11xxxxxx)
- should cause parsing failure
+ byte[] invalidBytes = {
+ 't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l',
'a', 'b', 'e', 'l', '=', '"',
+ (byte) 0xC3, (byte) 0xC0,
+ '"', '}', ' ', '1', '\n'
+ };
+ InputStream inputStream = new ByteArrayInputStream(invalidBytes);
+ Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream,
"test_metric");
+ assertNull(metricFamilyMap);
+ }
+
+ @Test
+ void testHandleUtf8Character_MixedValidInvalid() throws Exception {
+ // Test a label value mixing ASCII with valid 2-, 3- and 4-byte UTF-8 sequences
(invalid sequences are excluded because they cause parsing to fail)
+ byte[] mixedBytes = {
+ 't', 'e', 's', 't', '_', 'm', 'e', 't', 'r', 'i', 'c', '{', 'l',
'a', 'b', 'e', 'l', '=', '"',
+ 'H', 'e', 'l', 'l', 'o',
+ (byte) 0xC3, (byte) 0xA9,
+ (byte) 0xE4, (byte) 0xB8, (byte) 0xAD,
+ (byte) 0xF0, (byte) 0x9F, (byte) 0x98, (byte) 0x80,
+ '"', '}', ' ', '1', '\n'
+ };
+ InputStream inputStream = new ByteArrayInputStream(mixedBytes);
+ Map<String, MetricFamily> metricFamilyMap = parseMetrics(inputStream,
"test_metric");
+
+ assertNotNull(metricFamilyMap);
+ MetricFamily metricFamily = metricFamilyMap.get("test_metric");
+ assertNotNull(metricFamily);
+ assertEquals(1, metricFamily.getMetricList().size());
+
+ MetricFamily.Label label =
metricFamily.getMetricList().get(0).getLabels().get(0);
+ assertEquals("label", label.getName());
+ String value = label.getValue();
+ assertNotNull(value);
+ assertEquals("Hello\u00e9\u4e2d\ud83d\ude00", value);
+ }
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]