mengnankkkk commented on code in PR #3810:
URL: https://github.com/apache/hertzbeat/pull/3810#discussion_r2418434112
##########
hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/prometheus/parser/OnlineParser.java:
##########
@@ -362,37 +365,123 @@ private static CharChecker parseLabelValue(InputStream
inputStream, StringBuilde
return new CharChecker(i);
}
+ /**
+ * Handles multi-byte UTF-8 character parsing from input stream.
+ * Reads additional bytes based on the first byte and validates the UTF-8
sequence.
+ * Appends the decoded character to the string builder or replacement
character if invalid.
+ *
+ * @param firstByte the first byte of the UTF-8 character sequence
+ * @param inputStream the input stream to read additional bytes from
+ * @param stringBuilder the string builder to append the decoded character
to
+ * @throws IOException if an I/O error occurs while reading from the input
stream
+ */
private static void handleUtf8Character(int firstByte, InputStream
inputStream, StringBuilder stringBuilder) throws IOException {
- List<Integer> bytes = new ArrayList<>();
- bytes.add(firstByte);
-
- int additionalBytes = getUtf8AdditionalByteCount(firstByte);
-
- for (int j = 0; j < additionalBytes; j++) {
- int nextByte = inputStream.read();
- if (nextByte == -1) break;
- bytes.add(nextByte);
+ byte[] byteArray = new byte[4];
+ byteArray[0] = (byte) firstByte;
+ int additionalBytes = calculateUtf8ContinuationBytes(firstByte);
+ if (additionalBytes == -1) {
+ appendInvalidCharacters(stringBuilder);
+ return;
}
+ int totalBytes = 1;
- byte[] byteArray = new byte[bytes.size()];
- for (int j = 0; j < bytes.size(); j++) {
- byteArray[j] = (byte) bytes.get(j).intValue();
+ for (int i = 0; i < additionalBytes; i++) {
+ int nextByte = inputStream.read();
+ if (nextByte == -1) {
+ appendInvalidCharacters(stringBuilder);
+ return;
+ }
+ // Verify subsequent byte format:10xxxxxx
+ if ((nextByte & 0xC0) != 0x80) {
+ appendInvalidCharacters(stringBuilder);
+ return;
Review Comment:
The validation here seems to be missing two checks: rejecting surrogate code points (U+D800–U+DFFF) and rejecting code points above U+10FFFF ❤
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]