This is an automated email from the ASF dual-hosted git repository.
tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 441a2df742 Add tests to clarify the limits of base64 encoded string
detector. (#15497)
441a2df742 is described below
commit 441a2df74292c90dc099301c9d06864be95f2f21
Author: Ting Chen <[email protected]>
AuthorDate: Fri Apr 11 16:58:46 2025 -0700
Add tests to clarify the limits of base64 encoded string detector. (#15497)
---
.../local/recordtransformer/SchemaConformingTransformer.java | 7 ++++---
.../local/recordtransformer/SchemaConformingTransformerTest.java | 7 +++++++
2 files changed, 11 insertions(+), 3 deletions(-)
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java
index a049d37e67..554409821a 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java
@@ -94,8 +94,8 @@ import org.slf4j.LoggerFactory;
* }
* Apart from the basic transformation above, this transformer today also does
the following additional tasks (which in
* future can be decoupled from this transformer):
- * 1. Put all field + value pair in a special column "_mergedTextIndex" to
facilitate text indexing and search. This
- * extra step can be enabled via mergedTextIndexFieldSpec.
+ * 1. Put all field + value pair in a special column "_mergedTextIndex" to
facilitate full text indexing and search.
+ * This extra step can be enabled via mergedTextIndexFieldSpec.
* 2. Allow users to tag certain fields in the input record not to be
included in the catch-all field.
* </pre>
* <p>
@@ -338,7 +338,8 @@ public class SchemaConformingTransformer implements
RecordTransformer {
putExtrasField(_transformerConfig.getUnindexableExtrasField(),
_unindexableExtrasFieldType,
extraFieldsContainer.getUnindexableExtras(), outputRecord);
- // Generate merged text index
+ // Generate merged text index. This optional step puts all field + value
pairs in the input record in a special
+ // column "_mergedTextIndex" to perform full text indexing and search.
if (null != _mergedTextIndexFieldSpec && !mergedTextIndexMap.isEmpty()) {
List<String> luceneDocuments =
getLuceneDocumentsFromMergedTextIndexMap(mergedTextIndexMap);
if (_mergedTextIndexFieldSpec.isSingleValueField()) {
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java
index 32985f9832..6ed751f0d4 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java
@@ -1037,13 +1037,20 @@ public class SchemaConformingTransformerTest {
String binaryDataWithTrailingPeriods = "ABCxyz12345-_+/=..";
String binaryDataWithRandomPeriods = "A.BCxy.z12345-_+/=..";
String shortBinaryData = "short";
+ String longBinaryDataWithColon =
"field:1:1:v1Cgy+ypzk8yf9JzsdkBjvZ1jM8Mem/BTtNilst64Df/34xmJzeRstmihpfrWZ";
+ String jsonBinaryData =
"{\"field\":\"text:1:1:v1Cgy+ypzk8yf9JzsdkBjvZ1jM8Mem/BTtNilst64Df/34xmJzeRstmihpfrWZ\"}";
int minLength = 10;
+ // A space is not expected in a base64 encoded string.
assertFalse(SchemaConformingTransformer.base64ValueFilter(text.getBytes(),
minLength));
assertTrue(SchemaConformingTransformer.base64ValueFilter(binaryData.getBytes(),
minLength));
assertTrue(SchemaConformingTransformer.base64ValueFilter(binaryDataWithTrailingPeriods.getBytes(),
minLength));
assertFalse(SchemaConformingTransformer.base64ValueFilter(binaryDataWithRandomPeriods.getBytes(),
minLength));
assertFalse(SchemaConformingTransformer.base64ValueFilter(shortBinaryData.getBytes(),
minLength));
+ // A colon (:) is not expected in a base64 encoded string.
+
assertFalse(SchemaConformingTransformer.base64ValueFilter(longBinaryDataWithColon.getBytes(),
minLength));
+ // A JSON string is not detected as a base64 encoded string even if one of its
fields contains base64 encoded data.
+
assertFalse(SchemaConformingTransformer.base64ValueFilter(jsonBinaryData.getBytes(),
minLength));
}
@Test
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]