This is an automated email from the ASF dual-hosted git repository.
xbli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 1e0f87053c add logs to debug why crc values are different upon same
input data and found Text Index data is not deterministic (#14188)
1e0f87053c is described below
commit 1e0f87053c7bee2316223ec4152c80341c291fb1
Author: Xiaobing <[email protected]>
AuthorDate: Tue Oct 8 14:35:41 2024 -0700
add logs to debug why crc values are different upon same input data and
found Text Index data is not deterministic (#14188)
---
.../org/apache/pinot/core/util/CrcUtilsTest.java | 101 +++++++++++++++++++--
.../apache/pinot/segment/local/utils/CrcUtils.java | 3 +
2 files changed, 98 insertions(+), 6 deletions(-)
diff --git a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
index 120f84e07e..494d4f518f 100644
--- a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
@@ -19,16 +19,24 @@
package org.apache.pinot.core.util;
import java.io.File;
+import java.io.IOException;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.FileUtils;
import org.apache.pinot.segment.local.segment.creator.SegmentTestUtils;
import
org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl;
import
org.apache.pinot.segment.local.segment.index.converter.SegmentV1V2ToV3FormatConverter;
+import
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.local.utils.CrcUtils;
import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig;
import org.apache.pinot.segment.spi.creator.SegmentIndexCreationDriver;
+import org.apache.pinot.segment.spi.index.FieldIndexConfigs;
+import org.apache.pinot.segment.spi.index.FstIndexConfig;
+import org.apache.pinot.segment.spi.index.StandardIndexes;
+import org.apache.pinot.segment.spi.index.TextIndexConfig;
+import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.util.TestUtils;
+import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import static org.testng.Assert.assertEquals;
@@ -38,30 +46,111 @@ import static org.testng.Assert.assertNotNull;
public class CrcUtilsTest {
private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(),
"CrcUtilsTest");
private static final String AVRO_DATA = "data/test_data-mv.avro";
- private static final long EXPECTED_V1_CRC = 2708456273L;
- private static final long EXPECTED_V3_CRC = 2796149869L;
+
+ @BeforeMethod
+ public void setup()
+ throws IOException {
+ FileUtils.deleteDirectory(INDEX_DIR);
+ }
+
+ @BeforeMethod
+ public void tearDown()
+ throws IOException {
+ FileUtils.deleteDirectory(INDEX_DIR);
+ }
@Test
public void testCrc()
throws Exception {
- FileUtils.deleteDirectory(INDEX_DIR);
+ URL resource = getClass().getClassLoader().getResource(AVRO_DATA);
+ assertNotNull(resource);
+ String filePath = TestUtils.getFileFromResourceUrl(resource);
+ SegmentGeneratorConfig config =
+ SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new
File(filePath), INDEX_DIR, "daysSinceEpoch",
+ TimeUnit.DAYS, "testTable");
+ SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
+ driver.init(config);
+ driver.build();
+
+ File indexDir = driver.getOutputDirectory();
+ assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(),
2708456273L);
+ new SegmentV1V2ToV3FormatConverter().convert(indexDir);
+ assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(),
2796149869L);
+ }
+
+ @Test
+ public void testCrcWithNativeFstIndex()
+ throws Exception {
URL resource = getClass().getClassLoader().getResource(AVRO_DATA);
assertNotNull(resource);
String filePath = TestUtils.getFileFromResourceUrl(resource);
SegmentGeneratorConfig config =
SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new
File(filePath), INDEX_DIR, "daysSinceEpoch",
TimeUnit.DAYS, "testTable");
+ FstIndexConfig fstIndexConfig = new FstIndexConfig(FSTType.NATIVE);
+ config.setIndexOn(StandardIndexes.fst(), fstIndexConfig, "column5");
SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
driver.init(config);
driver.build();
File indexDir = driver.getOutputDirectory();
- assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(),
EXPECTED_V1_CRC);
+ assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(),
3358657641L);
new SegmentV1V2ToV3FormatConverter().convert(indexDir);
- assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(),
EXPECTED_V3_CRC);
+ assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(),
961102604L);
+ }
- FileUtils.deleteDirectory(INDEX_DIR);
+ @Test
+ public void testCrcWithLuceneFstIndex()
+ throws Exception {
+ URL resource = getClass().getClassLoader().getResource(AVRO_DATA);
+ assertNotNull(resource);
+ String filePath = TestUtils.getFileFromResourceUrl(resource);
+ SegmentGeneratorConfig config =
+ SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new
File(filePath), INDEX_DIR, "daysSinceEpoch",
+ TimeUnit.DAYS, "testTable");
+ FstIndexConfig fstIndexConfig = new FstIndexConfig(FSTType.LUCENE);
+ config.setIndexOn(StandardIndexes.fst(), fstIndexConfig, "column5");
+ SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
+ driver.init(config);
+ driver.build();
+
+ File indexDir = driver.getOutputDirectory();
+ assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(),
3294819300L);
+
+ new SegmentV1V2ToV3FormatConverter().convert(indexDir);
+ assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(),
2552900261L);
+ }
+
+ // @Test
+ public void testCrcWithLuceneTextIndex()
+ throws Exception {
+ URL resource = getClass().getClassLoader().getResource(AVRO_DATA);
+ assertNotNull(resource);
+ String filePath = TestUtils.getFileFromResourceUrl(resource);
+ SegmentGeneratorConfig config =
+ SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new
File(filePath), INDEX_DIR, "daysSinceEpoch",
+ TimeUnit.DAYS, "testTable");
+ addTextIndex(config, "column5");
+ SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
+ driver.init(config);
+ driver.build();
+
+ // Lucene text index data is not deterministic, so the segment crc differs across test runs.
+ // When a text index is used in a RealTime table, differing crc values can force servers to download segments
+ // from deep store to keep segment replicas in sync.
+ File indexDir = driver.getOutputDirectory();
+ System.out.println(CrcUtils.forAllFilesInFolder(indexDir).computeCrc());
+
+ new SegmentV1V2ToV3FormatConverter().convert(indexDir);
+ System.out.println(CrcUtils.forAllFilesInFolder(indexDir).computeCrc());
+ }
+
+ private void addTextIndex(SegmentGeneratorConfig config, String colName) {
+ FieldIndexConfigs fieldIndexConfigs =
config.getIndexConfigsByColName().get(colName);
+ TextIndexConfig textConfig =
fieldIndexConfigs.getConfig(StandardIndexes.text());
+ TextIndexConfig newTextConfig = new
TextIndexConfigBuilder(textConfig).build();
+ config.setIndexOn(StandardIndexes.text(), newTextConfig, colName);
}
}
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java
index 659d54f9ea..84dfb8283c 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java
@@ -84,6 +84,9 @@ public class CrcUtils {
while ((len = input.read(buffer)) > 0) {
checksum.update(buffer, 0, len);
}
+ if (LOGGER.isDebugEnabled()) {
+ LOGGER.debug("Updated crc = {}, based on file {} of length {}",
checksum.getValue(), file, file.length());
+ }
}
}
long crc = checksum.getValue();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]