Re: [PR] PHOENIX-7751 : [SyncTable Tool] Feature to validate table data using PhoenixSyncTable tool b/w source and target cluster [phoenix]

via GitHub Fri, 20 Mar 2026 17:22:14 -0700


tkhurana commented on code in PR #2379:
URL: https://github.com/apache/phoenix/pull/2379#discussion_r2968495244



##########
phoenix-core-server/src/main/java/org/apache/phoenix/mapreduce/PhoenixSyncTableMapper.java:
##########
@@ -0,0 +1,723 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.mapreduce;
+
+import static org.apache.phoenix.schema.types.PDataType.TRUE_BYTES;
+
+import java.io.IOException;
+import java.security.MessageDigest;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.CellUtil;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
+import org.apache.phoenix.coprocessorclient.BaseScannerRegionObserverConstants;
+import org.apache.phoenix.jdbc.PhoenixConnection;
+import org.apache.phoenix.mapreduce.util.ConnectionUtil;
+import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil;
+import org.apache.phoenix.mapreduce.util.PhoenixMapReduceUtil;
+import org.apache.phoenix.query.KeyRange;
+import org.apache.phoenix.query.QueryServicesOptions;
+import org.apache.phoenix.schema.PTable;
+import org.apache.phoenix.util.MetaDataUtil;
+import org.apache.phoenix.util.PhoenixRuntime;
+import org.apache.phoenix.util.SHA256DigestUtil;
+import org.apache.phoenix.util.ScanUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import 
org.apache.phoenix.thirdparty.com.google.common.annotations.VisibleForTesting;
+
+/**
+ * Mapper that acts as a driver for validating table data between source and 
target clusters. The
+ * actual work of chunking and hashing is done server-side by the coprocessor. 
This mapper fetches
+ * chunk hashes from both clusters, compares them and write to checkpoint 
table.
+ */
+public class PhoenixSyncTableMapper
+  extends Mapper<NullWritable, DBInputFormat.NullDBWritable, NullWritable, 
NullWritable> {
+
+  private static final Logger LOGGER = 
LoggerFactory.getLogger(PhoenixSyncTableMapper.class);
+
+  public enum SyncCounters {
+    CHUNKS_VERIFIED,
+    CHUNKS_MISMATCHED,
+    SOURCE_ROWS_PROCESSED,
+    TARGET_ROWS_PROCESSED
+  }
+
+  private String tableName;
+  private String targetZkQuorum;
+  private Long fromTime;
+  private Long toTime;
+  private boolean isDryRun;
+  private long chunkSizeBytes;
+  private Configuration conf;
+  private Connection sourceConnection;
+  private Connection targetConnection;
+  private Connection globalConnection;
+  private PTable pTable;
+  private byte[] physicalTableName;
+  private byte[] mapperRegionStart;
+  private byte[] mapperRegionEnd;
+  private PhoenixSyncTableOutputRepository syncTableOutputRepository;
+  private Timestamp mapperStartTime;
+
+  @Override
+  protected void setup(Context context) throws InterruptedException {
+    try {
+      super.setup(context);
+      mapperStartTime = new Timestamp(System.currentTimeMillis());
+      this.conf = context.getConfiguration();
+      tableName = PhoenixConfigurationUtil.getPhoenixSyncTableName(conf);
+      targetZkQuorum = 
PhoenixConfigurationUtil.getPhoenixSyncTableTargetZkQuorum(conf);
+      fromTime = PhoenixConfigurationUtil.getPhoenixSyncTableFromTime(conf);
+      toTime = PhoenixConfigurationUtil.getPhoenixSyncTableToTime(conf);
+      isDryRun = PhoenixConfigurationUtil.getPhoenixSyncTableDryRun(conf);
+      chunkSizeBytes = 
PhoenixConfigurationUtil.getPhoenixSyncTableChunkSizeBytes(conf);
+      extractRegionBoundariesFromSplit(context);
+      sourceConnection = ConnectionUtil.getInputConnection(conf);
+      pTable = 
sourceConnection.unwrap(PhoenixConnection.class).getTable(tableName);
+      physicalTableName = pTable.getPhysicalName().getBytes();
+      connectToTargetCluster();
+      globalConnection = createGlobalConnection(conf);
+      syncTableOutputRepository = new 
PhoenixSyncTableOutputRepository(globalConnection);
+    } catch (SQLException | IOException e) {
+      tryClosingResources();
+      throw new RuntimeException(
+        String.format("Failed to setup PhoenixSyncTableMapper for table: %s", 
tableName), e);
+    }
+  }
+
+  /**
+   * Extracts mapper region boundaries from the PhoenixInputSplit
+   */
+  private void extractRegionBoundariesFromSplit(Context context) {
+    PhoenixInputSplit split = (PhoenixInputSplit) context.getInputSplit();
+    KeyRange keyRange = split.getKeyRange();
+    if (keyRange == null) {
+      throw new IllegalStateException(String.format(
+        "PhoenixInputSplit has no KeyRange for table: %s . Cannot determine 
region boundaries for sync operation.",
+        tableName));
+    }
+    mapperRegionStart = keyRange.getLowerRange();
+    mapperRegionEnd = keyRange.getUpperRange();
+  }
+
+  /**
+   * Connects to the target cluster using the target ZK quorum, port, znode, 
krb principal
+   */
+  private void connectToTargetCluster() throws SQLException, IOException {
+    Configuration targetConf =
+      PhoenixMapReduceUtil.createConfigurationForZkQuorum(conf, 
targetZkQuorum);
+    targetConnection = ConnectionUtil.getInputConnection(targetConf);
+  }
+
+  /**
+   * Creates a global (non-tenant) connection for the checkpoint table.
+   */
+  private Connection createGlobalConnection(Configuration conf) throws 
SQLException {
+    Configuration globalConf = new Configuration(conf);
+    globalConf.unset(PhoenixConfigurationUtil.MAPREDUCE_TENANT_ID);
+    globalConf.unset(PhoenixRuntime.CURRENT_SCN_ATTRIB);
+    return ConnectionUtil.getInputConnection(globalConf);
+  }
+
+  /**
+   * Processes a mapper region by comparing chunks between source and target 
clusters. Gets already
+   * processed chunks from checkpoint table, resumes from check pointed 
progress and records final
+   * status for chunks & mapper (VERIFIED/MISMATCHED).
+   */
+  @Override
+  protected void map(NullWritable key, DBInputFormat.NullDBWritable value, 
Context context)
+    throws IOException, InterruptedException {
+    context.getCounter(PhoenixJobCounters.INPUT_RECORDS).increment(1);

Review Comment:
   What is the meaning of INPUT_RECORDS in the context of sync tool ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] PHOENIX-7751 : [SyncTable Tool] Feature to validate table data using PhoenixSyncTable tool b/w source and target cluster [phoenix]

Reply via email to