[ https://issues.apache.org/jira/browse/IOTDB-4380?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
刘珍 reopened IOTDB-4380: ----------------------- master 20230214 eaee1d8 3副本3C3D IoT协议 1.先启动集群 2.再运行脚本: test_node="192.168.10.74" sh iotdb.sh 4380 > 0215_4380_1.out & //bm运行起来15秒后,kill ip74的datanode sleep 15 ip74_dn_pid=`ssh liuzhen@192.168.10.74 "source /etc/profile;jps|grep -i datanode"` v_pid=`echo ${ip74_dn_pid}|awk '{print $1}'` echo ${v_pid} ssh liuzhen@192.168.10.74 "kill -9 ${v_pid}" ssh liuzhen@192.168.10.74 "sudo sh -c \"sync; echo 3 > /proc/sys/vm/drop_caches\"" wait //bm运行完成,启动ip74的datanode start_db.sh见附件 sh ./start_db.sh ${test_node} 启动失败,ip74 datanode 日志见附件,error日志如下 2023-02-16 09:14:07,778 [pool-23-IoTDB-IoTConsensusRPC-Processor-1] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[3] for TSyncLogEntriesReq which size is 2 2023-02-16 09:14:07,860 [main] ERROR o.a.i.c.s.m.MetricService:54 - MetricService failed to start Metrics ServerService because: java.util.ConcurrentModificationException: null at java.base/java.util.HashMap$HashIterator.nextNode(HashMap.java:1510) at java.base/java.util.HashMap$KeyIterator.next(HashMap.java:1533) at org.apache.iotdb.metrics.AbstractMetricService.startService(AbstractMetricService.java:80) at org.apache.iotdb.commons.service.metric.MetricService.start(MetricService.java:51) at org.apache.iotdb.commons.service.RegisterManager.register(RegisterManager.java:51) at org.apache.iotdb.db.service.DataNode.setUpMetricService(DataNode.java:553) at org.apache.iotdb.db.service.DataNode.doAddNode(DataNode.java:175) at org.apache.iotdb.db.service.DataNodeServerCommandLine.run(DataNodeServerCommandLine.java:81) at org.apache.iotdb.commons.ServerCommandLine.doMain(ServerCommandLine.java:58) at org.apache.iotdb.db.service.DataNode.main(DataNode.java:145) 2023-02-16 09:14:07,863 [main] ERROR o.a.i.db.service.DataNode:181 - Fail to start server org.apache.iotdb.commons.exception.StartupException: Failed to start [Metrics ServerService], because [null] at org.apache.iotdb.commons.service.metric.MetricService.start(MetricService.java:55) at org.apache.iotdb.commons.service.RegisterManager.register(RegisterManager.java:51) at org.apache.iotdb.db.service.DataNode.setUpMetricService(DataNode.java:553) at org.apache.iotdb.db.service.DataNode.doAddNode(DataNode.java:175) at org.apache.iotdb.db.service.DataNodeServerCommandLine.run(DataNodeServerCommandLine.java:81) at org.apache.iotdb.commons.ServerCommandLine.doMain(ServerCommandLine.java:58) at org.apache.iotdb.db.service.DataNode.main(DataNode.java:145) 2023-02-16 09:14:07,866 [main] ERROR o.a.i.c.s.RegisterManager:63 - Failed to stop Metrics ServerService because: java.util.ConcurrentModificationException: null at java.base/java.util.HashMap$HashIterator.nextNode(HashMap.java:1510) at java.base/java.util.HashMap$KeyIterator.next(HashMap.java:1533) at org.apache.iotdb.metrics.AbstractMetricService.stopService(AbstractMetricService.java:87) at org.apache.iotdb.commons.service.metric.MetricService.stop(MetricService.java:78) at org.apache.iotdb.commons.service.IService.waitAndStop(IService.java:36) at org.apache.iotdb.commons.service.RegisterManager.deregisterAll(RegisterManager.java:60) at org.apache.iotdb.db.service.DataNode.deactivate(DataNode.java:857) at org.apache.iotdb.db.service.DataNode.stop(DataNode.java:826) at org.apache.iotdb.db.service.DataNode.doAddNode(DataNode.java:187) at org.apache.iotdb.db.service.DataNodeServerCommandLine.run(DataNodeServerCommandLine.java:81) at org.apache.iotdb.commons.ServerCommandLine.doMain(ServerCommandLine.java:58) at org.apache.iotdb.db.service.DataNode.main(DataNode.java:145) 2023-02-16 09:14:08,956 [pool-23-IoTDB-IoTConsensusRPC-Processor-5] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[7] for TSyncLogEntriesReq which size is 2 2023-02-16 09:14:09,085 [pool-23-IoTDB-IoTConsensusRPC-Processor-6] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[5] for TSyncLogEntriesReq which size is 2 2023-02-16 09:14:09,087 [pool-23-IoTDB-IoTConsensusRPC-Processor-7] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[5] for TSyncLogEntriesReq which size is 3 2023-02-16 09:14:09,199 [pool-23-IoTDB-IoTConsensusRPC-Processor-8] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[8] for TSyncLogEntriesReq which size is 3 2023-02-16 09:14:09,491 [pool-23-IoTDB-IoTConsensusRPC-Processor-9] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[7] for TSyncLogEntriesReq which size is 4 2023-02-16 09:14:09,513 [pool-23-IoTDB-IoTConsensusRPC-Processor-11] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[8] for TSyncLogEntriesReq which size is 5 2023-02-16 09:14:10,173 [pool-23-IoTDB-IoTConsensusRPC-Processor-12] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[5] for TSyncLogEntriesReq which size is 2 2023-02-16 09:14:11,571 [pool-23-IoTDB-IoTConsensusRPC-Processor-13] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[7] for TSyncLogEntriesReq which size is 7 2023-02-16 09:14:11,842 [pool-23-IoTDB-IoTConsensusRPC-Processor-14] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[8] for TSyncLogEntriesReq which size is 3 2023-02-16 09:14:11,965 [pool-23-IoTDB-IoTConsensusRPC-Processor-15] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[6] for TSyncLogEntriesReq which size is 3 2023-02-16 09:14:12,309 [pool-23-IoTDB-IoTConsensusRPC-Processor-17] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[5] for TSyncLogEntriesReq which size is 6 2023-02-16 09:14:12,426 [pool-23-IoTDB-IoTConsensusRPC-Processor-19] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[7] for TSyncLogEntriesReq which size is 6 2023-02-16 09:14:13,154 [pool-23-IoTDB-IoTConsensusRPC-Processor-21] ERROR o.a.i.c.i.s.IoTConsensusRPCServiceProcessor:84 - unexpected consensusGroupId DataRegion[8] for TSyncLogEntriesReq which size is 5 > The log dispatcher is inconsistent with the search index of the wal node > ------------------------------------------------------------------------ > > Key: IOTDB-4380 > URL: https://issues.apache.org/jira/browse/IOTDB-4380 > Project: Apache IoTDB > Issue Type: Bug > Components: mpp-cluster > Affects Versions: 0.14.0-SNAPSHOT > Reporter: 刘珍 > Assignee: Hongyin Zhang > Priority: Major > Fix For: 1.0.0 > > Attachments: image-2022-11-25-20-47-33-174.png, > image-2022-11-25-20-57-30-523.png, ip74_logs.tar.gz, more_metadata.conf, > screenshot-1.png > > > m_0908_7915b3f。 > 问题描述 > {color:#DE350B}*log dispatcher与wal node的search index不一致 , > datanode重启成功后日志一直刷*{color}: > 2022-09-09 16:32:00,011 [pool-33-IoTDB-LogDispatcher-DataRegion[12]-2] INFO > o.a.i.d.w.n.WALNode$PlanNodeIterator:695 - timeout when waiting for next WAL > entry ready, execute rollWALFile. {color:#DE350B}*Current search index in wal > buffer is 2959, and next target index is 2501 *{color} > MultiLeaderConsensus,3副本3节点 > 1. 创建元数据过程中,kill ip74 的datanode PID > benchmark配置文件见附件。 > 2. 清空ip74 的操作系统缓存,启动ip74的datanode > 3. 再次重新运行benchmark同一配置,IS_DELETE_DATA=true > 这个参数为true,会先执行delete storage group root.test.*; > benchmark运行完成,stop ip74的datanode服务 > 备份data 为/data/mpp_test/m_0908_7915b3f/datanode/data_for_recovery_Test > 4. 清ip74操作系统缓存,启动datanode服务 > 再次运行benchmark同一配置,benchmark运行完成, > 查看ip74的日志,看到 > 2022-09-09 15:43:13,691 [pool-23-IoTDB-MPPDataExchangeRPC-Processor-40] ERROR > o.a.t.ProcessFunction:47 - Internal error processing getDataBlock > org.apache.thrift.TException: Source fragment instance not found. Fragment > instance ID: TFragmentInstanceId(queryId:20220909_074205_19400_3, > fragmentId:2, instanceId:0). > at > org.apache.iotdb.db.mpp.execution.exchange.MPPDataExchangeManager$MPPDataExchangeServiceImpl.getDataBlock(MPPDataExchangeManager.java:90) > at > org.apache.iotdb.mpp.rpc.thrift.MPPDataExchangeService$Processor$getDataBlock.getResult(MPPDataExchangeService.java:326) > at > org.apache.iotdb.mpp.rpc.thrift.MPPDataExchangeService$Processor$getDataBlock.getResult(MPPDataExchangeService.java:306) > at org.apache.thrift.ProcessFunction.process(ProcessFunction.java:38) > at org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:38) > at > org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:248) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:834) > 2022-09-09 15:43:15,312 [20220909_074205_19400_3.2.0.SinkHandle-3074] ERROR > o.a.i.d.m.e.e.SinkHandle:281 - The TsBlock doesn't exist. Sequence ID is 1, > remaining map is > [0=<org.apache.iotdb.tsfile.read.common.block.TsBlock@5f617979,1048576>] > 2022-09-09 15:43:17,119 [pool-23-IoTDB-MPPDataExchangeRPC-Processor-22] ERROR > o.a.t.ProcessFunction:47 - Internal error processing getDataBlock > java.lang.IllegalStateException: The data block doesn't exist. Sequence ID: 1 > at > org.apache.iotdb.db.mpp.execution.exchange.SinkHandle.getSerializedTsBlock(SinkHandle.java:285) > at > org.apache.iotdb.db.mpp.execution.exchange.MPPDataExchangeManager$MPPDataExchangeServiceImpl.getDataBlock(MPPDataExchangeManager.java:97) > at > org.apache.iotdb.mpp.rpc.thrift.MPPDataExchangeService$Processor$getDataBlock.getResult(MPPDataExchangeService.java:326) > at > org.apache.iotdb.mpp.rpc.thrift.MPPDataExchangeService$Processor$getDataBlock.getResult(MPPDataExchangeService.java:306) > at org.apache.thrift.ProcessFunction.process(ProcessFunction.java:38) > at org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:38) > at > org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:248) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:834) > 5. 停止ip74的datanode服务 > 备份data 到/data/mpp_test/m_0908_7915b3f/datanode/data_for_recovery_Test_2 > 清ip74操作系统缓存,启动ip74的datanode ,可以成功,日志一直刷(此节点不能继续同步): > 2022-09-09 16:44:00,039 [pool-33-IoTDB-LogDispatcher-DataRegion[12]-2] INFO > o.a.i.d.w.n.WALNode$PlanNodeIterator:695 - timeout when waiting for next WAL > entry ready, execute rollWALFile. Current search index in wal buffer is 2959, > and next target index is 2501 > 机器 与 集群配置 > 1. 192.168.10.72/ 73/74 48核384G > benchmark 在71 > 2. 集群参数 > confignode > MAX_HEAP_SIZE="8G" > schema_region_consensus_protocol_class=org.apache.iotdb.consensus.ratis.RatisConsensus > data_region_consensus_protocol_class=org.apache.iotdb.consensus.multileader.MultiLeaderConsensus > schema_replication_factor=3 > data_replication_factor=3 > datanode > MAX_HEAP_SIZE="256G" > MAX_DIRECT_MEMORY_SIZE="32G" > max_connection_for_internal_service=300 > enable_timed_flush_seq_memtable=true > seq_memtable_flush_interval_in_ms=600000 > seq_memtable_flush_check_interval_in_ms=300000 > enable_timed_flush_unseq_memtable=true > unseq_memtable_flush_interval_in_ms=600000 > unseq_memtable_flush_check_interval_in_ms=300000 > max_waiting_time_when_insert_blocked=3600000 > query_timeout_threshold=3600000 -- This message was sent by Atlassian Jira (v8.20.10#820010)