dev,

1. The script is run on a YARN cluster (spark-shell with --master yarn; see item 5).
2. There is no special configuration in carbon.properties; the default configuration is used.
3. Filtering the data location, I found a deletedelta file whose size is 0 (a FileSystem-based check is sketched after the launch command below):

   hdfs dfs -du -h /user/ip_crm/public/c_compact1/*/*/*/*.deletedelta | grep "0 /"
   0 /user/ip_crm/public/c_compact1/Fact/Part0/Segment_1/part-0-0_batchno0-0-1519639964744.deletedelta

4. After deleting this deletedelta file, the table can be selected again, but update operations fail with "Multiple input rows matched for same row."
5. The spark-shell launch command is the following:

/usr/lib/spark-2.1.1-bin-hadoop2.7/bin/spark-shell \
  --driver-memory 3g \
  --executor-memory 3g \
  --executor-cores 1 \
  --jars carbondata_2.11-1.3.0-shade-hadoop2.7.2.jar \
  --driver-class-path /home/ip_crm/testdata/ojdbc14.jar \
  --queue ip_crm \
  --master yarn \
  --deploy-mode client \
  --keytab /etc/security/keytabs/ip_crm.keytab \
  --principal ip_crm \
  --files /usr/hdp/2.4.0.0-169/hadoop/conf/hdfs-site.xml \
  --conf "spark.driver.extraJavaOptions=-server -XX:+AggressiveOpts -XX:MaxMetaspaceSize=256m -XX:CompressedClassSpaceSize=512m -XX:+AlwaysPreTouch -XX:+UseG1GC -XX:+ScavengeBeforeFullGC -Djava.net.preferIPv4Stack=true -Xss16m -Dhdp.version=2.4.0.0-169 -Dcarbon.properties.filepath=/home/ip_crm/testdata/carbon.conf" \
  --conf "spark.executor.extraJavaOptions=-server -XX:+AggressiveOpts -XX:MaxMetaspaceSize=256m -XX:CompressedClassSpaceSize=512m -XX:+AlwaysPreTouch -XX:+UseG1GC -XX:+ScavengeBeforeFullGC -Djava.net.preferIPv4Stack=true -Xss16m -Dhdp.version=2.4.0.0-169 -Dcarbon.properties.filepath=/home/ip_crm/testdata/carbon.conf" \
  --conf "spark.dynamicAllocation.enabled=true" \
  --conf "spark.network.timeout=300" \
  --conf "spark.sql.shuffle.partitions=200" \
  --conf "spark.default.parallelism=200" \
  --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
  --conf "spark.kryo.referenceTracking=false" \
  --conf "spark.kryoserializer.buffer.max=1g" \
  --conf "spark.debug.maxToStringFields=1000" \
  --conf "spark.dynamicAllocation.executorIdleTimeout=30" \
  --conf "spark.dynamicAllocation.maxExecutors=30" \
  --conf "spark.dynamicAllocation.minExecutors=1" \
  --conf "spark.dynamicAllocation.sustainedSchedulerBacklogTimeout=1s" \
  --conf "spark.yarn.executor.memoryOverhead=2048" \
  --conf "spark.yarn.driver.memoryOverhead=1024" \
  --conf "spark.speculation=true" \
  --conf "spark.sql.warehouse.dir=/apps/hive/warehouse" \
  --conf "spark.rpc.askTimeout=300" \
  --conf "spark.locality.wait=0"
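Below is a minimal Scala sketch of the same zero-length check from item 3, done with the Hadoop FileSystem API instead of hdfs dfs -du. It assumes the Hadoop client jars are on the classpath; the object name and the combined table path (store location hdfs://ns1/user/ip_crm plus the directory from item 3) are only illustrative and not CarbonData code.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Illustrative helper: walk the table directory and print every
// zero-length .deletedelta file. Adjust the path for other tables.
object FindEmptyDeleteDelta {
  def main(args: Array[String]): Unit = {
    val tablePath = new Path("hdfs://ns1/user/ip_crm/public/c_compact1")
    val fs = FileSystem.get(tablePath.toUri, new Configuration())
    val files = fs.listFiles(tablePath, true) // recursive listing
    while (files.hasNext) {
      val status = files.next()
      if (status.getPath.getName.endsWith(".deletedelta") && status.getLen == 0) {
        println(s"zero-length delete delta: ${status.getPath}")
      }
    }
  }
}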
yixu2001

From: sounak
Date: 2018-02-26 17:04
To: dev
Subject: Re: Getting [Problem in loading segment blocks] error after doing multi update operations

Hi,

I tried to reproduce the issue, but it runs fine. Are you running this script on a cluster, and have you set any special configuration in carbon.properties? The script ran almost 200 times and no problem was observed.

On Sun, Feb 25, 2018 at 1:59 PM, 杨义 <yixu2...@163.com> wrote:

> I'm using carbondata 1.3 + spark 2.1.1 + hadoop 2.7.1 to do multiple update
> operations. Here are the steps to reproduce:
>
> import org.apache.spark.sql.SparkSession
> import org.apache.spark.sql.CarbonSession._
> val cc = SparkSession.builder().config(sc.getConf)
>   .getOrCreateCarbonSession("hdfs://ns1/user/ip_crm")
>
> // create table
> cc.sql("CREATE TABLE IF NOT EXISTS public.c_compact3 (id string,qqnum string,nick string,age string,gender string,auth string,qunnum string,mvcc string) STORED BY 'carbondata' TBLPROPERTIES ('SORT_COLUMNS'='id')").show;
>
> // prepare data
> import org.apache.spark.sql.types._
> import org.apache.spark.sql.Row
> val schema = StructType(
>   StructField("id", StringType, true) ::
>   StructField("qqnum", StringType, true) ::
>   StructField("nick", StringType, true) ::
>   StructField("age", StringType, true) ::
>   StructField("gender", StringType, true) ::
>   StructField("auth", StringType, true) ::
>   StructField("qunnum", StringType, true) ::
>   StructField("mvcc", IntegerType, true) :: Nil)
> val data = cc.sparkContext.parallelize(1 to 50000000, 4).map { i =>
>   Row.fromSeq(Seq(
>     i.toString,
>     i.toString.concat("aaaaaaaa").concat(i.toString),
>     "2009-05-27",
>     i.toString.concat("c").concat(i.toString),
>     "1",
>     "1",
>     i.toString.concat("dddddd").concat(i.toString),
>     1))
> }
> cc.createDataFrame(data, schema).createOrReplaceTempView("ddd")
> cc.sql("insert into public.c_compact3 select * from ddd").show;
>
> // update the table repeatedly in a while loop
> import scala.util.Random
> var bcnum = 1;
> while (true) {
>   bcnum = 1 + bcnum;
>   println(bcnum);
>   println("111111111");
>   var randomNmber = Random.nextInt(1000)
>   cc.sql(s"DROP TABLE IF EXISTS cache_compact3").show;
>   cc.sql(s"cache table cache_compact3 as select * from public.c_compact3 where pmod(cast(id as int),1000)=$randomNmber").show(100, false);
>   cc.sql("select count(*) from cache_compact3").show;
>   cc.sql("update public.c_compact3 a set (a.id,a.qqnum,a.nick,a.age,a.gender,a.auth,a.qunnum,a.mvcc)=(select b.id,b.qqnum,b.nick,b.age,b.gender,b.auth,b.qunnum,b.mvcc from cache_compact3 b where b.id=a.id)").show;
>   println("222222222");
>   Thread.sleep(30000);
> }
>
> After about 30 loops, a [Problem in loading segment blocks] error happened.
> Then, performing a select count operation on the table gives an exception
> like the following:
>
> scala> cc.sql("select count(*) from public.c_compact3").show;
> 18/02/25 08:49:46 AUDIT CarbonMetaStoreFactory: [hdd340][ip_crm][Thread-1]File based carbon metastore is enabled
> Exchange SinglePartition
> +- *HashAggregate(keys=[], functions=[partial_count(1)], output=[count#33L])
>    +- *BatchedScan CarbonDatasourceHadoopRelation [ Database name :public, Table name :c_compact3, Schema :Some(StructType(StructField(id,StringType,true), StructField(qqnum,StringType,true), StructField(nick,StringType,true), StructField(age,StringType,true), StructField(gender,StringType,true), StructField(auth,StringType,true), StructField(qunnum,StringType,true), StructField(mvcc,StringType,true))) ] public.c_compact3[]
>
>   at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
>   at org.apache.spark.sql.execution.exchange.ShuffleExchange.doExecute(ShuffleExchange.scala:112)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
>   at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
>   at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:235)
>   at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
>   at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:368)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
>   at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
>   at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:225)
>   at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:308)
>   at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
>   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2386)
>   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
>   at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2788)
>   at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2385)
>   at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2392)
>   at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2128)
>   at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2127)
>   at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2818)
>   at org.apache.spark.sql.Dataset.head(Dataset.scala:2127)
>   at org.apache.spark.sql.Dataset.take(Dataset.scala:2342)
>   at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
>   at org.apache.spark.sql.Dataset.show(Dataset.scala:638)
>   at org.apache.spark.sql.Dataset.show(Dataset.scala:597)
>   at org.apache.spark.sql.Dataset.show(Dataset.scala:606)
>   ... 50 elided
> Caused by: java.io.IOException: Problem in loading segment blocks.
>   at org.apache.carbondata.core.indexstore.BlockletDataMapIndexStore.getAll(BlockletDataMapIndexStore.java:153)
>   at org.apache.carbondata.core.indexstore.blockletindex.BlockletDataMapFactory.getDataMaps(BlockletDataMapFactory.java:76)
>   at org.apache.carbondata.core.datamap.TableDataMap.prune(TableDataMap.java:72)
>   at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getDataBlocksOfSegment(CarbonTableInputFormat.java:739)
>   at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getSplits(CarbonTableInputFormat.java:666)
>   at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getSplits(CarbonTableInputFormat.java:426)
>   at org.apache.carbondata.spark.rdd.CarbonScanRDD.getPartitions(CarbonScanRDD.scala:96)
>   at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
>   at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
>   at scala.Option.getOrElse(Option.scala:121)
>   at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
>   at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
>   at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
>   at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
>   at scala.Option.getOrElse(Option.scala:121)
>   at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
>   at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
>   at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
>   at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
>   at scala.Option.getOrElse(Option.scala:121)
>   at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
>   at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
>   at org.apache.spark.sql.execution.exchange.ShuffleExchange$.prepareShuffleDependency(ShuffleExchange.scala:261)
>   at org.apache.spark.sql.execution.exchange.ShuffleExchange.prepareShuffleDependency(ShuffleExchange.scala:84)
>   at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:121)
>   at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:112)
>   at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
>   ... 83 more
> Caused by: java.lang.ArrayIndexOutOfBoundsException: 0
>   at org.apache.carbondata.core.datastore.filesystem.AbstractDFSCarbonFile.getLocations(AbstractDFSCarbonFile.java:514)
>   at org.apache.carbondata.core.indexstore.BlockletDataMapIndexStore.getAll(BlockletDataMapIndexStore.java:142)
>   ... 109 more

--
Thanks
Sounak
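A note on the final "Caused by" above: for a zero-length file, HDFS reports an empty block-location array, which is consistent with the ArrayIndexOutOfBoundsException: 0 in AbstractDFSCarbonFile.getLocations and with the 0-byte deletedelta file mentioned earlier in the thread. The sketch below only illustrates that HDFS behaviour with the plain FileSystem API (the object name is made up; this is not CarbonData code), and any empty HDFS file behaves the same way.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ZeroLengthBlockLocations {
  def main(args: Array[String]): Unit = {
    // The 0-byte delete delta reported above; any other empty HDFS file works too.
    val path = new Path(
      "hdfs://ns1/user/ip_crm/public/c_compact1/Fact/Part0/Segment_1/" +
        "part-0-0_batchno0-0-1519639964744.deletedelta")
    val fs = FileSystem.get(path.toUri, new Configuration())
    val status = fs.getFileStatus(path)
    // A zero-length file has no HDFS blocks, so this array is empty.
    val locations = fs.getFileBlockLocations(status, 0, status.getLen)
    println(s"block locations: ${locations.length}") // prints 0
    // Indexing the empty array throws java.lang.ArrayIndexOutOfBoundsException: 0,
    // the same exception that appears at the bottom of the trace.
    println(locations(0).getHosts.mkString(","))
  }
}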