Unsubscribe
> On Dec 5, 2023, at 19:33, 李甜彪 <ltb1...@163.com> wrote:
>
> The cube build fails with an error. The data is fine in Hive, and a build over empty data succeeds, so I suspected a data problem; but after hand-writing a few rows myself the build failed with the same error, which shows the original data is not at fault.
> The error message shown on the page is as follows:
>
> java.io.IOException: OS command error exit with return code: 1, error message: che.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
> at org.apache.spark.scheduler.Task.run(Task.scala:131)
> at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:498)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:501)
> ... 3 more
>
> }
> RetryInfo{
> overrideConf : {},
> throwable : java.lang.RuntimeException: Error execute org.apache.kylin.engine.spark.job.CubeBuildJob
> at org.apache.kylin.engine.spark.application.SparkApplication.execute(SparkApplication.java:96)
> at org.apache.spark.application.JobWorker$$anon$2.run(JobWorker.scala:55)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 74.0 failed 4 times, most recent failure: Lost task 0.3 in stage 74.0 (TID 186) (store2 executor 20): java.lang.NoClassDefFoundError: Could not initialize class org.apache.hadoop.hive.conf.HiveConf$ConfVars
> at org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters.<init>(LazySerDeParameters.java:103)
> at org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.initialize(LazySimpleSerDe.java:125)
> at org.apache.spark.sql.hive.HadoopTableReader.$anonfun$makeRDDForTable$3(TableReader.scala:136)
> at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
> at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
> at org.apache.spark.scheduler.Task.run(Task.scala:131)
> at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:498)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:501)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
>
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2303)
> at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2252)
> at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2251)
> at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
> at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2251)
> at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1124)
> at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1124)
> at scala.Option.foreach(Option.scala:407)
> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1124)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2490)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2432)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2421)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:902)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
> at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
> at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
> at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:390)
> at org.apache.spark.sql.Dataset.$anonfun$count$1(Dataset.scala:3019)
> at org.apache.spark.sql.Dataset.$anonfun$count$1$adapted(Dataset.scala:3018)
> at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3700)
> at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
> at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
> at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
> at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
> at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
> at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3698)
> at org.apache.spark.sql.Dataset.count(Dataset.scala:3018)
> at org.apache.kylin.engine.spark.builder.CubeSnapshotBuilder.$anonfun$checkDupKey$1(CubeSnapshotBuilder.scala:196)
> at org.apache.kylin.engine.spark.builder.CubeSnapshotBuilder.$anonfun$checkDupKey$1$adapted(CubeSnapshotBuilder.scala:190)
> at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
> at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
> at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
> at org.apache.kylin.engine.spark.builder.CubeSnapshotBuilder.checkDupKey(CubeSnapshotBuilder.scala:190)
> at org.apache.kylin.engine.spark.job.ParentSourceChooser.decideFlatTableSource(ParentSourceChooser.scala:88)
> at org.apache.kylin.engine.spark.job.ParentSourceChooser.$anonfun$decideSources$1(ParentSourceChooser.scala:76)
> at org.apache.kylin.engine.spark.job.ParentSourceChooser.$anonfun$decideSources$1$adapted(ParentSourceChooser.scala:71)
> at scala.collection.Iterator.foreach(Iterator.scala:941)
> at scala.collection.Iterator.foreach$(Iterator.scala:941)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
> at scala.collection.IterableLike.foreach(IterableLike.scala:74)
> at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
> at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
> at org.apache.kylin.engine.spark.job.ParentSourceChooser.decideSources(ParentSourceChooser.scala:71)
> at org.apache.kylin.engine.spark.job.CubeBuildJob.doExecute(CubeBuildJob.java:181)
> at org.apache.kylin.engine.spark.application.SparkApplication.execute(SparkApplication.java:307)
> at org.apache.kylin.engine.spark.application.SparkApplication.execute(SparkApplication.java:93)
> ... 4 more
> Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.apache.hadoop.hive.conf.HiveConf$ConfVars
> at org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters.<init>(LazySerDeParameters.java:103)
> at org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.initialize(LazySimpleSerDe.java:125)
> at org.apache.spark.sql.hive.HadoopTableReader.$anonfun$makeRDDForTable$3(TableReader.scala:136)
> at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
> at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
> at org.apache.spark.scheduler.Task.run(Task.scala:131)
> at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:498)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:501)
> ... 3 more
>
> }
> ==========================[BUILD CUBE]===============================
>
> 2023-12-05 19:21:28,789 INFO [shutdown-hook-0] server.AbstractConnector : Stopped Spark@447f1c33{HTTP/1.1, (http/1.1)}{0.0.0.0:4040}
> The command is:
> export HADOOP_CONF_DIR=/srv/kylin/kylin-job/hadoop_conf && /srv/kylin/kylin-job/spark/bin/spark-submit
> --class org.apache.kylin.engine.spark.application.SparkEntry
> --conf 'spark.sql.hive.metastore.version=2.1.1'
> --conf 'spark.executor.instances=40'
> --conf 'spark.yarn.queue=default'
> --conf 'spark.history.fs.logDirectory=hdfs:///kylin4_3/spark-history'
> --conf 'spark.driver.extraJavaOptions=-XX:+CrashOnOutOfMemoryError -Dlog4j.configuration=file:/srv/kylin/kylin-job/conf/spark-driver-log4j-default.properties -Dkylin.kerberos.enabled=false -Dkylin.hdfs.working.dir=hdfs://xxxx:8020/kylin4_3/kylin_metadata/ -Dspark.driver.log4j.appender.hdfs.File=hdfs://xxxx:8020/kylin4_3/kylin_metadata/bjdw/spark_logs/driver/c3179a16-b198-46b1-8242-38e77024c91a-01/execute_output.json.1701775097302.log -Dlog4j.debug=true -Dspark.driver.rest.server.address=query1:7071 -Dspark.driver.param.taskId=c3179a16-b198-46b1-8242-38e77024c91a-01 -Dspark.driver.local.logDir=/srv/kylin/kylin-job/logs/spark'
> --conf 'spark.master=yarn'
> --conf 'spark.executor.extraJavaOptions=-Dfile.encoding=UTF-8 -Dhdp.version=current -Dlog4j.configuration=spark-executor-log4j-default.properties -Dlog4j.debug -Dkylin.hdfs.working.dir=hdfs://xxxx:8020/kylin4_3/kylin_metadata/ -Dkylin.metadata.identifier=kylin_metadata -Dkylin.spark.category=job -Dkylin.spark.project=bjdw -Dkylin.spark.identifier=c3179a16-b198-46b1-8242-38e77024c91a -Dkylin.spark.jobName=c3179a16-b198-46b1-8242-38e77024c91a-01 -Duser.timezone=GMT+8'
> --conf 'spark.hadoop.yarn.timeline-service.enabled=false'
> --conf 'spark.driver.cores=2'
> --conf 'spark.executor.memory=6G'
> --conf 'spark.eventLog.enabled=true'
> --conf 'spark.eventLog.dir=hdfs:///kylin4_3/spark-history'
> --conf 'spark.executor.cores=2'
> --conf 'spark.sql.hive.metastore.jars=/opt/cloudera/parcels/CDH/lib/hive/lib/*:/opt/cloudera/parcels/CDH/lib/hadoop/*:/opt/cloudera/parcels/CDH/lib/hadoop/lib/*:/opt/cloudera/parcels/CDH/lib/hadoop-hdfs/*:/opt/cloudera/parcels/CDH/lib/hadoop-yarn/*:/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/*'
> --conf 'spark.executor.memoryOverhead=1024M'
> --conf 'spark.driver.memory=4G'
> --conf 'spark.driver.memoryOverhead=512M'
> --conf 'spark.submit.deployMode=client'
> --conf 'spark.executor.extraClassPath=kylin-parquet-job-4.0.3.jar'
> --conf 'spark.driver.extraClassPath=/srv/kylin/kylin-job/lib/kylin-parquet-job-4.0.3.jar'
> --files /srv/kylin/kylin-job/conf/spark-executor-log4j-default.properties
> --name job_step_c3179a16-b198-46b1-8242-38e77024c91a-01
> --jars /srv/kylin/kylin-job/lib/kylin-parquet-job-4.0.3.jar
> /srv/kylin/kylin-job/lib/kylin-parquet-job-4.0.3.jar
> -className org.apache.kylin.engine.spark.job.CubeBuildJob hdfs://xxxx:8020/kylin4_3/kylin_metadata/bjdw/job_tmp/c3179a16-b198-46b1-8242-38e77024c91a-01_jobId
> at org.apache.kylin.common.util.CliCommandExecutor.execute(CliCommandExecutor.java:99)
> at org.apache.kylin.engine.spark.job.NSparkExecutable.runSparkSubmit(NSparkExecutable.java:295)
> at org.apache.kylin.engine.spark.job.NSparkExecutable.doWork(NSparkExecutable.java:181)
> at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:206)
> at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:94)
> at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:206)
> at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:113)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
>
> 李甜彪
> ltb1...@163.com
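
The failure that actually aborts the build is on the executors: java.lang.NoClassDefFoundError: Could not initialize class org.apache.hadoop.hive.conf.HiveConf$ConfVars, thrown from LazySimpleSerDe while Spark scans the Hive table for the snapshot duplicate-key check. "Could not initialize class" means the static initializer of ConfVars had already failed once, usually with an ExceptionInInitializerError caused by conflicting jar versions on the executor classpath (on CDH a Guava mismatch is a common culprit), after which every later reference fails the same way regardless of the data. That matches the observation that hand-written rows reproduce the error. A minimal probe sketch for surfacing the original initializer failure, assuming it is run with the same classpath the executors see; ClasspathProbe is a hypothetical name, not part of Kylin or Spark:

    // Hypothetical diagnostic: print where HiveConf and Guava are loaded from,
    // and surface the exception hidden behind "Could not initialize class".
    public class ClasspathProbe {
        public static void main(String[] args) {
            String[] names = {
                "org.apache.hadoop.hive.conf.HiveConf",
                "org.apache.hadoop.hive.conf.HiveConf$ConfVars",
                "com.google.common.base.Preconditions"  // Guava marker class
            };
            for (String name : names) {
                try {
                    Class<?> c = Class.forName(name);  // triggers static init
                    System.out.println(name + " -> "
                            + c.getProtectionDomain().getCodeSource());
                } catch (Throwable t) {
                    // The first failed load is an ExceptionInInitializerError
                    // whose cause is the real problem; later loads degrade to
                    // the NoClassDefFoundError seen in the build log.
                    System.out.println(name + " FAILED:");
                    t.printStackTrace(System.out);
                }
            }
        }
    }

Running it once against the jars the executors actually get (for example: java -cp '/opt/cloudera/parcels/CDH/lib/hive/lib/*:/opt/cloudera/parcels/CDH/lib/hadoop/*:.' ClasspathProbe) should either print one consistent code source per class or show the initializer error directly; the printed cause, not the NoClassDefFoundError, is what needs fixing.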
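One detail in the quoted spark-submit command is worth checking against that theory: spark.sql.hive.metastore.jars lists the full CDH parcel layout, but that setting only governs the Hive metastore client on the driver, while spark.executor.extraClassPath carries nothing but kylin-parquet-job-4.0.3.jar. The failing frames (LazySerDeParameters and LazySimpleSerDe inside HadoopTableReader) run on the executors, which therefore resolve the Hive serde classes from whatever the Spark distribution and the YARN node classpath happen to supply; if those sources mix Hive or Guava versions, ConfVars fails exactly like this. In Kylin 4, the build engine's Spark properties can be overridden in kylin.properties via the kylin.engine.spark-conf. prefix, so one possible experiment (a sketch only; the path is this cluster's CDH layout, and whether it resolves the conflict depends on what is actually installed on the nodes) is to hand the executors the same Hive jars the driver uses:

    kylin.engine.spark-conf.spark.executor.extraClassPath=kylin-parquet-job-4.0.3.jar:/opt/cloudera/parcels/CDH/lib/hive/lib/*

Classpath wildcard entries (dir/*) are expanded by the JVM launcher, so the entry behaves like the metastore.jars list above.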