The cube build fails with an error. The data itself is fine in Hive, and a build over empty data succeeds, so at first I suspected a data problem; but after hand-writing a few rows of test data the build fails with exactly the same error, which shows the original data is not the cause. The error message shown on the web page is as follows (a small diagnostic sketch follows the full log at the end of this message):

java.io.IOException: OS command error exit with return code: 1, error message: che.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:131)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:498)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:501)
    ... 3 more
} RetryInfo{ overrideConf : {}, throwable : java.lang.RuntimeException: Error execute org.apache.kylin.engine.spark.job.CubeBuildJob
    at org.apache.kylin.engine.spark.application.SparkApplication.execute(SparkApplication.java:96)
    at org.apache.spark.application.JobWorker$$anon$2.run(JobWorker.scala:55)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 74.0 failed 4 times, most recent failure: Lost task 0.3 in stage 74.0 (TID 186) (store2 executor 20): java.lang.NoClassDefFoundError: Could not initialize class org.apache.hadoop.hive.conf.HiveConf$ConfVars
    at org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters.<init>(LazySerDeParameters.java:103)
    at org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.initialize(LazySimpleSerDe.java:125)
    at org.apache.spark.sql.hive.HadoopTableReader.$anonfun$makeRDDForTable$3(TableReader.scala:136)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:131)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:498)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:501)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2303)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2252)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2251)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2251)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1124)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1124)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1124)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2490)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2432)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2421)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:902)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
    at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:390)
    at org.apache.spark.sql.Dataset.$anonfun$count$1(Dataset.scala:3019)
    at org.apache.spark.sql.Dataset.$anonfun$count$1$adapted(Dataset.scala:3018)
    at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3700)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3698)
    at org.apache.spark.sql.Dataset.count(Dataset.scala:3018)
    at org.apache.kylin.engine.spark.builder.CubeSnapshotBuilder.$anonfun$checkDupKey$1(CubeSnapshotBuilder.scala:196)
    at org.apache.kylin.engine.spark.builder.CubeSnapshotBuilder.$anonfun$checkDupKey$1$adapted(CubeSnapshotBuilder.scala:190)
    at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
    at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
    at org.apache.kylin.engine.spark.builder.CubeSnapshotBuilder.checkDupKey(CubeSnapshotBuilder.scala:190)
    at org.apache.kylin.engine.spark.job.ParentSourceChooser.decideFlatTableSource(ParentSourceChooser.scala:88)
    at org.apache.kylin.engine.spark.job.ParentSourceChooser.$anonfun$decideSources$1(ParentSourceChooser.scala:76)
    at org.apache.kylin.engine.spark.job.ParentSourceChooser.$anonfun$decideSources$1$adapted(ParentSourceChooser.scala:71)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at scala.collection.IterableLike.foreach(IterableLike.scala:74)
    at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
    at org.apache.kylin.engine.spark.job.ParentSourceChooser.decideSources(ParentSourceChooser.scala:71)
    at org.apache.kylin.engine.spark.job.CubeBuildJob.doExecute(CubeBuildJob.java:181)
    at org.apache.kylin.engine.spark.application.SparkApplication.execute(SparkApplication.java:307)
    at org.apache.kylin.engine.spark.application.SparkApplication.execute(SparkApplication.java:93)
    ... 4 more
Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.apache.hadoop.hive.conf.HiveConf$ConfVars
    at org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters.<init>(LazySerDeParameters.java:103)
    at org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.initialize(LazySimpleSerDe.java:125)
    at org.apache.spark.sql.hive.HadoopTableReader.$anonfun$makeRDDForTable$3(TableReader.scala:136)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:131)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:498)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:501)
    ... 3 more
}
==========================[BUILD CUBE]===============================
2023-12-05 19:21:28,789 INFO [shutdown-hook-0] server.AbstractConnector : Stopped Spark@447f1c33{HTTP/1.1, (http/1.1)}{0.0.0.0:4040}
The command is:
export HADOOP_CONF_DIR=/srv/kylin/kylin-job/hadoop_conf && /srv/kylin/kylin-job/spark/bin/spark-submit --class org.apache.kylin.engine.spark.application.SparkEntry --conf 'spark.sql.hive.metastore.version=2.1.1' --conf 'spark.executor.instances=40' --conf 'spark.yarn.queue=default' --conf 'spark.history.fs.logDirectory=hdfs:///kylin4_3/spark-history' --conf 'spark.driver.extraJavaOptions=-XX:+CrashOnOutOfMemoryError -Dlog4j.configuration=file:/srv/kylin/kylin-job/conf/spark-driver-log4j-default.properties -Dkylin.kerberos.enabled=false -Dkylin.hdfs.working.dir=hdfs://xxxx:8020/kylin4_3/kylin_metadata/ -Dspark.driver.log4j.appender.hdfs.File=hdfs://xxxx:8020/kylin4_3/kylin_metadata/bjdw/spark_logs/driver/c3179a16-b198-46b1-8242-38e77024c91a-01/execute_output.json.1701775097302.log -Dlog4j.debug=true -Dspark.driver.rest.server.address=query1:7071 -Dspark.driver.param.taskId=c3179a16-b198-46b1-8242-38e77024c91a-01 -Dspark.driver.local.logDir=/srv/kylin/kylin-job/logs/spark' --conf 'spark.master=yarn' --conf 'spark.executor.extraJavaOptions=-Dfile.encoding=UTF-8 -Dhdp.version=current -Dlog4j.configuration=spark-executor-log4j-default.properties -Dlog4j.debug -Dkylin.hdfs.working.dir=hdfs://xxxx:8020/kylin4_3/kylin_metadata/ -Dkylin.metadata.identifier=kylin_metadata -Dkylin.spark.category=job -Dkylin.spark.project=bjdw -Dkylin.spark.identifier=c3179a16-b198-46b1-8242-38e77024c91a -Dkylin.spark.jobName=c3179a16-b198-46b1-8242-38e77024c91a-01 -Duser.timezone=GMT+8' --conf 'spark.hadoop.yarn.timeline-service.enabled=false' --conf 'spark.driver.cores=2' --conf 'spark.executor.memory=6G' --conf 'spark.eventLog.enabled=true' --conf 'spark.eventLog.dir=hdfs:///kylin4_3/spark-history' --conf 'spark.executor.cores=2' --conf 'spark.sql.hive.metastore.jars=/opt/cloudera/parcels/CDH/lib/hive/lib/*:/opt/cloudera/parcels/CDH/lib/hadoop/*:/opt/cloudera/parcels/CDH/lib/hadoop/lib/*:/opt/cloudera/parcels/CDH/lib/hadoop-hdfs/*:/opt/cloudera/parcels/CDH/lib/hadoop-yarn/*:/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/*' --conf 'spark.executor.memoryOverhead=1024M' --conf 'spark.driver.memory=4G' --conf 'spark.driver.memoryOverhead=512M' --conf 'spark.submit.deployMode=client' --conf 'spark.executor.extraClassPath=kylin-parquet-job-4.0.3.jar' --conf 'spark.driver.extraClassPath=/srv/kylin/kylin-job/lib/kylin-parquet-job-4.0.3.jar' --files /srv/kylin/kylin-job/conf/spark-executor-log4j-default.properties --name job_step_c3179a16-b198-46b1-8242-38e77024c91a-01 --jars /srv/kylin/kylin-job/lib/kylin-parquet-job-4.0.3.jar /srv/kylin/kylin-job/lib/kylin-parquet-job-4.0.3.jar -className org.apache.kylin.engine.spark.job.CubeBuildJob hdfs://xxxx:8020/kylin4_3/kylin_metadata/bjdw/job_tmp/c3179a16-b198-46b1-8242-38e77024c91a-01_jobId
    at org.apache.kylin.common.util.CliCommandExecutor.execute(CliCommandExecutor.java:99)
    at org.apache.kylin.engine.spark.job.NSparkExecutable.runSparkSubmit(NSparkExecutable.java:295)
    at org.apache.kylin.engine.spark.job.NSparkExecutable.doWork(NSparkExecutable.java:181)
    at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:206)
    at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:94)
    at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:206)
    at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:113)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

李甜彪 | ltb1...@163.com
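
For context on the failure itself: "Could not initialize class org.apache.hadoop.hive.conf.HiveConf$ConfVars" means the static initializer of ConfVars has already thrown once on that executor; only the very first failing task logs the original ExceptionInInitializerError, and every later task (such as task 0.3 in stage 74.0 above) only reports this follow-up NoClassDefFoundError. That pattern usually points to a jar conflict on the executor classpath (for example between the CDH Hive jars listed in spark.sql.hive.metastore.jars and the libraries bundled with the Kylin Spark distribution) rather than anything in the data. The following is only a rough, hypothetical probe for narrowing that down (the class name HiveConfProbe is made up for illustration and is not part of Kylin); run on an executor host with the same jars the build job sees, it should surface the hidden root-cause exception directly:

// Hypothetical standalone probe: compile and run with the same jars the executors use, e.g.
//   javac -cp '/opt/cloudera/parcels/CDH/lib/hive/lib/*:/opt/cloudera/parcels/CDH/lib/hadoop/*' HiveConfProbe.java
//   java  -cp '.:/opt/cloudera/parcels/CDH/lib/hive/lib/*:/opt/cloudera/parcels/CDH/lib/hadoop/*:<other executor jars>' HiveConfProbe
public class HiveConfProbe {
    public static void main(String[] args) {
        // Print which jar HiveConf is actually loaded from, to spot stray or duplicate copies.
        System.out.println("HiveConf loaded from: "
                + org.apache.hadoop.hive.conf.HiveConf.class
                        .getProtectionDomain().getCodeSource().getLocation());
        // Touching the ConfVars enum forces its static initialization. If the classpath is
        // broken, the original ExceptionInInitializerError (the real cause hidden behind
        // "Could not initialize class") is thrown and printed here.
        System.out.println("hive.metastore.uris key = "
                + org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS.varname);
    }
}

If the probe fails with the full executor jar list but succeeds with only the Hive/Hadoop jars (or vice versa), that narrows the conflict down to a specific jar; the executor log of the very first failed attempt of task 0 in stage 74.0 on the store2 node should show the same underlying exception.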