[jira] [Commented] (SPARK-32380) sparksql cannot access hive table while data in hbase
[ https://issues.apache.org/jira/browse/SPARK-32380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17629239#comment-17629239 ] Apache Spark commented on SPARK-32380:

User 'attilapiros' has created a pull request for this issue:
https://github.com/apache/spark/pull/38516

Issue details:

Key: SPARK-32380
URL: https://issues.apache.org/jira/browse/SPARK-32380
Project: Spark
Issue Type: Bug
Components: SQL
Affects Versions: 3.0.0
Environment:
||component||version||
|hadoop|2.8.5|
|hive|2.3.7|
|spark|3.0.0|
|hbase|1.4.9|
Reporter: deyzhong
Priority: Major
Original Estimate: 72h
Remaining Estimate: 72h

* step 1: create the HBase table
{code:java}
hbase(main):001:0> create 'hbase_test', 'cf1'
hbase(main):002:0> put 'hbase_test', 'r1', 'cf1:v1', '123'
{code}
* step 2: create the Hive external table mapped to the HBase table
{code:java}
hive>
CREATE EXTERNAL TABLE `hivetest.hbase_test`(
  `key` string COMMENT '',
  `value` string COMMENT '')
ROW FORMAT SERDE
  'org.apache.hadoop.hive.hbase.HBaseSerDe'
STORED BY
  'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
  'hbase.columns.mapping'=':key,cf1:v1',
  'serialization.format'='1')
TBLPROPERTIES (
  'hbase.table.name'='hbase_test')
{code}
* step 3: query the Hive table (with its data in HBase) from Spark SQL
{code:java}
spark-sql --master yarn -e "select * from hivetest.hbase_test"
{code}

The error log is as follows:
{code:java}
java.io.IOException: Cannot create a record reader because of a previous error. Please look at the previous logs lines from the task's full log for more details.
  at org.apache.hadoop.hbase.mapreduce.TableInputFormatBase.getSplits(TableInputFormatBase.java:270)
  at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:131)
  at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:276)
  at scala.Option.getOrElse(Option.scala:189)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:272)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
  at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:276)
  at scala.Option.getOrElse(Option.scala:189)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:272)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
  at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:276)
  at scala.Option.getOrElse(Option.scala:189)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:272)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
  at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:276)
  at scala.Option.getOrElse(Option.scala:189)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:272)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
  at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:276)
  at scala.Option.getOrElse(Option.scala:189)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:272)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
  at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
  at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
  at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:385)
  at org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:412)
  at org.apache.spark.sql.execution.HiveResult$.hiveResultString(HiveResult.scala:58)
  at org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.$anonfun$run$1(SparkSQLDriver.scala:65)
  at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
  at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
  at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
  at org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:65)
  at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:377)
  at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.$anonfun$processLine$1(SparkSQLCLIDriver.scala:496)
  at scala.collection.Iterator.foreach(Iterator.scala:941)
  ...
{code}
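For reference, a minimal spark-shell equivalent of the failing step 3 (a hypothetical session, not part of the original report; it assumes Hive support is enabled and the hive-hbase-handler and HBase client jars are on the classpath):

{code:java}
import org.apache.spark.sql.SparkSession

// Hypothetical app name; any Hive-enabled session will do.
val spark = SparkSession.builder()
  .appName("SPARK-32380-repro")
  .enableHiveSupport()
  .getOrCreate()

// On Spark 3.0.0 this fails while computing input partitions:
// NewHadoopRDD.getPartitions -> TableInputFormatBase.getSplits throws the
// IOException quoted in the stack trace above.
spark.sql("select * from hivetest.hbase_test").show()
{code}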
[jira] [Commented] (SPARK-32380) sparksql cannot access hive table while data in hbase
[ https://issues.apache.org/jira/browse/SPARK-32380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17627704#comment-17627704 ] Ranga Reddy commented on SPARK-32380:

The pull request below should resolve the issue, but it needs to be checked for any other regressions.
https://github.com/apache/spark/pull/29178
[jira] [Commented] (SPARK-32380) sparksql cannot access hive table while data in hbase
[ https://issues.apache.org/jira/browse/SPARK-32380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17622497#comment-17622497 ] Mehul Thakkar commented on SPARK-32380:

[~meimile] Since the changes were merged on Jan 11, 2021, I expected this issue to be fixed in version 3.0.2, but it is not fixed even in Spark 3.2.2.
[jira] [Commented] (SPARK-32380) sparksql cannot access hive table while data in hbase
[ https://issues.apache.org/jira/browse/SPARK-32380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17622266#comment-17622266 ] Mehul Thakkar commented on SPARK-32380:

I am waiting for a permanent fix for this issue in Spark 3.x.
[jira] [Commented] (SPARK-32380) sparksql cannot access hive table while data in hbase
[ https://issues.apache.org/jira/browse/SPARK-32380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17329119#comment-17329119 ] Yennam ShowryPremSagar Reddy commented on SPARK-32380:

[~meimile] Will there be a permanent fix for this issue in Spark 3 for accessing Hive tables built on top of HBase?
[jira] [Commented] (SPARK-32380) sparksql cannot access hive table while data in hbase
[ https://issues.apache.org/jira/browse/SPARK-32380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17263153#comment-17263153 ] Apache Spark commented on SPARK-32380:

User 'yangBottle' has created a pull request for this issue:
https://github.com/apache/spark/pull/31147
[jira] [Commented] (SPARK-32380) sparksql cannot access hive table while data in hbase
[ https://issues.apache.org/jira/browse/SPARK-32380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17162059#comment-17162059 ] Apache Spark commented on SPARK-32380:

User 'DeyinZhong' has created a pull request for this issue:
https://github.com/apache/spark/pull/29178
[jira] [Commented] (SPARK-32380) sparksql cannot access hive table while data in hbase
[ https://issues.apache.org/jira/browse/SPARK-32380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17161959#comment-17161959 ] deyzhong commented on SPARK-32380:

I have fixed this bug by modifying TableReader.scala. The solution: when the input format class is org.apache.hadoop.hive.hbase.HiveHBaseTableInputFormat, create a HadoopRDD (the old org.apache.hadoop.mapred API) instead of a NewHadoopRDD. I have tested this in my production environment as well. May I submit a PR to Spark? [~apachespark]
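For illustration, a minimal sketch of the dispatch this comment describes (a hypothetical shape, not the actual TableReader.scala patch; the helper name and parameters here are invented for the example):

{code:java}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapred.{InputFormat, JobConf}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Hive's HBase wrapper implements the old org.apache.hadoop.mapred API and
// only initializes the underlying HBase table on that code path; routing it
// through NewHadoopRDD leaves HBase's TableInputFormatBase uninitialized, so
// getSplits fails with the IOException shown in the description.
val hiveHBaseInputFormatName =
  "org.apache.hadoop.hive.hbase.HiveHBaseTableInputFormat"

// Hypothetical helper mirroring the idea; the real TableReader method differs.
def createRddForTable(
    sc: SparkContext,
    inputFormatClassName: String, // from the Hive table's storage descriptor
    jobConf: JobConf): RDD[Writable] = {
  if (inputFormatClassName == hiveHBaseInputFormatName) {
    // Old mapred API path: splits are computed via Hive's wrapper.
    val ifClass = Class.forName(inputFormatClassName)
      .asInstanceOf[Class[InputFormat[Writable, Writable]]]
    sc.hadoopRDD(jobConf, ifClass, classOf[Writable], classOf[Writable])
      .map(_._2)
  } else {
    // Default Spark 3.x path (new mapreduce API), left unchanged.
    throw new UnsupportedOperationException("sketch: sc.newAPIHadoopRDD(...)")
  }
}
{code}

This appears to be the approach taken in the pull request linked above, https://github.com/apache/spark/pull/29178.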