[ https://issues.apache.org/jira/browse/PHOENIX-3506?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16389512#comment-16389512 ]
Rene Castberg commented on PHOENIX-3506: ---------------------------------------- I am facing the same issue. I have tried both the JDBC connector and the Spark DataFrame method; both give the same result (code shown below). Is there any solution or workaround for this yet? {code:java} df = sqlContext.read \ .format("org.apache.phoenix.spark") \ .option("table", table) \ .option("zkUrl", zkURL) \ .load() df.show(){code} {code:java} --------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <ipython-input-57-b7d9a96770d4> in <module>() ----> 1 df.take(5) /usr/hdp/2.6.4.0-91/spark2/python/pyspark/sql/dataframe.py in take(self, num) 474 [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] 475 """ --> 476 return self.limit(num).collect() 477 478 @since(1.3) /usr/hdp/2.6.4.0-91/spark2/python/pyspark/sql/dataframe.py in collect(self) 436 """ 437 with SCCallSiteSync(self._sc) as css: --> 438 port = self._jdf.collectToPython() 439 return list(_load_from_socket(port, BatchedSerializer(PickleSerializer()))) 440 /usr/hdp/2.6.4.0-91/spark2/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args) 1131 answer = self.gateway_client.send_command(command) 1132 return_value = get_return_value( -> 1133 answer, self.gateway_client, self.target_id, self.name) 1134 1135 for temp_arg in temp_args: /usr/hdp/2.6.4.0-91/spark2/python/pyspark/sql/utils.py in deco(*a, **kw) 61 def deco(*a, **kw): 62 try: ---> 63 return f(*a, **kw) 64 except py4j.protocol.Py4JJavaError as e: 65 s = e.java_exception.toString() /usr/hdp/2.6.4.0-91/spark2/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 317 raise Py4JJavaError( 318 "An error occurred while calling {0}{1}{2}.\n". --> 319 format(target_id, ".", name), value) 320 else: 321 raise Py4JError( Py4JJavaError: An error occurred while calling o1348.collectToPython. 
: java.sql.SQLException: Unable to resolve these column names: a,b,c,d,f,g,h,i,j Available columns with column families: PK,A.a,A.b,A.c,A.d,A.f,A.g,A.h,A.i,A.j,A.k,A.l,B.a,B.b,B.c,B.d,B.e,C.a,C.b,C.d,C.c,C.f,C.g,C.h,C.i,C.j at org.apache.phoenix.util.PhoenixRuntime.generateColumnInfo(PhoenixRuntime.java:475) at org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil.getSelectColumnMetadataList(PhoenixConfigurationUtil.java:294) at org.apache.phoenix.spark.PhoenixRDD.toDataFrame(PhoenixRDD.scala:118) at org.apache.phoenix.spark.PhoenixRelation.buildScan(PhoenixRelation.scala:47) at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$11.apply(DataSourceStrategy.scala:288) at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$11.apply(DataSourceStrategy.scala:288) at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$pruneFilterProject$1.apply(DataSourceStrategy.scala:332) at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$pruneFilterProject$1.apply(DataSourceStrategy.scala:331) at org.apache.spark.sql.execution.datasources.DataSourceStrategy.pruneFilterProjectRaw(DataSourceStrategy.scala:412) at org.apache.spark.sql.execution.datasources.DataSourceStrategy.pruneFilterProject(DataSourceStrategy.scala:327) at org.apache.spark.sql.execution.datasources.DataSourceStrategy.apply(DataSourceStrategy.scala:284) at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:62) at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:62) at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434) at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439) at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92) at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:77) at 
org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:74) at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157) at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157) at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1336) at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:74) at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:66) at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434) at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92) at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:84) at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:80) at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:89) at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:89) at org.apache.spark.sql.execution.QueryExecution$$anonfun$completeString$3.apply(QueryExecution.scala:229) at org.apache.spark.sql.execution.QueryExecution$$anonfun$completeString$3.apply(QueryExecution.scala:229) at org.apache.spark.sql.execution.QueryExecution.stringOrError(QueryExecution.scala:112) at org.apache.spark.sql.execution.QueryExecution.completeString(QueryExecution.scala:229) at org.apache.spark.sql.execution.QueryExecution.toString(QueryExecution.scala:202) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:62) at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2828) at 
org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:2805) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:280) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:214) at java.lang.Thread.run(Thread.java:745) {code} > Phoenix-Spark plug in cannot select by column family name > --------------------------------------------------------- > > Key: PHOENIX-3506 > URL: https://issues.apache.org/jira/browse/PHOENIX-3506 > Project: Phoenix > Issue Type: Bug > Reporter: Xindian Long > Priority: Major > > I have a table with multiple column families that may share the same column names. > I want to use the phoenix-spark plugin to select some of the fields, but it > returns an AnalysisException (details in the attached file). > It works when there is no column family, but I should not need to ensure that > column names are unique across different column families. 
> I used the following code: > ---- > public void testSpark(JavaSparkContext sc, String tableStr, String > dataSrcUrl) { > //SparkContextBuilder.buildSparkContext("Simple Application", "local"); > // One JVM can only have one Spark Context now > Map<String, String> options = new HashMap<String, String>(); > SQLContext sqlContext = new SQLContext(sc); > options.put("zkUrl", dataSrcUrl); > options.put("table", tableStr); > log.info("Phoenix DB URL: " + dataSrcUrl + " tableStr: " + tableStr); > DataFrame df = null; > try { > df = > sqlContext.read().format("org.apache.phoenix.spark").options(options).load(); > df.explain(true); > df.show(); > df = df.select("I.CI", "I.FA"); > //df = df.select("\"I\".\"CI\"", "\"I\".\"FA\""); // This gives the > same exception too > } catch (Exception ex) { > log.error("sql error: ", ex); > } > try { > log.info("Count By phoenix spark plugin: " + df.count()); > } catch (Exception ex) { > log.error("dataframe error: ", ex); > } > } > ----- > > I can see in the log that there is something like > > 10728 [INFO] main org.apache.phoenix.mapreduce.PhoenixInputFormat - Select > Statement: SELECT > 
"RID","I"."CI","I"."FA","I"."FPR","I"."FPT","I"."FR","I"."LAT","I"."LNG","I"."NCG","I"."NGPD","I"."VE","I"."VMJ","I"."VMR","I"."VP","I"."CSRE","I"."VIB","I"."IIICS","I"."LICSCD","I"."LEDC","I"."ARM","I"."FBM","I"."FTB","I"."NA2FR","I"."NA2PT","S"."AHDM","S"."ARTJ","S"."ATBM","S"."ATBMR","S"."ATBR","S"."ATBRR","S"."CS","S"."LAMT","S"."LTFCT","S"."LBMT","S"."LDTI","S"."LMT","S"."LMTN","S"."LMTR","S"."LPET","S"."LPORET","S"."LRMT","S"."LRMTP","S"."LRMTR","S"."LSRT","S"."LSST","S"."MHDMS0","S"."MHDMS1","S"."RFD","S"."RRN","S"."RRR","S"."TD","S"."TSM","S"."TC","S"."TPM","S"."LRMCT","S"."SS13FSK34","S"."LERMT","S"."LEMDMT","S"."AGTBRE","S"."SRM","S"."LTET","S"."TPMS","S"."TPMSM","S"."TM","S"."TMF","S"."TMFM","S"."NA2TLS","S"."NA2IT","S"."CWR","S"."BPR","S"."LR","S"."HLB","S"."NA2UFTBFR","S"."DT","S"."NA28ARE","S"."RM","S"."LMTB","S"."LRMTB","S"."RRB","P"."BADUC","P"."UAN","P"."BAPS","P"."BAS","P"."UAS","P"."BATBBR","P"."BBRI","P"."BLBR","P"."ULHT","P"."BLPST","P"."BLPT","P"."UTI","P"."UUC" > FROM TESTING.ENDPOINTS > > But obviously, the column family is left out of the Dataframe column name > somewhere in the process. > Need a fix that can select by ColumnFamilyName.ColumnQualifier -- This message was sent by Atlassian JIRA (v7.6.3#76005)