[ 
https://issues.apache.org/jira/browse/SPARK-27689?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Juan Antonio updated SPARK-27689:
---------------------------------
    Priority: Major  (was: Minor)

> Error to execute hive views with spark
> --------------------------------------
>
>                 Key: SPARK-27689
>                 URL: https://issues.apache.org/jira/browse/SPARK-27689
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.3.0, 2.3.3, 2.4.3
>            Reporter: Juan Antonio
>            Priority: Major
>
> I have a python error when I execute the following code using hive views but 
> it works correctly when I run it with hive tables.
> *Hive databases:*
> {code:java}
> CREATE DATABASE schema_p LOCATION "hdfs:///tmp/schema_p";
> {code}
> *Hive tables:*
> {code:java}
> CREATE TABLE schema_p.person(
>  id_person string, 
>  identifier string, 
>  gender string, 
>  start_date string, 
>  end_date string)
>  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
>  STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
>  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
>  LOCATION 'hdfs:///tmp/schema_p/person';
> {code}
> {code:java}
> CREATE TABLE schema_p.product(
>  id_product string,
>  name string,
>  country string,
>  city string,
>  start_date string,
>  end_date string
>  )
>  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
>  STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
>  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
>  LOCATION 'hdfs:///tmp/schema_p/product';
> {code}
> {code:java}
> CREATE TABLE schema_p.person_product(
>  id_person string,
>  id_product string,
>  country string,
>  city string,
>  price string,
>  start_date string,
>  end_date string
>  )
>  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
>  STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
>  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
>  LOCATION 'hdfs:///tmp/schema_p/person_product';
> {code}
> *Hive views:*
> {code:java}
> CREATE VIEW schema_p.person_v AS SELECT CAST(id_person AS INT) AS id_person, 
> CAST(identifier AS INT) AS identifier, gender AS gender, CAST(start_date AS 
> DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person;
>  CREATE VIEW schema_p.product_v AS SELECT CAST(id_product AS INT) AS 
> id_product, name AS name, country AS country, city AS city, CAST(start_date 
> AS DATE) AS start_date, CAST(end_date AS DATE) AS end_date FROM 
> schema_p.product;
>  CREATE VIEW schema_p.person_product_v AS SELECT CAST(id_person AS INT) AS 
> id_person, CAST(id_product AS INT) AS id_product, country AS country, city AS 
> city, CAST(price AS DECIMAL(38,8)) AS price, CAST(start_date AS DATE) AS 
> start_date, CAST(end_date AS DATE) AS end_date FROM schema_p.person_product;
> {code}
> *********************************************
> *Code*:
>  
> {code}
> def read_tables(sc):
>   in_dict = { 'person': 'person_v', 'product': 'product_v', 'person_product': 
> 'person_product_v' }
>   data_dict = {}
>   for n, d in in_dict.iteritems():
>     data_dict[n] = sc.read.table(d)
>   return data_dict
> def get_population(tables, ref_date_str):
>   person = tables['person']
>   product = tables['product']
>   person_product = tables['person_product']
>   person_product_join = person_product.join(product,'id_product')
>   count_prod = 
> person_product.groupBy('id_product').agg(F.count('id_product').alias('count_prod'))
>   person_count = person_product_join.join(count_prod,'id_product')
>   final1 = person_product_join.join(person_count, 'id_person', 'left')
>   final = final1.withColumn('reference_date', F.lit(ref_date_str))
>   return final
> import pyspark.sql.functions as F
> import functools
> from pyspark.sql.functions import col
> from pyspark.sql.functions import add_months, lit, count, coalesce
> spark.sql('use schema_p')
> data_dict = read_tables(spark)
> data_dict
> population = get_population(data_dict, '2019-04-30')
> population.show(){code}
>  
> *Error:*
> {code:java}
> File "<stdin>", line 1, in <module>
>  File "<stdin>", line 10, in get_population
>  File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 
> 931, in join
>  jdf = self._jdf.join(other._jdf, on, how)
>  File 
> "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py",
>  line 1160, in __call__
>  File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, 
> in deco
>  raise AnalysisException(s.split(': ', 1)[1], stackTrace)
>  pyspark.sql.utils.AnalysisException: u'Resolved attribute(s) 
> id_product#124,end_date#129,city#126,price#127,start_date#128,id_person#123,country#125
>  missing from 
> city#47,price#48,start_date#49,id_product#45,end_date#50,id_person#44,country#46
>  in operator !Project [cast(id_person#123 as int) AS id_person#96, 
> cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS 
> country#98, cast(city#126 as string) AS city#99, cast(price#127 as 
> decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101, 
> cast(end_date#129 as date) AS end_date#102|#123 as int) AS id_person#96, 
> cast(id_product#124 as int) AS id_product#97, cast(country#125 as string) AS 
> country#98, cast(city#126 as string) AS city#99, cast(price#127 as 
> decimal(38,8)) AS price#100, cast(start_date#128 as date) AS start_date#101, 
> cast(end_date#129 as date) AS end_date#102]. Attribute(s) with the same name 
> appear in the operation: 
> id_product,end_date,city,price,start_date,id_person,country. Please check if 
> the right attribute(s) are used.;;
>  Project [id_person#44, id_product#45, country#46, city#47, price#48, 
> start_date#49, end_date#50, name#21, country#22, city#23, start_date#24, 
> end_date#25, id_product#124, country#125, city#126, price#127, 
> start_date#128, end_date#129, name#157, country#158, city#159, 
> start_date#160, end_date#161, count_prod#93L|#44, id_product#45, country#46, 
> city#47, price#48, start_date#49, end_date#50, name#21, country#22, city#23, 
> start_date#24, end_date#25, id_product#124, country#125, city#126, price#127, 
> start_date#128, end_date#129, name#157, country#158, city#159, 
> start_date#160, end_date#161, count_prod#93L]
>  +- Join LeftOuter, (id_person#44 = id_person#123)
>  :- Project [id_product#45, id_person#44, country#46, city#47, price#48, 
> start_date#49, end_date#50, name#21, country#22, city#23, start_date#24, 
> end_date#25|#45, id_person#44, country#46, city#47, price#48, start_date#49, 
> end_date#50, name#21, country#22, city#23, start_date#24, end_date#25]
>  : +- Join Inner, (id_product#45 = id_product#20)
>  : :- SubqueryAlias person_product_v
>  : : +- View (`schema_p`.`person_product_v`, 
> [id_person#44,id_product#45,country#46,city#47,price#48,start_date#49,end_date#50|#44,id_product#45,country#46,city#47,price#48,start_date#49,end_date#50])
>  : : +- Project [cast(id_person#51 as int) AS id_person#44, 
> cast(id_product#52 as int) AS id_product#45, cast(country#53 as string) AS 
> country#46, cast(city#54 as string) AS city#47, cast(price#55 as 
> decimal(38,8)) AS price#48, cast(start_date#56 as date) AS start_date#49, 
> cast(end_date#57 as date) AS end_date#50|#51 as int) AS id_person#44, 
> cast(id_product#52 as int) AS id_product#45, cast(country#53 as string) AS 
> country#46, cast(city#54 as string) AS city#47, cast(price#55 as 
> decimal(38,8)) AS price#48, cast(start_date#56 as date) AS start_date#49, 
> cast(end_date#57 as date) AS end_date#50]
>  : : +- Project [cast(id_person#58 as int) AS id_person#51, 
> cast(id_product#59 as int) AS id_product#52, country#60 AS country#53, 
> city#61 AS city#54, cast(price#62 as decimal(38,8)) AS price#55, 
> cast(start_date#63 as date) AS start_date#56, cast(end_date#64 as date) AS 
> end_date#57|#58 as int) AS id_person#51, cast(id_product#59 as int) AS 
> id_product#52, country#60 AS country#53, city#61 AS city#54, cast(price#62 as 
> decimal(38,8)) AS price#55, cast(start_date#63 as date) AS start_date#56, 
> cast(end_date#64 as date) AS end_date#57]
>  : : +- SubqueryAlias person_product
>  : : +- HiveTableRelation `schema_p`.`person_product`, 
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#58, 
> id_product#59, country#60, city#61, price#62, start_date#63, end_date#64|#58, 
> id_product#59, country#60, city#61, price#62, start_date#63, end_date#64]
>  : +- SubqueryAlias product_v
>  : +- View (`schema_p`.`product_v`, 
> [id_product#20,name#21,country#22,city#23,start_date#24,end_date#25|#20,name#21,country#22,city#23,start_date#24,end_date#25])
>  : +- Project [cast(id_product#26 as int) AS id_product#20, cast(name#27 as 
> string) AS name#21, cast(country#28 as string) AS country#22, cast(city#29 as 
> string) AS city#23, cast(start_date#30 as date) AS start_date#24, 
> cast(end_date#31 as date) AS end_date#25|#26 as int) AS id_product#20, 
> cast(name#27 as string) AS name#21, cast(country#28 as string) AS country#22, 
> cast(city#29 as string) AS city#23, cast(start_date#30 as date) AS 
> start_date#24, cast(end_date#31 as date) AS end_date#25]
>  : +- Project [cast(id_product#32 as int) AS id_product#26, name#33 AS 
> name#27, country#34 AS country#28, city#35 AS city#29, cast(start_date#36 as 
> date) AS start_date#30, cast(end_date#37 as date) AS end_date#31|#32 as int) 
> AS id_product#26, name#33 AS name#27, country#34 AS country#28, city#35 AS 
> city#29, cast(start_date#36 as date) AS start_date#30, cast(end_date#37 as 
> date) AS end_date#31]
>  : +- SubqueryAlias product
>  : +- HiveTableRelation `schema_p`.`product`, 
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_product#32, name#33, 
> country#34, city#35, start_date#36, end_date#37|#32, name#33, country#34, 
> city#35, start_date#36, end_date#37]
>  +- Project [id_product#124, id_person#123, country#125, city#126, price#127, 
> start_date#128, end_date#129, name#157, country#158, city#159, 
> start_date#160, end_date#161, count_prod#93L|#124, id_person#123, 
> country#125, city#126, price#127, start_date#128, end_date#129, name#157, 
> country#158, city#159, start_date#160, end_date#161, count_prod#93L]
>  +- Join Inner, (id_product#124 = id_product#97)
>  :- Project [id_product#124, id_person#123, country#125, city#126, price#127, 
> start_date#128, end_date#129, name#157, country#158, city#159, 
> start_date#160, end_date#161|#124, id_person#123, country#125, city#126, 
> price#127, start_date#128, end_date#129, name#157, country#158, city#159, 
> start_date#160, end_date#161]
>  : +- Join Inner, (id_product#124 = id_product#156)
>  : :- SubqueryAlias person_product_v
>  : : +- View (`schema_p`.`person_product_v`, 
> [id_person#123,id_product#124,country#125,city#126,price#127,start_date#128,end_date#129|#123,id_product#124,country#125,city#126,price#127,start_date#128,end_date#129])
>  : : +- Project [cast(id_person#44 as int) AS id_person#123, 
> cast(id_product#45 as int) AS id_product#124, cast(country#46 as string) AS 
> country#125, cast(city#47 as string) AS city#126, cast(price#48 as 
> decimal(38,8)) AS price#127, cast(start_date#49 as date) AS start_date#128, 
> cast(end_date#50 as date) AS end_date#129|#44 as int) AS id_person#123, 
> cast(id_product#45 as int) AS id_product#124, cast(country#46 as string) AS 
> country#125, cast(city#47 as string) AS city#126, cast(price#48 as 
> decimal(38,8)) AS price#127, cast(start_date#49 as date) AS start_date#128, 
> cast(end_date#50 as date) AS end_date#129]
>  : : +- Project [cast(id_person#51 as int) AS id_person#44, 
> cast(id_product#52 as int) AS id_product#45, cast(country#53 as string) AS 
> country#46, cast(city#54 as string) AS city#47, cast(price#55 as 
> decimal(38,8)) AS price#48, cast(start_date#56 as date) AS start_date#49, 
> cast(end_date#57 as date) AS end_date#50|#51 as int) AS id_person#44, 
> cast(id_product#52 as int) AS id_product#45, cast(country#53 as string) AS 
> country#46, cast(city#54 as string) AS city#47, cast(price#55 as 
> decimal(38,8)) AS price#48, cast(start_date#56 as date) AS start_date#49, 
> cast(end_date#57 as date) AS end_date#50]
>  : : +- Project [cast(id_person#58 as int) AS id_person#51, 
> cast(id_product#59 as int) AS id_product#52, country#60 AS country#53, 
> city#61 AS city#54, cast(price#62 as decimal(38,8)) AS price#55, 
> cast(start_date#63 as date) AS start_date#56, cast(end_date#64 as date) AS 
> end_date#57|#58 as int) AS id_person#51, cast(id_product#59 as int) AS 
> id_product#52, country#60 AS country#53, city#61 AS city#54, cast(price#62 as 
> decimal(38,8)) AS price#55, cast(start_date#63 as date) AS start_date#56, 
> cast(end_date#64 as date) AS end_date#57]
>  : : +- SubqueryAlias person_product
>  : : +- HiveTableRelation `schema_p`.`person_product`, 
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#58, 
> id_product#59, country#60, city#61, price#62, start_date#63, end_date#64|#58, 
> id_product#59, country#60, city#61, price#62, start_date#63, end_date#64]
>  : +- SubqueryAlias product_v
>  : +- View (`schema_p`.`product_v`, 
> [id_product#156,name#157,country#158,city#159,start_date#160,end_date#161|#156,name#157,country#158,city#159,start_date#160,end_date#161])
>  : +- Project [cast(id_product#20 as int) AS id_product#156, cast(name#21 as 
> string) AS name#157, cast(country#22 as string) AS country#158, cast(city#23 
> as string) AS city#159, cast(start_date#24 as date) AS start_date#160, 
> cast(end_date#25 as date) AS end_date#161|#20 as int) AS id_product#156, 
> cast(name#21 as string) AS name#157, cast(country#22 as string) AS 
> country#158, cast(city#23 as string) AS city#159, cast(start_date#24 as date) 
> AS start_date#160, cast(end_date#25 as date) AS end_date#161]
>  : +- Project [cast(id_product#26 as int) AS id_product#20, cast(name#27 as 
> string) AS name#21, cast(country#28 as string) AS country#22, cast(city#29 as 
> string) AS city#23, cast(start_date#30 as date) AS start_date#24, 
> cast(end_date#31 as date) AS end_date#25|#26 as int) AS id_product#20, 
> cast(name#27 as string) AS name#21, cast(country#28 as string) AS country#22, 
> cast(city#29 as string) AS city#23, cast(start_date#30 as date) AS 
> start_date#24, cast(end_date#31 as date) AS end_date#25]
>  : +- Project [cast(id_product#32 as int) AS id_product#26, name#33 AS 
> name#27, country#34 AS country#28, city#35 AS city#29, cast(start_date#36 as 
> date) AS start_date#30, cast(end_date#37 as date) AS end_date#31|#32 as int) 
> AS id_product#26, name#33 AS name#27, country#34 AS country#28, city#35 AS 
> city#29, cast(start_date#36 as date) AS start_date#30, cast(end_date#37 as 
> date) AS end_date#31]
>  : +- SubqueryAlias product
>  : +- HiveTableRelation `schema_p`.`product`, 
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_product#32, name#33, 
> country#34, city#35, start_date#36, end_date#37|#32, name#33, country#34, 
> city#35, start_date#36, end_date#37]
>  +- Aggregate [id_product#97|#97], [id_product#97, count(id_product#97) AS 
> count_prod#93L|#97, count(id_product#97) AS count_prod#93L]
>  +- SubqueryAlias person_product_v
>  +- View (`schema_p`.`person_product_v`, 
> [id_person#96,id_product#97,country#98,city#99,price#100,start_date#101,end_date#102|#96,id_product#97,country#98,city#99,price#100,start_date#101,end_date#102])
>  +- !Project [cast(id_person#123 as int) AS id_person#96, cast(id_product#124 
> as int) AS id_product#97, cast(country#125 as string) AS country#98, 
> cast(city#126 as string) AS city#99, cast(price#127 as decimal(38,8)) AS 
> price#100, cast(start_date#128 as date) AS start_date#101, cast(end_date#129 
> as date) AS end_date#102|#123 as int) AS id_person#96, cast(id_product#124 as 
> int) AS id_product#97, cast(country#125 as string) AS country#98, 
> cast(city#126 as string) AS city#99, cast(price#127 as decimal(38,8)) AS 
> price#100, cast(start_date#128 as date) AS start_date#101, cast(end_date#129 
> as date) AS end_date#102]
>  +- Project [cast(id_person#51 as int) AS id_person#44, cast(id_product#52 as 
> int) AS id_product#45, cast(country#53 as string) AS country#46, cast(city#54 
> as string) AS city#47, cast(price#55 as decimal(38,8)) AS price#48, 
> cast(start_date#56 as date) AS start_date#49, cast(end_date#57 as date) AS 
> end_date#50|#51 as int) AS id_person#44, cast(id_product#52 as int) AS 
> id_product#45, cast(country#53 as string) AS country#46, cast(city#54 as 
> string) AS city#47, cast(price#55 as decimal(38,8)) AS price#48, 
> cast(start_date#56 as date) AS start_date#49, cast(end_date#57 as date) AS 
> end_date#50]
>  +- Project [cast(id_person#58 as int) AS id_person#51, cast(id_product#59 as 
> int) AS id_product#52, country#60 AS country#53, city#61 AS city#54, 
> cast(price#62 as decimal(38,8)) AS price#55, cast(start_date#63 as date) AS 
> start_date#56, cast(end_date#64 as date) AS end_date#57|#58 as int) AS 
> id_person#51, cast(id_product#59 as int) AS id_product#52, country#60 AS 
> country#53, city#61 AS city#54, cast(price#62 as decimal(38,8)) AS price#55, 
> cast(start_date#63 as date) AS start_date#56, cast(end_date#64 as date) AS 
> end_date#57]
>  +- SubqueryAlias person_product
>  +- HiveTableRelation `schema_p`.`person_product`, 
> org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id_person#58, 
> id_product#59, country#60, city#61, price#62, start_date#63, end_date#64|#58, 
> id_product#59, country#60, city#61, price#62, start_date#63, end_date#64]
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to