[ https://issues.apache.org/jira/browse/SPARK-44354?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Kai-Michael Roesner updated SPARK-44354:
----------------------------------------
Description:
When trying to create a dataframe with a CharType or VarcharType column like so:
{code}
from datetime import date
from decimal import Decimal
from pyspark.sql import SparkSession
from pyspark.sql.types import *
data = [
  (1, 'abc', Decimal(3.142), date(2023, 1, 1)),
  (2, 'bcd', Decimal(1.414), date(2023, 1, 2)),
  (3, 'cde', Decimal(2.718), date(2023, 1, 3))]
schema = StructType([
  StructField('INT', IntegerType()),
  StructField('STR', CharType(3)),
  StructField('DEC', DecimalType(4, 3)),
  StructField('DAT', DateType())])
spark = SparkSession.builder.appName('data-types').getOrCreate()
df = spark.createDataFrame(data, schema)
df.show()
{code}
a {{java.lang.IllegalStateException}} is thrown [here|https://github.com/apache/spark/blob/85e252e8503534009f4fb5ea005d44c9eda31447/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala#L168].
I'm expecting this to work...
PS: Excerpt from the logs:
{code}
py4j.protocol.Py4JJavaError: An error occurred while calling o24.applySchemaToPythonRDD.
: java.lang.IllegalStateException: [BUG] logical plan should not have output of char/varchar type: LogicalRDD [INT#0, STR#1, DEC#2, DAT#3], false at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$1(CheckAnalysis.scala:168) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$1$adapted(CheckAnalysis.scala:163) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:295) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis0(CheckAnalysis.scala:163) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis0$(CheckAnalysis.scala:160) at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis0(Analyzer.scala:188) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis(CheckAnalysis.scala:156) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis$(CheckAnalysis.scala:146) at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:188) at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:211) at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330) at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:208) at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:76) at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:202) at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:202) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827) at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:201) at 
org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:76) at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74) at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66) at org.apache.spark.sql.Dataset$.$anonfun$ofRows$1(Dataset.scala:90) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827) at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:88) at org.apache.spark.sql.SparkSession.internalCreateDataFrame(SparkSession.scala:571) at org.apache.spark.sql.SparkSession.applySchemaToPythonRDD(SparkSession.scala:804) at org.apache.spark.sql.SparkSession.applySchemaToPythonRDD(SparkSession.scala:789) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:566) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) at py4j.ClientServerConnection.run(ClientServerConnection.java:106) at java.base/java.lang.Thread.run(Thread.java:829) {code} was: When trying to create a dataframe with a CharType or VarcharType column like so: {code} from datetime import date from decimal import Decimal from pyspark.sql import SparkSession from pyspark.sql.types import * data = [ (1, 'abc', Decimal(3.142), date(2023, 1, 1)), (2, 'bcd', Decimal(1.414), date(2023, 1, 2)), (3, 'cde', Decimal(2.718), date(2023, 1, 3))] schema = StructType([ StructField('INT', IntegerType()), 
StructField('STR', CharType(3)), StructField('DEC', DecimalType(4, 3)), StructField('DAT', DateType())]) spark = SparkSession.builder.appName('data-types').getOrCreate() df = spark.createDataFrame(data, schema) df.show() {code} a {{java.lang.IllegalStateException}} is thrown [here|https://github.com/apache/spark/blob/85e252e8503534009f4fb5ea005d44c9eda31447/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala#L168]. I'm expecting this to work... > Cannot create dataframe with CharType/VarcharType column > -------------------------------------------------------- > > Key: SPARK-44354 > URL: https://issues.apache.org/jira/browse/SPARK-44354 > Project: Spark > Issue Type: Bug > Components: PySpark, SQL > Affects Versions: 3.4.0 > Reporter: Kai-Michael Roesner > Priority: Major > > When trying to create a dataframe with a CharType or VarcharType column like > so: > {code} > from datetime import date > from decimal import Decimal > from pyspark.sql import SparkSession > from pyspark.sql.types import * > data = [ > (1, 'abc', Decimal(3.142), date(2023, 1, 1)), > (2, 'bcd', Decimal(1.414), date(2023, 1, 2)), > (3, 'cde', Decimal(2.718), date(2023, 1, 3))] > schema = StructType([ > StructField('INT', IntegerType()), > StructField('STR', CharType(3)), > StructField('DEC', DecimalType(4, 3)), > StructField('DAT', DateType())]) > spark = SparkSession.builder.appName('data-types').getOrCreate() > df = spark.createDataFrame(data, schema) > df.show() > {code} > a {{java.lang.IllegalStateException}} is thrown > [here|https://github.com/apache/spark/blob/85e252e8503534009f4fb5ea005d44c9eda31447/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala#L168]. > I'm expecting this to work... > PS: Excerpt from the logs: > {code} > py4j.protocol.Py4JJavaError: An error occurred while calling > o24.applySchemaToPythonRDD. 
> : java.lang.IllegalStateException: [BUG] logical plan should not have output > of char/varchar type: LogicalRDD [INT#0, STR#1, DEC#2, DAT#3], false > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$1(CheckAnalysis.scala:168) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$1$adapted(CheckAnalysis.scala:163) > at > org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:295) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis0(CheckAnalysis.scala:163) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis0$(CheckAnalysis.scala:160) > at > org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis0(Analyzer.scala:188) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis(CheckAnalysis.scala:156) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis$(CheckAnalysis.scala:146) > at > org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:188) > at > org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:211) > at > org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330) > at > org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:208) > at > org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:76) > at > org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111) > at > org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:202) > at > org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526) > at > org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:202) > at > org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827) > at > 
org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:201) > at > org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:76) > at > org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74) > at > org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66) > at org.apache.spark.sql.Dataset$.$anonfun$ofRows$1(Dataset.scala:90) > at > org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827) > at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:88) > at > org.apache.spark.sql.SparkSession.internalCreateDataFrame(SparkSession.scala:571) > at > org.apache.spark.sql.SparkSession.applySchemaToPythonRDD(SparkSession.scala:804) > at > org.apache.spark.sql.SparkSession.applySchemaToPythonRDD(SparkSession.scala:789) > at > java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.base/java.lang.reflect.Method.invoke(Method.java:566) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374) > at py4j.Gateway.invoke(Gateway.java:282) > at > py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at > py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) > at py4j.ClientServerConnection.run(ClientServerConnection.java:106) > at java.base/java.lang.Thread.run(Thread.java:829) > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org