Hi,
We are trying to use the Spark connector for Atlas and we are encountering an
issue we do not understand. To reproduce, use a clean Atlas installation,
build the Atlas 1.0 connector from
https://github.com/hortonworks-spark/spark-atlas-connector
<https://github.com/hortonworks-spark/spark-atlas-connector>, and use Spark
2.3. Follow the instructions to add the listeners and then run:
scala>
Seq((1,2)).toDF("i","j").write.mode("overwrite").saveAsTable("default.atlas_bolke")
This is the result
org.apache.atlas.AtlasServiceException: Metadata service API
org.apache.atlas.AtlasClientV2$API_V2@1dfc2ecd failed with status 400 (Bad
Request) Response Body ({"errorCode":"ATLAS-400-00-036","errorMessage":"invalid
relationshipDef: avro_schema_associatedEntities: end type 1: DataSet, end type
2: spark_table"})
We have no clue why this error occurs. This relationship is not defined by the
Spark connector, nor is it referenced by it. This is the JSON dump of the entity
definition of spark_table:
{
"enumDefs": [],
"structDefs": [],
"classificationDefs": [],
"entityDefs": [{
"category": "ENTITY",
"guid": "3bd9315c-f159-4865-ac8d-11dbcca79adc",
"createdBy": "admin",
"updatedBy": "admin",
"createTime": 1539112556115,
"updateTime": 1539112556115,
"version": 1,
"name": "spark_table",
"description": "spark_table",
"typeVersion": "1.0",
"attributeDefs": [{
"name": "qualifiedName",
"typeName": "string",
"isOptional": false,
"cardinality": "SINGLE",
"valuesMinCount": 1,
"valuesMaxCount": 1,
"isUnique": true,
"isIndexable": true,
"includeInNotification": false
}, {
"name": "database",
"typeName": "spark_db",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "tableType",
"typeName": "string",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "storage",
"typeName": "spark_storagedesc",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false,
"constraints": [{
"type": "ownedRef"
}]
}, {
"name": "schema",
"typeName": "array<spark_column>",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false,
"constraints": [{
"type": "ownedRef"
}]
}, {
"name": "provider",
"typeName": "string",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "partitionColumnNames",
"typeName": "array<string>",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "bucketSpec",
"typeName": "map<string,string>",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "owner",
"typeName": "string",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "createTime",
"typeName": "long",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "lastAccessTime",
"typeName": "long",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "properties",
"typeName": "map<string,string>",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "comment",
"typeName": "string",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}, {
"name": "unsupportedFeatures",
"typeName": "array<string>",
"isOptional": true,
"cardinality": "SINGLE",
"valuesMinCount": 0,
"valuesMaxCount": 1,
"isUnique": false,
"isIndexable": false,
"includeInNotification": false
}],
"superTypes": ["DataSet"],
"subTypes": []
}],
"relationshipDefs": []
}
This is the request that errors:
{
"referredEntities": null,
"entities": [{
"typeName": "spark_table",
"attributes": {
"schema": [{
"typeName": "spark_column",
"attributes": {
"metadata": "{}",
"nullable": true,
"qualifiedName":
"local-1539114333944.default.atlas_bolke.col-i",
"name": "i",
"type": "integer"
},
"guid": "-79141485348090",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
}, {
"typeName": "spark_column",
"attributes": {
"metadata": "{}",
"nullable": true,
"qualifiedName":
"local-1539114333944.default.atlas_bolke.col-j",
"name": "j",
"type": "integer"
},
"guid": "-79141485348091",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
}],
"owner": "bolke",
"lastAccessTime": 0,
"unsupportedFeatures": [],
"qualifiedName":
"local-1539114333944.default.atlas_bolke",
"storage": {
"typeName": "spark_storagedesc",
"attributes": {
"serde":
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
"qualifiedName":
"local-1539114333944.default.atlas_bolke.storageFormat",
"compressed": false,
"locationUri": {
"typeName": "fs_path",
"attributes": {
"path":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse/atlas_bolke",
"qualifiedName":
"file:/Users/bolke/Downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse/atlas_bolke",
"name":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse/atlas_bolke"
},
"guid": "-79141485348089",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
},
"inputFormat":
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"outputFormat":
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"properties": {
"serialization.format": "1"
}
},
"guid": "-79141485348088",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
},
"tableType": "MANAGED",
"partitionColumnNames": [],
"database": {
"typeName": "spark_db",
"attributes": {
"owner": "bolke",
"qualifiedName":
"local-1539114333944.default",
"name": "default",
"description": "Default Hive database",
"locationUri": {
"typeName": "fs_path",
"attributes": {
"path":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse",
"qualifiedName":
"file:/Users/bolke/Downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse",
"name":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse"
},
"guid": "-79141485348087",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
},
"properties": {}
},
"guid": "-79141485348086",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
},
"provider": "parquet",
"createTime": 1539114452000,
"name": "atlas_bolke",
"properties": {
"transient_lastDdlTime": "1539114452"
}
},
"guid": "-79141485348092",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
}, {
"typeName": "spark_db",
"attributes": {
"owner": "bolke",
"qualifiedName": "local-1539114333944.default",
"name": "default",
"description": "Default Hive database",
"locationUri": {
"typeName": "fs_path",
"attributes": {
"path":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse",
"qualifiedName":
"file:/Users/bolke/Downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse",
"name":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse"
},
"guid": "-79141485348087",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
},
"properties": {}
},
"guid": "-79141485348086",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
}, {
"typeName": "fs_path",
"attributes": {
"path":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse",
"qualifiedName":
"file:/Users/bolke/Downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse",
"name":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse"
},
"guid": "-79141485348087",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
}, {
"typeName": "spark_storagedesc",
"attributes": {
"serde":
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
"qualifiedName":
"local-1539114333944.default.atlas_bolke.storageFormat",
"compressed": false,
"locationUri": {
"typeName": "fs_path",
"attributes": {
"path":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse/atlas_bolke",
"qualifiedName":
"file:/Users/bolke/Downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse/atlas_bolke",
"name":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse/atlas_bolke"
},
"guid": "-79141485348089",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
},
"inputFormat":
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"outputFormat":
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"properties": {
"serialization.format": "1"
}
},
"guid": "-79141485348088",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
}, {
"typeName": "fs_path",
"attributes": {
"path":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse/atlas_bolke",
"qualifiedName":
"file:/Users/bolke/Downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse/atlas_bolke",
"name":
"/users/bolke/downloads/spark-2.3.2-bin-hadoop2.7/spark-warehouse/atlas_bolke"
},
"guid": "-79141485348089",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
}, {
"typeName": "spark_column",
"attributes": {
"metadata": "{}",
"nullable": true,
"qualifiedName":
"local-1539114333944.default.atlas_bolke.col-i",
"name": "i",
"type": "integer"
},
"guid": "-79141485348090",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
}, {
"typeName": "spark_column",
"attributes": {
"metadata": "{}",
"nullable": true,
"qualifiedName":
"local-1539114333944.default.atlas_bolke.col-j",
"name": "j",
"type": "integer"
},
"guid": "-79141485348091",
"status": null,
"createdBy": null,
"updatedBy": null,
"createTime": null,
"updateTime": null,
"version": 0,
"relationshipAttributes": null,
"classifications": null,
"meanings": null
}]
}
Can someone shed some light on this?
Thanks
Bolke