[
https://issues.apache.org/jira/browse/GOBBLIN-1485?focusedWorklogId=616162&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-616162
]
ASF GitHub Bot logged work on GOBBLIN-1485:
-------------------------------------------
Author: ASF GitHub Bot
Created on: 29/Jun/21 13:19
Start Date: 29/Jun/21 13:19
Worklog Time Spent: 10m
Work Description: sv2000 commented on a change in pull request #3324:
URL: https://github.com/apache/gobblin/pull/3324#discussion_r660136783
##########
File path:
gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/HiveRegistrationUnitComparator.java
##########
@@ -142,12 +145,24 @@ public T compareIsStoredAsSubDirs() {
return (T) this;
}
+ private State extractSchemaVersion(State state) {
+ State newState = new State(state);
+ String schemaFromState =
state.getProp(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName());
+ if (schemaFromState != null && !schemaFromState.isEmpty()) {
+ String schemaVersion = AvroUtils.getSchemaCreationTime(new
Schema.Parser().parse(schemaFromState));
+ if (schemaVersion != null && !schemaVersion.isEmpty()) {
+
newState.removeProp(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName());
Review comment:
Why do we remove this property?
##########
File path:
gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/HiveRegistrationUnitComparator.java
##########
@@ -142,12 +145,24 @@ public T compareIsStoredAsSubDirs() {
return (T) this;
}
+ private State extractSchemaVersion(State state) {
+ State newState = new State(state);
+ String schemaFromState =
state.getProp(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName());
+ if (schemaFromState != null && !schemaFromState.isEmpty()) {
Review comment:
This can be rewritten using Strings.isNullOrEmpty().
##########
File path:
gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
##########
@@ -75,12 +78,17 @@
public static final String DEFAULT_SERDE_TYPE = "ORC";
public static final String INPUT_FORMAT_CLASS_KEY =
"hiveOrcSerdeManager.inputFormatClass";
public static final String DEFAULT_INPUT_FORMAT_CLASS =
OrcInputFormat.class.getName();
+ public static final String WRITER_LATEST_SCHEMA = "writer.latest.schema";
public static final String OUTPUT_FORMAT_CLASS_KEY =
"hiveOrcSerdeManager.outputFormatClass";
public static final String DEFAULT_OUTPUT_FORMAT_CLASS =
OrcOutputFormat.class.getName();
public static final String HIVE_SPEC_SCHEMA_READING_TIMER =
"hiveOrcSerdeManager.schemaReadTimer";
+ public static final String HIVE_SPEC_SCHEMA_FROM_WRITER =
"hiveOrcSerdeManager.getSchemaFromWriterSchema";
Review comment:
Do we need to introduce a config? Can we always look for writer schema
first and then default to file schema if it doesn't exist?
##########
File path:
gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/HiveRegistrationUnitComparator.java
##########
@@ -142,12 +145,24 @@ public T compareIsStoredAsSubDirs() {
return (T) this;
}
+ private State extractSchemaVersion(State state) {
+ State newState = new State(state);
+ String schemaFromState =
state.getProp(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName());
+ if (schemaFromState != null && !schemaFromState.isEmpty()) {
+ String schemaVersion = AvroUtils.getSchemaCreationTime(new
Schema.Parser().parse(schemaFromState));
+ if (schemaVersion != null && !schemaVersion.isEmpty()) {
Review comment:
Can this be rewritten using Strings.isNullOrEmpty()?
##########
File path:
gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
##########
@@ -264,7 +272,18 @@ private void addSchemaProperties(Path path,
HiveRegistrationUnit hiveUnit)
*
*/
protected void addSchemaPropertiesHelper(Path path, HiveRegistrationUnit
hiveUnit) throws IOException {
- TypeInfo schema = getSchemaFromLatestFile(path, this.fs);
+ TypeInfo schema;
+ if(props.getPropAsBoolean(HIVE_SPEC_SCHEMA_FROM_WRITER,
DEFAULT_HIVE_SPEC_SCHEMA_FROM_WRITER)) {
+ try {
+ Preconditions.checkArgument(props.contains(WRITER_LATEST_SCHEMA));
+ Schema avroSchema = new
Schema.Parser().parse(props.getProp(WRITER_LATEST_SCHEMA));
+ schema = TypeInfoUtils.getTypeInfoFromObjectInspector(new
AvroObjectInspectorGenerator(avroSchema).getObjectInspector());
Review comment:
The schema converter returns a TypeDescription object. Not sure if it is
easy to go from TypeDescription to TypeInfo.
##########
File path:
gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
##########
@@ -264,7 +272,18 @@ private void addSchemaProperties(Path path,
HiveRegistrationUnit hiveUnit)
*
*/
protected void addSchemaPropertiesHelper(Path path, HiveRegistrationUnit
hiveUnit) throws IOException {
- TypeInfo schema = getSchemaFromLatestFile(path, this.fs);
+ TypeInfo schema;
+ if(props.getPropAsBoolean(HIVE_SPEC_SCHEMA_FROM_WRITER,
DEFAULT_HIVE_SPEC_SCHEMA_FROM_WRITER)) {
+ try {
+ Preconditions.checkArgument(props.contains(WRITER_LATEST_SCHEMA));
+ Schema avroSchema = new
Schema.Parser().parse(props.getProp(WRITER_LATEST_SCHEMA));
+ schema = TypeInfoUtils.getTypeInfoFromObjectInspector(new
AvroObjectInspectorGenerator(avroSchema).getObjectInspector());
Review comment:
There is an Avro->ORC schema converter utility method in the
AvroOrcSchemaConverter class. Can we use that, given that it is being used by
GobblinOrcWriter to create the ORC file?
##########
File path:
gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
##########
@@ -75,12 +78,17 @@
public static final String DEFAULT_SERDE_TYPE = "ORC";
public static final String INPUT_FORMAT_CLASS_KEY =
"hiveOrcSerdeManager.inputFormatClass";
public static final String DEFAULT_INPUT_FORMAT_CLASS =
OrcInputFormat.class.getName();
+ public static final String WRITER_LATEST_SCHEMA = "writer.latest.schema";
public static final String OUTPUT_FORMAT_CLASS_KEY =
"hiveOrcSerdeManager.outputFormatClass";
public static final String DEFAULT_OUTPUT_FORMAT_CLASS =
OrcOutputFormat.class.getName();
public static final String HIVE_SPEC_SCHEMA_READING_TIMER =
"hiveOrcSerdeManager.schemaReadTimer";
+ public static final String HIVE_SPEC_SCHEMA_FROM_WRITER =
"hiveOrcSerdeManager.getSchemaFromWriterSchema";
Review comment:
I see. We should probably not overload the writer schema configuration
to mean different things in different scenarios. We can introduce a separate
configuration for the GMCE use case. Thoughts?
##########
File path:
gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/HiveRegistrationUnitComparator.java
##########
@@ -142,12 +146,24 @@ public T compareIsStoredAsSubDirs() {
return (T) this;
}
+ private State extractSchemaVersion(State state) {
+ State newState = new State(state);
+ String schemaFromState =
state.getProp(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName());
+ if (!Strings.isNullOrEmpty(schemaFromState)) {
+ String schemaVersion = AvroUtils.getSchemaCreationTime(new
Schema.Parser().parse(schemaFromState));
+ if (!Strings.isNullOrEmpty(schemaVersion)) {
+
newState.removeProp(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName());
+ newState.setProp("schema.creationTime", schemaVersion);
Review comment:
Can we define a static variable for this property?
##########
File path:
gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
##########
@@ -264,7 +272,18 @@ private void addSchemaProperties(Path path,
HiveRegistrationUnit hiveUnit)
*
*/
protected void addSchemaPropertiesHelper(Path path, HiveRegistrationUnit
hiveUnit) throws IOException {
- TypeInfo schema = getSchemaFromLatestFile(path, this.fs);
+ TypeInfo schema;
+ if(props.getPropAsBoolean(HIVE_SPEC_SCHEMA_FROM_WRITER,
DEFAULT_HIVE_SPEC_SCHEMA_FROM_WRITER)) {
+ try {
+ Preconditions.checkArgument(props.contains(WRITER_LATEST_SCHEMA));
+ Schema avroSchema = new
Schema.Parser().parse(props.getProp(WRITER_LATEST_SCHEMA));
+ schema = TypeInfoUtils.getTypeInfoFromObjectInspector(new
AvroObjectInspectorGenerator(avroSchema).getObjectInspector());
Review comment:
Makes sense. Thanks for the explanation!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Issue Time Tracking
-------------------
Worklog Id: (was: 616162)
Time Spent: 2h 50m (was: 2h 40m)
> Enable feature to get schema from writer schema when do hive registration
> -------------------------------------------------------------------------
>
> Key: GOBBLIN-1485
> URL: https://issues.apache.org/jira/browse/GOBBLIN-1485
> Project: Apache Gobblin
> Issue Type: New Feature
> Reporter: Zihan Li
> Priority: Major
> Time Spent: 2h 50m
> Remaining Estimate: 0h
>
> Enable a feature to get the schema from the writer schema when doing Hive
> registration, so that we can avoid list operations to get the latest schema.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)