[ https://issues.apache.org/jira/browse/AVRO-2779?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Mateusz Mrozewski updated AVRO-2779:
------------------------------------
    Description: 
I have a producer that sometimes adds new fields to its schema. The producer usually gets updated first and starts producing serialized records with the new fields (the data is sent via Kafka).

I have a consumer that should be able to read the data from Kafka even when it was produced with the newer schema; the new fields can be ignored until the consumer gets updated.
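
My expectation was the usual Avro schema resolution, where the reader knows both schemas and simply skips the extra writer fields. A minimal sketch of that path with the plain datum reader and writer, reusing oldSchema, newSchema and newRecord from the example further down:
{code:java}
import java.io.ByteArrayOutputStream;

import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;

// Write with the new (writer) schema.
GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<>(newSchema);
ByteArrayOutputStream out = new ByteArrayOutputStream();
BinaryEncoder enc = EncoderFactory.get().binaryEncoder(out, null);
writer.write(newRecord, enc);
enc.flush();

// Read with the old (reader) schema, resolving against the writer schema;
// field2 and nestedField2 are skipped.
GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(newSchema, oldSchema);
BinaryDecoder dec = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
GenericRecord resolvedRecord = reader.read(null, dec);
{code}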

I noticed that adding two fields, one at the top level and one in the nested record, yields unexpected results.

Old schema:
{code:java}
{
  "namespace" : "some.namespace",
  "name" : "MyRecord",
  "type" : "record",
  "fields" : [
    {"name": "field1", "type": "long"},
    {
      "name": "nested",
      "type": {
        "type" : "record",
        "name" : "nestedRecord",
        "fields" : [
          {"name": "nestedField1", "type": "long"}
        ]
      }
    }
  ]
}
{code}
New schema:
{code:java}
{
  "namespace" : "some.namespace",
  "name" : "MyRecord",
  "type" : "record",
  "fields" : [
    {"name": "field1", "type": "long"},
    {"name": "field2", "type": "long"},
    {
      "name": "nested",
      "type": {
        "type" : "record",
        "name" : "nestedRecord",
        "fields" : [
          {"name": "nestedField1", "type": "long"},
          {"name": "nestedField2", "type": "long"}
        ]
      }
    }
  ]
}
{code}
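As far as I can tell the two schemas are compatible in this direction (old reader, new writer); a quick check with SchemaCompatibility, assuming oldSchema and newSchema are parsed as in the example below:
{code:java}
import org.apache.avro.SchemaCompatibility;

// Can a reader using the old schema consume data written with the new one?
SchemaCompatibility.SchemaPairCompatibility compat =
    SchemaCompatibility.checkReaderWriterCompatibility(oldSchema, newSchema);
System.out.println(compat.getType()); // COMPATIBLE
{code}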
Example code:
{code:java}
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.ByteBuffer;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.avro.message.RawMessageDecoder;
import org.apache.avro.message.RawMessageEncoder;

// Parse both versions of the schema.
Schema.Parser parser = new Schema.Parser();
InputStream fin = new FileInputStream("src/main/resources/schemas/old.json");
Schema oldSchema = parser.parse(fin);

Schema.Parser parser2 = new Schema.Parser();
fin = new FileInputStream("src/main/resources/schemas/new.json");
Schema newSchema = parser2.parse(fin);

// Build a record against the new schema.
GenericData.Record nested = new GenericRecordBuilder(newSchema.getField("nested").schema())
        .set("nestedField1", 3L)
        .set("nestedField2", 4L)
        .build();
GenericData.Record newRecord = new GenericRecordBuilder(newSchema)
        .set("field1", 1L)
        .set("field2", 2L)
        .set("nested", nested)
        .build();

// Encode the record with the new schema.
GenericData gd1 = new GenericData();
RawMessageEncoder<GenericRecord> encoder = new RawMessageEncoder<>(gd1, newSchema);
ByteBuffer encoded = encoder.encode(newRecord);

// Decode it with only the old schema.
GenericData gd2 = new GenericData();
RawMessageDecoder<GenericRecord> decoder = new RawMessageDecoder<>(gd2, oldSchema);
GenericRecord record = decoder.decode(encoded);

System.out.println(record.get("field1")); // prints 1
System.out.println(record.get("field2")); // prints null
System.out.println(record.get("totally-fake-field")); // prints null
System.out.println(((GenericRecord) record.get("nested")).get("nestedField1")); // prints 2!
System.out.println(((GenericRecord) record.get("nested")).get("nestedField2")); // prints null
{code}
Is this expected behavior? Should this kind of schema evolution be supported?
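
If I understand the API correctly, the single-schema RawMessageDecoder constructor assumes the data was written with exactly that schema, so no schema resolution happens and the nested read goes off the rails. A minimal sketch of what I would expect to work, assuming the two-schema constructor takes (model, writeSchema, readSchema):
{code:java}
// Give the decoder both schemas so it can resolve data written
// with the new schema against the old reader schema.
RawMessageDecoder<GenericRecord> resolvingDecoder =
    new RawMessageDecoder<>(new GenericData(), newSchema, oldSchema);
GenericRecord resolved = resolvingDecoder.decode(encoded);
System.out.println(resolved.get("field1")); // 1
System.out.println(((GenericRecord) resolved.get("nested")).get("nestedField1")); // 3
{code}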



> Schema evolution and adding fields to nested records
> ----------------------------------------------------
>
>                 Key: AVRO-2779
>                 URL: https://issues.apache.org/jira/browse/AVRO-2779
>             Project: Apache Avro
>          Issue Type: Bug
>          Components: java
>    Affects Versions: 1.9.2
>            Reporter: Mateusz Mrozewski
>            Priority: Major
>


