[jira] [Commented] (AVRO-1777) Select best matching record when writing a union in python

ASF GitHub Bot (JIRA) Tue, 11 Dec 2018 10:29:53 -0800


    [ 
https://issues.apache.org/jira/browse/AVRO-1777?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16717713#comment-16717713
 ]


ASF GitHub Bot commented on AVRO-1777:
--------------------------------------

dkulp closed pull request #95: AVRO-1777: Select best matching record when 
writing a union in python
URL: https://github.com/apache/avro/pull/95
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/lang/py/src/avro/io.py b/lang/py/src/avro/io.py
index b2fd2f9ba..63907e17c 100644
--- a/lang/py/src/avro/io.py
+++ b/lang/py/src/avro/io.py
@@ -94,6 +94,10 @@ def __init__(self, fail_msg, writers_schema=None, 
readers_schema=None):
     if readers_schema: fail_msg += "\nReader's Schema: %s" % pretty_readers
     schema.AvroException.__init__(self, fail_msg)
 
+class RecordInitializationException(schema.AvroException):
+    def __init__(self, fail_msg):
+        schema.AvroException.__init__(self, fail_msg)
+
 #
 # Validate
 #
@@ -110,14 +114,17 @@ def validate(expected_schema, datum):
   elif schema_type == 'bytes':
     return isinstance(datum, str)
   elif schema_type == 'int':
-    return ((isinstance(datum, int) or isinstance(datum, long)) 
-            and INT_MIN_VALUE <= datum <= INT_MAX_VALUE)
+    return (((isinstance(datum, int) and not isinstance(datum, bool)) or
+            isinstance(datum, long)) and
+            INT_MIN_VALUE <= datum <= INT_MAX_VALUE)
   elif schema_type == 'long':
-    return ((isinstance(datum, int) or isinstance(datum, long)) 
-            and LONG_MIN_VALUE <= datum <= LONG_MAX_VALUE)
+    return (((isinstance(datum, int) and not isinstance(datum, bool)) or
+            isinstance(datum, long)) and
+            LONG_MIN_VALUE <= datum <= LONG_MAX_VALUE)
   elif schema_type in ['float', 'double']:
-    return (isinstance(datum, int) or isinstance(datum, long)
-            or isinstance(datum, float))
+    return (isinstance(datum, long) or
+            (isinstance(datum, int) and not isinstance(datum, bool)) or
+            isinstance(datum, float))
   elif schema_type == 'fixed':
     return isinstance(datum, str) and len(datum) == expected_schema.size
   elif schema_type == 'enum':
@@ -132,6 +139,8 @@ def validate(expected_schema, datum):
         [validate(expected_schema.values, v) for v in datum.values()])
   elif schema_type in ['union', 'error_union']:
     return True in [validate(s, datum) for s in expected_schema.schemas]
+  elif schema_type == 'record' and isinstance(datum, GenericRecord):
+      return expected_schema == datum.schema
   elif schema_type in ['record', 'error', 'request']:
     return (isinstance(datum, dict) and
       False not in
@@ -683,7 +692,7 @@ def read_record(self, writers_schema, readers_schema, 
decoder):
     """
     # schema resolution
     readers_fields_dict = readers_schema.fields_dict
-    read_record = {}
+    read_record = GenericRecord(readers_schema)
     for field in writers_schema.fields:
       readers_field = readers_fields_dict.get(field.name)
       if readers_field is not None:
@@ -888,3 +897,23 @@ def write_record(self, writers_schema, datum, encoder):
     """
     for field in writers_schema.fields:
       self.write_data(field.type, datum.get(field.name), encoder)
+
+class GenericRecord(dict):
+
+    def __init__(self, record_schema, lst = []):
+        if (record_schema is None or
+                not isinstance(record_schema, schema.Schema)):
+            raise RecordInitializationException(
+                    "Cannot initialize a record with schema: {sc}".format(sc = 
record_schema))
+        dict.__init__(self, lst)
+        self.schema = record_schema
+
+    def __eq__(self, other):
+        if other is None or not isinstance(other, dict):
+            return False
+        if not dict.__eq__(self, other):
+            return False
+        if isinstance(other, GenericRecord):
+            return self.schema == other.schema
+        else:
+            return True
diff --git a/lang/py/test/test_io.py b/lang/py/test/test_io.py
index 1e79d3e89..d6e341a47 100644
--- a/lang/py/test/test_io.py
+++ b/lang/py/test/test_io.py
@@ -39,6 +39,8 @@
   ('{"type": "array", "items": "long"}', [1, 3, 2]),
   ('{"type": "map", "values": "long"}', {'a': 1, 'b': 3, 'c': 2}),
   ('["string", "null", "long"]', None),
+  ('["double", "boolean"]', True),
+  ('["boolean", "double"]', True),
   ("""\
    {"type": "record",
     "name": "Test",
@@ -190,6 +192,13 @@ def test_validate(self):
   def test_round_trip(self):
     print_test_name('TEST ROUND TRIP')
     correct = 0
+    def are_equal(datum, round_trip_datum):
+        if datum != round_trip_datum:
+            return False
+        if type(datum) == bool:
+            return type(round_trip_datum) == bool
+        else:
+            return True
     for example_schema, datum in SCHEMAS_TO_VALIDATE:
       print 'Schema: %s' % example_schema
       print 'Datum: %s' % datum
@@ -199,7 +208,7 @@ def test_round_trip(self):
       round_trip_datum = read_datum(writer, writers_schema)
 
       print 'Round Trip Datum: %s' % round_trip_datum
-      if datum == round_trip_datum: correct += 1
+      if are_equal(datum, round_trip_datum): correct += 1
     self.assertEquals(correct, len(SCHEMAS_TO_VALIDATE))
 
   #


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Select best matching record when writing a union in python
> ----------------------------------------------------------
>
>                 Key: AVRO-1777
>                 URL: https://issues.apache.org/jira/browse/AVRO-1777
>             Project: Apache Avro
>          Issue Type: Improvement
>          Components: python
>    Affects Versions: 1.7.7
>            Reporter: Steven Aerts
>            Priority: Major
>             Fix For: 1.9.0
>
>
> Unlike JavaScript, Python does not use wrapped types.
> So when writing a union it needs to guess which type it will output.
> At the moment it takes the last validating type.
> I propose to take the type with the most matching fields.
> So I propose to change in {{io.py}}:
> {code}
> # resolve union
> index_of_schema = -1
> for i, candidate_schema in enumerate(writers_schema.schemas):
>   if validate(candidate_schema, datum):
>     index_of_schema = i
> if index_of_schema < 0: raise AvroTypeException(writers_schema, datum)
> {code}
> into
> {code}
> # resolve union
> index_of_schema = -1
> found_fields = -1
> for i, candidate_schema in enumerate(writers_schema.schemas):
>   if validate(candidate_schema, datum):
>     nr_fields = candidate_schema.type in ['record', 'error', 'request'] and 
> len(candidate_schema.fields) or 1
>     if nr_fields > found_fields:
>       index_of_schema = i
>       found_fields = nr_fields
> if index_of_schema < 0: raise AvroTypeException(writers_schema, datum)
> {code}
> If you want, I can create a pull request for this and apply it to both py3 
> and py.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (AVRO-1777) Select best matching record when writing a union in python

Reply via email to