[ 
https://issues.apache.org/jira/browse/BEAM-7860?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Niels Stender updated BEAM-7860:
--------------------------------
    Description: 
In the presence of mixed type keys, v1new ReadFromDatastore may return 
duplicate items. The attached example returns 4 records, not the expected 3.

 
{code:python}
# code placeholder
from __future__ import unicode_literals
import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.types import Key, Entity, Query
from apache_beam.io.gcp.datastore.v1new import datastoreio


config = dict(project='your-google-project', namespace='test')


def test_mixed():
    keys = [
        Key(['mixed', '10038260-iperm_eservice'], **config),
        Key(['mixed', 4812224868188160], **config),
        Key(['mixed', '99152975-pointshop'], **config)
    ]

    entities = map(lambda key: Entity(key=key), keys)

    with beam.Pipeline() as p:
        (p
            | beam.Create(entities)
            | datastoreio.WriteToDatastore(project=config['project'])
        )

    query = Query(kind=u'mixed', **config)

    with beam.Pipeline() as p:
        (p
            | datastoreio.ReadFromDatastore(query=query, num_splits=4)
            | beam.io.WriteToText('tmp.txt', num_shards=1, 
shard_name_template='')
    )

    items = open('tmp.txt').read().strip().split('\n')
    assert len(items) == 3, 'incorrect number of items'


{code}

  was:
In the presence of mixed type keys, v1new ReadFromDatastore may return 
duplicate items. The attached example returns 4 records, not the expected 3.

 
{code:python}
# code placeholder
from __future__ import unicode_literals
import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.types import Key, Entity, Query
from apache_beam.io.gcp.datastore.v1new import datastoreio


config = dict(project='your-google-project', namespace='test')


def test_mixed():
    keys = [
        Key(['mixed', '10038260-iperm_eservice'], **config),
        Key(['mixed', 4812224868188160L], **config),
        Key(['mixed', '99152975-pointshop'], **config)
    ]

    entities = map(lambda key: Entity(key=key), keys)

    with beam.Pipeline() as p:
        (p
            | beam.Create(entities)
            | datastoreio.WriteToDatastore(project=config['project'])
        )

    query = Query(kind=u'mixed', **config)

    with beam.Pipeline() as p:
        (p
            | datastoreio.ReadFromDatastore(query=query, num_splits=4)
            | beam.io.WriteToText('tmp.txt', num_shards=1, 
shard_name_template='')
    )

    items = open('tmp.txt').read().strip().split('\n')
    assert len(items) == 3, 'incorrect number of items'


{code}


> v1new ReadFromDatastore splits incorrectly in the presence of mixed type keys
> -----------------------------------------------------------------------------
>
>                 Key: BEAM-7860
>                 URL: https://issues.apache.org/jira/browse/BEAM-7860
>             Project: Beam
>          Issue Type: Bug
>          Components: io-python-gcp
>    Affects Versions: 2.13.0
>         Environment: Python 2.7
>            Reporter: Niels Stender
>            Priority: Major
>
> In the presence of mixed type keys, v1new ReadFromDatastore may return 
> duplicate items. The attached example returns 4 records, not the expected 3.
>  
> {code:python}
> # code placeholder
> from __future__ import unicode_literals
> import apache_beam as beam
> from apache_beam.io.gcp.datastore.v1new.types import Key, Entity, Query
> from apache_beam.io.gcp.datastore.v1new import datastoreio
> config = dict(project='your-google-project', namespace='test')
> def test_mixed():
>     keys = [
>         Key(['mixed', '10038260-iperm_eservice'], **config),
>         Key(['mixed', 4812224868188160], **config),
>         Key(['mixed', '99152975-pointshop'], **config)
>     ]
>     entities = map(lambda key: Entity(key=key), keys)
>     with beam.Pipeline() as p:
>         (p
>             | beam.Create(entities)
>             | datastoreio.WriteToDatastore(project=config['project'])
>         )
>     query = Query(kind=u'mixed', **config)
>     with beam.Pipeline() as p:
>         (p
>             | datastoreio.ReadFromDatastore(query=query, num_splits=4)
>             | beam.io.WriteToText('tmp.txt', num_shards=1, 
> shard_name_template='')
>     )
>     items = open('tmp.txt').read().strip().split('\n')
>     assert len(items) == 3, 'incorrect number of items'
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.14#76016)

Reply via email to