[ https://issues.apache.org/jira/browse/BEAM-7013?focusedWorklogId=337110&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-337110 ]
ASF GitHub Bot logged work on BEAM-7013: ---------------------------------------- Author: ASF GitHub Bot Created on: 01/Nov/19 00:23 Start Date: 01/Nov/19 00:23 Worklog Time Spent: 10m Work Description: robinyqiu commented on pull request #9778: [BEAM-7013] Update BigQueryHllSketchCompatibilityIT to cover empty sketch cases URL: https://github.com/apache/beam/pull/9778#discussion_r341414587 ########## File path: sdks/java/extensions/zetasketch/src/test/java/org/apache/beam/sdk/extensions/zetasketch/BigQueryHllSketchCompatibilityIT.java ########## @@ -126,22 +145,49 @@ public static void deleteDataset() throws Exception { } /** - * Test that HLL++ sketch computed in BigQuery can be processed by Beam. Hll sketch is computed by - * {@code HLL_COUNT.INIT} in BigQuery and read into Beam; the test verifies that we can run {@link - * HllCount.MergePartial} and {@link HllCount.Extract} on the sketch in Beam to get the correct - * estimated count. + * Test that non-empty HLL++ sketch computed in BigQuery can be processed by Beam. + * + * <p>Hll sketch is computed by {@code HLL_COUNT.INIT} in BigQuery and read into Beam; the test + * verifies that we can run {@link HllCount.MergePartial} and {@link HllCount.Extract} on the + * sketch in Beam to get the correct estimated count. + */ + @Test + public void testReadNonEmptySketchFromBigQuery() { + readSketchFromBigQuery(DATA_TABLE_ID_NON_EMPTY, EXPECTED_COUNT_NON_EMPTY); + } + + /** + * Test that empty HLL++ sketch computed in BigQuery can be processed by Beam. + * + * <p>Hll sketch is computed by {@code HLL_COUNT.INIT} in BigQuery and read into Beam; the test + * verifies that we can run {@link HllCount.MergePartial} and {@link HllCount.Extract} on the + * sketch in Beam to get the correct estimated count. 
*/ @Test - public void testReadSketchFromBigQuery() { - String tableSpec = String.format("%s.%s", DATASET_ID, DATA_TABLE_ID); + public void testReadEmptySketchFromBigQuery() { + readSketchFromBigQuery(DATA_TABLE_ID_EMPTY, EXPECTED_COUNT_EMPTY); + } + + private void readSketchFromBigQuery(String tableId, Long expectedCount) { + String tableSpec = String.format("%s.%s", DATASET_ID, tableId); String query = String.format( "SELECT HLL_COUNT.INIT(%s) AS %s FROM %s", DATA_FIELD_NAME, QUERY_RESULT_FIELD_NAME, tableSpec); + SerializableFunction<SchemaAndRecord, byte[]> parseQueryResultToByteArray = - (SchemaAndRecord schemaAndRecord) -> - // BigQuery BYTES type corresponds to Java java.nio.ByteBuffer type - ((ByteBuffer) schemaAndRecord.getRecord().get(QUERY_RESULT_FIELD_NAME)).array(); + input -> { + // BigQuery BYTES type corresponds to Java java.nio.ByteBuffer type + ByteBuffer sketch = (ByteBuffer) input.getRecord().get(QUERY_RESULT_FIELD_NAME); + if (sketch == null) { + // Empty sketch is represented by null in BigQuery and by empty byte array in Beam + return new byte[0]; + } else { + byte[] result = new byte[sketch.remaining()]; Review comment: Exactly. We know that is the case by looking into `Avro`'s implementation, but the compiler does not know that, and it gives a warning if we use `.array()`. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 337110) Time Spent: 35h 50m (was: 35h 40m) > A new count distinct transform based on BigQuery compatible HyperLogLog++ > implementation > ---------------------------------------------------------------------------------------- > > Key: BEAM-7013 > URL: https://issues.apache.org/jira/browse/BEAM-7013 > Project: Beam > Issue Type: New Feature > Components: extensions-java-sketching, sdk-java-core > Reporter: Yueyang Qiu > Assignee: Yueyang Qiu > Priority: Major > Fix For: 2.16.0 > > Time Spent: 35h 50m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)