dpmills commented on a change in pull request #11919:
URL: https://github.com/apache/beam/pull/11919#discussion_r443711084



##########
File path: 
sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/PubsubLiteIO.java
##########
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.pubsublite;
+
+import com.google.cloud.pubsublite.Message;
+import com.google.cloud.pubsublite.SequencedMessage;
+import org.apache.beam.sdk.annotations.Experimental;
+import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PDone;
+import org.apache.beam.sdk.values.PInput;
+import org.apache.beam.sdk.values.POutput;
+
+@Experimental
+public final class PubsubLiteIO {
+  private PubsubLiteIO() {}
+
+  private static <InT extends PInput, OutT extends POutput> PTransform<InT, 
OutT> toTransform(
+      SerializableFunction<InT, OutT> fn, String name) {
+    return new PTransform<InT, OutT>(name) {
+      @Override
+      public OutT expand(InT input) {
+        return fn.apply(input);
+      }
+    };
+  }
+
+  /**
+   * Read messages from Pub/Sub Lite. These messages may contain duplicates if 
the publisher
+   * retried, which the PubsubLiteIO write method will do. Use the dedupe 
transform to remove these
+   * duplicates.
+   */
+  public static Read.Unbounded<SequencedMessage> read(SubscriberOptions 
options) {
+    return Read.from(new PubsubLiteUnboundedSource(options));
+  }
+
+  /**
+   * Remove duplicates from the PTransform from a read. Assumes by default 
that the uuids were added
+   * by a call to PubsubLiteIO.addUuids() when published.
+   */
+  public static PTransform<PCollection<SequencedMessage>, 
PCollection<SequencedMessage>>
+      deduplicate(UuidDeduplicationOptions options) {
+    return new UuidDeduplicationTransform(options);
+  }
+
+  /** Add Uuids to to-be-published messages that ensures that uniqueness is 
maintained. */
+  public static PTransform<PCollection<Message>, PCollection<Message>> 
addUuids() {
+    return new AddUuidsTransform();
+  }
+
+  /** Write messages to Pub/Sub Lite. */
+  public static PTransform<PCollection<Message>, PDone> write(PublisherOptions 
options) {
+    return toTransform(

Review comment:
       This is the only caller of toTransform now; I think it would be more 
readable to just inline that logic here

##########
File path: 
sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/AddUuidsTransform.java
##########
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.pubsublite;
+
+import com.google.cloud.pubsublite.Message;
+import com.google.common.collect.ImmutableListMultimap;
+import com.google.protobuf.ByteString;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.Reshuffle;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.TypeDescriptor;
+
+class AddUuidsTransform extends PTransform<PCollection<Message>, 
PCollection<Message>> {
+  private static Message addUuid(Message message) {
+    ImmutableListMultimap.Builder<String, ByteString> attributesBuilder =
+        ImmutableListMultimap.builder();
+    message.attributes().entries().stream()
+        .filter(entry -> !entry.getKey().equals(Uuid.DEFAULT_ATTRIBUTE))
+        .forEach(attributesBuilder::put);
+    attributesBuilder.put(Uuid.DEFAULT_ATTRIBUTE, Uuid.random().value());
+    return 
message.toBuilder().setAttributes(attributesBuilder.build()).build();
+  }
+
+  @Override
+  public PCollection<Message> expand(PCollection<Message> input) {
+    PCollection<Message> withUuids =
+        input
+            .apply(
+                "AddUuids",
+                MapElements.into(new TypeDescriptor<Message>() 
{}).via(AddUuidsTransform::addUuid))
+            .setCoder(new MessageCoder());
+    return withUuids.apply("ShuffleToPersist", Reshuffle.viaRandomKey());

Review comment:
       Adding a customizable numBuckets to Reshuffle.viaRandomKey sounds good 
to me.  Leave the default behavior as is to avoid breaking current users, 
though.

##########
File path: 
sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/PubsubLiteIO.java
##########
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.pubsublite;
+
+import com.google.cloud.pubsublite.Message;
+import com.google.cloud.pubsublite.SequencedMessage;
+import org.apache.beam.sdk.annotations.Experimental;
+import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PDone;
+import org.apache.beam.sdk.values.PInput;
+import org.apache.beam.sdk.values.POutput;
+
+@Experimental

Review comment:
       Please add javadoc explaining the current state of this code.  Current 
directory setup LGTM.

##########
File path: 
sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/UuidDeduplicationTransform.java
##########
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.pubsublite;
+
+import com.google.cloud.pubsublite.SequencedMessage;
+import java.math.BigInteger;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.ProcessFunction;
+import org.apache.beam.sdk.transforms.Values;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.TypeDescriptor;
+
+class UuidDeduplicationTransform
+    extends PTransform<PCollection<SequencedMessage>, 
PCollection<SequencedMessage>> {
+  private final UuidDeduplicationOptions options;
+
+  UuidDeduplicationTransform(UuidDeduplicationOptions options) {
+    this.options = options;
+  }
+
+  @Override
+  public PCollection<SequencedMessage> expand(PCollection<SequencedMessage> 
input) {
+    input.getPipeline().getCoderRegistry().registerCoderForClass(Uuid.class, 
Uuid.getCoder());

Review comment:
       Yeah, you can create a CoderProviderRegistrar.  There's an example from 
another IO solving the same problem here: 
https://github.com/apache/beam/blob/master/sdks/java/io/hbase/src/main/java/org/apache/beam/sdk/io/hbase/HBaseCoderProviderRegistrar.java
   In that provider, add default coders for SequencedMessage and Message
   
   Also, remove the registerCoderForClass calls here

##########
File path: 
sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/PubsubLiteIO.java
##########
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.pubsublite;
+
+import com.google.cloud.pubsublite.Message;
+import com.google.cloud.pubsublite.SequencedMessage;
+import org.apache.beam.sdk.annotations.Experimental;
+import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PDone;
+import org.apache.beam.sdk.values.PInput;
+import org.apache.beam.sdk.values.POutput;
+
+@Experimental
+public final class PubsubLiteIO {
+  private PubsubLiteIO() {}
+
+  private static <InT extends PInput, OutT extends POutput> PTransform<InT, 
OutT> toTransform(
+      SerializableFunction<InT, OutT> fn, String name) {
+    return new PTransform<InT, OutT>(name) {
+      @Override
+      public OutT expand(InT input) {
+        return fn.apply(input);
+      }
+    };
+  }
+
+  /**
+   * Read messages from Pub/Sub Lite. These messages may contain duplicates if 
the publisher
+   * retried, which the PubsubLiteIO write method will do. Use the dedupe 
transform to remove these
+   * duplicates.
+   */

Review comment:
       For each of the main entrypoint functions here, please add examples of 
calling them to the javadoc.

##########
File path: sdks/java/build-tools/src/main/resources/beam/checkstyle.xml
##########
@@ -101,7 +101,7 @@ page at http://checkstyle.sourceforge.net/config.html -->
     -->
     <module name="RegexpSinglelineJava">
       <property name="id" value="ForbidNonVendoredGuava"/>
-      <property name="format" 
value="(\scom\.google\.common\.(?!testing))|(\scom\.google\.thirdparty)"/>
+      <property name="format" 
value="(\scom\.google\.common\.(?!testing)(?!truth))|(\scom\.google\.thirdparty)"/>

Review comment:
       I think you can revert this change now

##########
File path: 
sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/UuidDeduplicationOptions.java
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.pubsublite;
+
+import static com.google.cloud.pubsublite.internal.Preconditions.checkArgument;
+
+import com.google.auto.value.AutoValue;
+import com.google.cloud.pubsublite.SequencedMessage;
+import com.google.protobuf.ByteString;
+import java.io.Serializable;
+import java.util.List;
+import org.apache.beam.sdk.state.TimeDomain;
+import org.apache.beam.sdk.transforms.Deduplicate;
+
+@AutoValue
+public abstract class UuidDeduplicationOptions implements Serializable {
+  private static final long serialVersionUID = 9837489720893L;
+
+  public static final SerializableStatusFunction<SequencedMessage, Uuid> 
DEFAULT_UUID_EXTRACTOR =
+      message -> {
+        checkArgument(
+            message.message().attributes().containsKey(Uuid.DEFAULT_ATTRIBUTE),
+            "Uuid attribute missing.");
+        List<ByteString> attributes =
+            message.message().attributes().get(Uuid.DEFAULT_ATTRIBUTE);
+        checkArgument(attributes.size() == 1, "Duplicate Uuid attribute values 
exist.");
+        return Uuid.of(attributes.get(0));
+      };
+
+  public static final int DEFAULT_HASH_PARTITIONS = 10000;
+
+  // All parameters are optional.
+  public abstract SerializableStatusFunction<SequencedMessage, Uuid> 
uuidExtractor();
+
+  public abstract Deduplicate.KeyedValues<Uuid, SequencedMessage> 
deduplicate();
+
+  // The number of partitions to hash values into.
+  public abstract int hashPartitions();
+
+  @SuppressWarnings("CheckReturnValue")
+  public static Builder newBuilder() {
+    Builder builder = new AutoValue_UuidDeduplicationOptions.Builder();
+    builder.setUuidExtractor(DEFAULT_UUID_EXTRACTOR);
+    builder.setDeduplicate(
+        Deduplicate.<Uuid, 
SequencedMessage>keyedValues().withTimeDomain(TimeDomain.EVENT_TIME));
+    builder.setHashPartitions(DEFAULT_HASH_PARTITIONS);
+    return builder;
+  }
+
+  @AutoValue.Builder
+  public abstract static class Builder {
+    public abstract Builder setUuidExtractor(
+        SerializableStatusFunction<SequencedMessage, Uuid> uuidExtractor);
+
+    public abstract Builder setDeduplicate(
+        Deduplicate.KeyedValues<Uuid, SequencedMessage> deduplicate);

Review comment:
       Yeah, it's the wrapping into a transform that I don't like.  I'm fine 
with leaving this as is if you add an example as I mentioned in a comment above




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to