[GitHub] [beam] iemejia commented on a change in pull request #10815: [BEAM-9279] Make HBase.ReadAll based on Reads instead of HBaseQuery

2020-03-19 Thread GitBox
iemejia commented on a change in pull request #10815: [BEAM-9279] Make 
HBase.ReadAll based on Reads instead of HBaseQuery
URL: https://github.com/apache/beam/pull/10815#discussion_r394877026
 
 

 ##
 File path: 
sdks/java/io/hbase/src/main/java/org/apache/beam/sdk/io/hbase/HBaseIO.java
 ##
 @@ -240,63 +245,109 @@ private Read(
 @Override
 public void populateDisplayData(DisplayData.Builder builder) {
   super.populateDisplayData(builder);
-  builder.add(DisplayData.item("configuration", 
serializableConfiguration.get().toString()));
+  builder.add(DisplayData.item("configuration", configuration.toString()));
   builder.add(DisplayData.item("tableId", tableId));
-  builder.addIfNotNull(DisplayData.item("scan", 
serializableScan.get().toString()));
+  builder.addIfNotNull(DisplayData.item("scan", scan.toString()));
 }
 
 public Configuration getConfiguration() {
-  return serializableConfiguration.get();
+  return configuration;
 }
 
 public String getTableId() {
   return tableId;
 }
 
 public Scan getScan() {
-  return serializableScan.get();
+  return scan;
 }
 
 /** Returns the range of keys that will be read from the table. */
 public ByteKeyRange getKeyRange() {
-  byte[] startRow = serializableScan.get().getStartRow();
-  byte[] stopRow = serializableScan.get().getStopRow();
+  byte[] startRow = scan.getStartRow();
+  byte[] stopRow = scan.getStopRow();
   return ByteKeyRange.of(ByteKey.copyFrom(startRow), 
ByteKey.copyFrom(stopRow));
 }
 
-private final SerializableConfiguration serializableConfiguration;
+@Override
+public boolean equals(Object o) {
+  if (this == o) {
+return true;
+  }
+  if (o == null || getClass() != o.getClass()) {
+return false;
+  }
+  Read read = (Read) o;
+  return configuration.toString().equals(read.configuration.toString())
+  && Objects.equals(tableId, read.tableId)
+  && scan.toString().equals(read.scan.toString());
+}
+
+@Override
+public int hashCode() {
+  return Objects.hash(configuration, tableId, scan);
+}
+
+private Object writeReplace() {
+  return new SerializationProxy(this);
+}
+
+private static class SerializationProxy implements Serializable {
+  public SerializationProxy() {}
+
+  public SerializationProxy(Read read) {
+configuration = read.configuration;
+tableId = read.tableId;
+scan = read.scan;
+  }
+
+  private void writeObject(ObjectOutputStream out) throws IOException {
+SerializableCoder.of(SerializableConfiguration.class)
+.encode(new SerializableConfiguration(this.configuration), out);
+StringUtf8Coder.of().encode(this.tableId, out);
+ProtobufUtil.toScan(this.scan).writeDelimitedTo(out);
+  }
+
+  private void readObject(ObjectInputStream in) throws IOException {
+this.configuration = 
SerializableCoder.of(SerializableConfiguration.class).decode(in).get();
+this.tableId = StringUtf8Coder.of().decode(in);
+this.scan = 
ProtobufUtil.toScan(ClientProtos.Scan.parseDelimitedFrom(in));
+  }
+
+  Object readResolve() {
+return 
HBaseIO.read().withConfiguration(configuration).withTableId(tableId).withScan(scan);
+  }
+
+  private Configuration configuration;
+  private String tableId;
+  private Scan scan;
+}
+
+@SuppressFBWarnings("SE_BAD_FIELD")
+private final Configuration configuration;
+
 private final String tableId;
-private final SerializableScan serializableScan;
+
+@SuppressFBWarnings("SE_BAD_FIELD")
+private final Scan scan;
   }
 
   /**
* A {@link PTransform} that works like {@link #read}, but executes read 
operations coming from a
-   * {@link PCollection} of {@link HBaseQuery}.
+   * {@link PCollection} of {@link Read}.
*/
   public static ReadAll readAll() {
-return new ReadAll(null);
+return new ReadAll();
   }
 
   /** Implementation of {@link #readAll}. */
-  public static class ReadAll extends PTransform, 
PCollection> {
-
-private ReadAll(SerializableConfiguration serializableConfiguration) {
-  this.serializableConfiguration = serializableConfiguration;
-}
-
-/** Reads from the HBase instance indicated by the* given configuration. */
-public ReadAll withConfiguration(Configuration configuration) {
-  checkArgument(configuration != null, "configuration can not be null");
-  return new ReadAll(new SerializableConfiguration(configuration));
-}
+  public static class ReadAll extends PTransform, 
PCollection> {
 
 Review comment:
   I'd better do this in the CHANGES.md release notes file, so that it gets 
announced with the release notes. Java will take care of making users aware at 
the code level :)


This is an automated message from the Apache Git Service.

[GitHub] [beam] iemejia commented on a change in pull request #10815: [BEAM-9279] Make HBase.ReadAll based on Reads instead of HBaseQuery

2020-03-19 Thread GitBox
iemejia commented on a change in pull request #10815: [BEAM-9279] Make 
HBase.ReadAll based on Reads instead of HBaseQuery
URL: https://github.com/apache/beam/pull/10815#discussion_r394860657
 
 

 ##
 File path: 
sdks/java/io/hbase/src/main/java/org/apache/beam/sdk/io/hbase/HBaseIO.java
 ##
 @@ -240,63 +245,109 @@ private Read(
 @Override
 public void populateDisplayData(DisplayData.Builder builder) {
   super.populateDisplayData(builder);
-  builder.add(DisplayData.item("configuration", 
serializableConfiguration.get().toString()));
+  builder.add(DisplayData.item("configuration", configuration.toString()));
   builder.add(DisplayData.item("tableId", tableId));
-  builder.addIfNotNull(DisplayData.item("scan", 
serializableScan.get().toString()));
+  builder.addIfNotNull(DisplayData.item("scan", scan.toString()));
 }
 
 public Configuration getConfiguration() {
-  return serializableConfiguration.get();
+  return configuration;
 }
 
 public String getTableId() {
   return tableId;
 }
 
 public Scan getScan() {
-  return serializableScan.get();
+  return scan;
 }
 
 /** Returns the range of keys that will be read from the table. */
 public ByteKeyRange getKeyRange() {
-  byte[] startRow = serializableScan.get().getStartRow();
-  byte[] stopRow = serializableScan.get().getStopRow();
+  byte[] startRow = scan.getStartRow();
+  byte[] stopRow = scan.getStopRow();
   return ByteKeyRange.of(ByteKey.copyFrom(startRow), 
ByteKey.copyFrom(stopRow));
 }
 
-private final SerializableConfiguration serializableConfiguration;
+@Override
+public boolean equals(Object o) {
+  if (this == o) {
+return true;
+  }
+  if (o == null || getClass() != o.getClass()) {
+return false;
+  }
+  Read read = (Read) o;
+  return configuration.toString().equals(read.configuration.toString())
+  && Objects.equals(tableId, read.tableId)
+  && scan.toString().equals(read.scan.toString());
+}
+
+@Override
+public int hashCode() {
+  return Objects.hash(configuration, tableId, scan);
+}
+
+private Object writeReplace() {
 
 Review comment:
   good idea, doing it.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] [beam] iemejia commented on a change in pull request #10815: [BEAM-9279] Make HBase.ReadAll based on Reads instead of HBaseQuery

2020-03-19 Thread GitBox
iemejia commented on a change in pull request #10815: [BEAM-9279] Make 
HBase.ReadAll based on Reads instead of HBaseQuery
URL: https://github.com/apache/beam/pull/10815#discussion_r394859324
 
 

 ##
 File path: 
sdks/java/io/hbase/src/main/java/org/apache/beam/sdk/io/hbase/HBaseReadSplittableDoFn.java
 ##
 @@ -32,65 +31,50 @@
 import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.ResultScanner;
-import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.client.Table;
 
 /** A SplittableDoFn to read from HBase. */
 @BoundedPerElement
-class HBaseReadSplittableDoFn extends DoFn {
-  private final SerializableConfiguration serializableConfiguration;
-
-  private transient Connection connection;
-
-  HBaseReadSplittableDoFn(SerializableConfiguration serializableConfiguration) 
{
-this.serializableConfiguration = serializableConfiguration;
-  }
-
-  @Setup
-  public void setup() throws Exception {
-connection = 
ConnectionFactory.createConnection(serializableConfiguration.get());
-  }
-
-  private static Scan newScanInRange(Scan scan, ByteKeyRange range) throws 
IOException {
-return new Scan(scan)
-.setStartRow(range.getStartKey().getBytes())
-.setStopRow(range.getEndKey().getBytes());
-  }
+class HBaseReadSplittableDoFn extends DoFn {
+  HBaseReadSplittableDoFn() {}
 
   @ProcessElement
-  public void processElement(ProcessContext c, 
RestrictionTracker tracker)
+  public void processElement(
+  @Element Read read,
+  OutputReceiver out,
+  RestrictionTracker tracker)
   throws Exception {
-final HBaseQuery query = c.element();
-TableName tableName = TableName.valueOf(query.getTableId());
+Connection connection = 
ConnectionFactory.createConnection(read.getConfiguration());
 
 Review comment:
   Better: I filed https://issues.apache.org/jira/browse/BEAM-9554 to track this. 
No, I did not test performance, because this is quite a particular case, as I 
mentioned. For users doing 1 to n queries where n is big, this time would not be 
considerable. The real issue would manifest mostly in streaming pipelines, 
where we would like to do reads per window with multiple windows, somewhat 
similar to what we found for JdbcIO writes (but this case is way more common).


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] [beam] iemejia commented on a change in pull request #10815: [BEAM-9279] Make HBase.ReadAll based on Reads instead of HBaseQuery

2020-03-19 Thread GitBox
iemejia commented on a change in pull request #10815: [BEAM-9279] Make 
HBase.ReadAll based on Reads instead of HBaseQuery
URL: https://github.com/apache/beam/pull/10815#discussion_r394846218
 
 

 ##
 File path: 
sdks/java/io/hbase/src/main/java/org/apache/beam/sdk/io/hbase/HBaseIO.java
 ##
 @@ -173,33 +182,33 @@ public static Read read() {
 /** Reads from the HBase instance indicated by the* given configuration. */
 public Read withConfiguration(Configuration configuration) {
   checkArgument(configuration != null, "configuration can not be null");
-  return new Read(new SerializableConfiguration(configuration), tableId, 
serializableScan);
+  return new Read(new Configuration(configuration), tableId, scan);
 }
 
 /** Reads from the specified table. */
 public Read withTableId(String tableId) {
   checkArgument(tableId != null, "tableIdcan not be null");
-  return new Read(serializableConfiguration, tableId, serializableScan);
+  return new Read(configuration, tableId, scan);
 }
 
 /** Filters the rows read from HBase using the given* scan. */
 public Read withScan(Scan scan) {
   checkArgument(scan != null, "scancan not be null");
 
 Review comment:
   good one, fixing it


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] [beam] iemejia commented on a change in pull request #10815: [BEAM-9279] Make HBase.ReadAll based on Reads instead of HBaseQuery

2020-03-19 Thread GitBox
iemejia commented on a change in pull request #10815: [BEAM-9279] Make 
HBase.ReadAll based on Reads instead of HBaseQuery
URL: https://github.com/apache/beam/pull/10815#discussion_r394846361
 
 

 ##
 File path: 
sdks/java/io/hbase/src/main/java/org/apache/beam/sdk/io/hbase/HBaseIO.java
 ##
 @@ -173,33 +182,33 @@ public static Read read() {
 /** Reads from the HBase instance indicated by the* given configuration. */
 public Read withConfiguration(Configuration configuration) {
   checkArgument(configuration != null, "configuration can not be null");
-  return new Read(new SerializableConfiguration(configuration), tableId, 
serializableScan);
+  return new Read(new Configuration(configuration), tableId, scan);
 }
 
 /** Reads from the specified table. */
 public Read withTableId(String tableId) {
   checkArgument(tableId != null, "tableIdcan not be null");
 
 Review comment:
   Fixing it. Actually, there were more occurrences of this 'can' mistake; I 
have fixed them all now.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services