pabloem commented on a change in pull request #10497: [BEAM-8335] Add the ReverseTestStream
URL: https://github.com/apache/beam/pull/10497#discussion_r385979045
 
 

 ##########
 File path: sdks/python/apache_beam/testing/test_stream.py
 ##########
 @@ -314,3 +355,239 @@ def from_runner_api_parameter(ptransform, payload, context):
         coder=coder,
         events=[Event.from_runner_api(e, coder) for e in payload.events],
         output_tags=output_tags)
+
+
+class TimingInfo(object):
+  """Holds a processing time and a watermark as Timestamps.
+
+  Both constructor arguments are converted with ``timestamp.Timestamp.of`` and
+  exposed through read-only properties.
+
+  Args:
+    processing_time: value converted to the stored processing-time Timestamp.
+    watermark: value converted to the stored watermark Timestamp.
+  """
+  def __init__(self, processing_time, watermark):
+    self._processing_time = timestamp.Timestamp.of(processing_time)
+    self._watermark = timestamp.Timestamp.of(watermark)
+
+  @property
+  def processing_time(self):
+    """The processing time given at construction, as a Timestamp."""
+    return self._processing_time
+
+  @property
+  def watermark(self):
+    """The watermark given at construction, as a Timestamp."""
+    return self._watermark
+
+  def __repr__(self):
+    # Render only the two fields this class stores. There is no
+    # `event_timestamp` attribute here, so referencing it raised
+    # AttributeError whenever an instance was printed or logged.
+    return '({}, {})'.format(self.processing_time, self.watermark)
+
+
+class PairWithTiming(PTransform):
+  """Attaches timing information to each element of the input.
+
+  Input: element; output: KV(element, timing information)
+  Where timing information := (processing time, watermark)
+
+  ReverseTestStream applies this transform so that watermark advancements can
+  be replayed later.
+
+  NOTE(review): expand() only declares an output PCollection derived from the
+  input; the pairing itself is presumably performed by the runner, keyed on
+  URN -- confirm.
+  """
+
+  URN = "beam:transform:pair_with_timing:v1"
+
+  def expand(self, pcoll):
+    output_pcoll = pvalue.PCollection.from_(pcoll)
+    return output_pcoll
+
+
+class ReverseTestStream(PTransform):
+  """A Transform that can create TestStream events from a stream of elements.
+
+  This currently assumes that the pipeline is being run on a single machine,
+  that elements arrive in order, and that they are output in the same order
+  in which they arrived.
+  """
+  class Format(Enum):
+    """Selects the shape of the records emitted by ReverseTestStream."""
+    # Default format; events are produced by _TestStreamEventGenerator.
+    TEST_STREAM_EVENTS = 1
+    # Records are produced by _TestStreamFileRecordGenerator.
+    TEST_STREAM_FILE_RECORDS = 2
+    # Same generator as TEST_STREAM_FILE_RECORDS, with each record then passed
+    # through SerializeToString().
+    SERIALIZED_TEST_STREAM_FILE_RECORDS = 3
+
+  def __init__(
+      self, sample_resolution_sec, output_tag, coder=None, output_format=None):
+    """Stores the configuration used later by expand().
+
+    Args:
+      sample_resolution_sec: forwarded to _WatermarkEventGenerator; the period
+        of the timers that sample processing time and watermark.
+      output_tag: forwarded to _WatermarkEventGenerator.
+      coder: handed to _TestStreamFileRecordGenerator when a file-record format
+        is selected. Falls back to beam.coders.FastPrimitivesCoder() when
+        falsy.
+      output_format: a ReverseTestStream.Format member. Falls back to
+        Format.TEST_STREAM_EVENTS when falsy.
+    """
+    self._sample_resolution_sec = sample_resolution_sec
+    self._output_tag = output_tag
+    self._output_format = output_format if output_format \
+                          else ReverseTestStream.Format.TEST_STREAM_EVENTS
+    self._coder = coder if coder else beam.coders.FastPrimitivesCoder()
+
+  def expand(self, pcoll):
+    """Builds the sub-pipeline that turns input elements into events.
+
+    Returns the resulting PCollection; its elements are additionally
+    serialized when the output format is SERIALIZED_TEST_STREAM_FILE_RECORDS.
+    """
+    # Both file-record formats share one generator; only the default format
+    # uses the event generator.
+    generator = (
+        _TestStreamFileRecordGenerator(coder=self._coder) if (
+            self._output_format in (
+                self.Format.TEST_STREAM_FILE_RECORDS,
+                self.Format.SERIALIZED_TEST_STREAM_FILE_RECORDS)) else
+        _TestStreamEventGenerator())
+
+    ret = (
+        pcoll
+        | beam.WindowInto(beam.window.GlobalWindows())
+
+        # First get the initial timing information. This will be used to start
+        # the periodic timers which will generate processing time and watermark
+        # advancements every `sample_resolution_sec`.
+        | 'initial timing' >> PairWithTiming()
+
+        # Next, map every element to the same key so that only a single timer
+        # is started for this given ReverseTestStream.
+        | beam.Map(lambda x: (0, x))
+
+        # Next, pass-through each element which will be paired with its timing
+        # info in the next step. Also, start the periodic timers. We use timers
+        # in this situation to capture watermark advancements that occur when
+        # there are no elements being produced upstream.
+        | beam.ParDo(
+            _WatermarkEventGenerator(
+                output_tag=self._output_tag,
+                sample_resolution_sec=self._sample_resolution_sec))
+
+        # Next, retrieve the timing information for watermark events that were
+        # generated in the previous step. This is because elements generated
+        # through the timers don't have their timing information yet.
+        | 'timing info for watermarks' >> PairWithTiming()
+
+        # Format the events properly.
+        | beam.ParDo(generator))
+
+    # Only the serialized format needs the extra step of converting each
+    # record with SerializeToString().
+    if self._output_format == self.Format.SERIALIZED_TEST_STREAM_FILE_RECORDS:
+
+      def serializer(e):
+        return e.SerializeToString()
+
+      ret = ret | 'serializer' >> beam.Map(serializer)
+
+    return ret
+
+
+class _WatermarkEventGenerator(beam.DoFn):
+  # Used to return the initial timing information.
+  EXECUTE_ONCE_STATE = beam.transforms.userstate.BagStateSpec(
+      name='execute_once_state', coder=beam.coders.FastPrimitivesCoder())
+  WATERMARK_TRACKER = TimerSpec('watermark_tracker', TimeDomain.REAL_TIME)
+
+  def __init__(self, output_tag, sample_resolution_sec=0.1):
+    self._output_tag = output_tag
+    self._sample_resolution_sec = sample_resolution_sec
+
+  @on_timer(WATERMARK_TRACKER)
+  def on_watermark_tracker(
+      self,
+      timestamp=beam.DoFn.TimestampParam,
+      window=beam.DoFn.WindowParam,
+      watermark_tracker=beam.DoFn.TimerParam(WATERMARK_TRACKER)):
+    next_sample_time = (timestamp.micros * 1e-6) + self._sample_resolution_sec
+    watermark_tracker.set(next_sample_time)
 
 Review comment:
   We may want to be careful with time skew on the processing-time timer
firings. If they skew consistently, the time this transform believes it is may
drift far from the actual time.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to