[
https://issues.apache.org/jira/browse/FLINK-4391?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15683297#comment-15683297
]
ASF GitHub Bot commented on FLINK-4391:
---------------------------------------
Github user tillrohrmann commented on a diff in the pull request:
https://github.com/apache/flink/pull/2629#discussion_r88712331
--- Diff:
flink-streaming-java/src/main/java/org/apache/flink/streaming/api/operators/async/AsyncCollectorBuffer.java
---
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.streaming.api.operators.async;
+
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.streaming.api.datastream.AsyncDataStream;
+import org.apache.flink.streaming.api.operators.Output;
+import org.apache.flink.streaming.api.operators.TimestampedCollector;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.apache.flink.streaming.runtime.streamrecord.LatencyMarker;
+import org.apache.flink.streaming.runtime.streamrecord.StreamElement;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.util.Preconditions;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * AsyncCollectorBuffer will hold all {@link AsyncCollector} in its
internal buffer,
+ * and emit results from {@link AsyncCollector} to the next operators
following it by
+ * calling {@link Output#collect(Object)}
+ */
+@Internal
+public class AsyncCollectorBuffer<IN, OUT> {
+
+ /**
+ * Max number of {@link AsyncCollector} in the buffer.
+ */
+ private final int bufferSize;
+
+ private final AsyncDataStream.OutputMode mode;
+
+ private final AsyncWaitOperator<IN, OUT> operator;
+
+ /**
+ * Keep all {@code AsyncCollector} and their input {@link StreamElement}
+ */
+ private final Map<AsyncCollector<IN, OUT>, StreamElement> queue = new
LinkedHashMap<>();
+ /**
+ * For the AsyncWaitOperator chained with StreamSource, the checkpoint
thread may get the
+ * {@link org.apache.flink.streaming.runtime.tasks.StreamTask#lock}
while {@link AsyncCollectorBuffer#queue}
+ * is full since main thread waits on this lock. The StreamElement in
+ * {@link AsyncWaitOperator#processElement(StreamRecord)} should be
treated as a part of all StreamElements
+ * in its queue. It will be kept in the operator state while
snapshotting.
+ */
+ private StreamElement extraStreamElement;
+
+ /**
+ * {@link TimestampedCollector} and {@link Output} to collect results
and watermarks.
+ */
+ private final Output<StreamRecord<OUT>> output;
+ private final TimestampedCollector<OUT> timestampedCollector;
+
+ /**
+ * Checkpoint lock from {@link
org.apache.flink.streaming.runtime.tasks.StreamTask#lock}
+ */
+ private final Object lock;
+
+ private final Emitter emitter;
+ private final Thread emitThread;
+
+ private IOException error;
+
+ public AsyncCollectorBuffer(
+ int bufferSize,
+ AsyncDataStream.OutputMode mode,
+ Output<StreamRecord<OUT>> output,
+ TimestampedCollector<OUT> collector,
+ Object lock,
+ AsyncWaitOperator operator) {
+ Preconditions.checkArgument(bufferSize > 0, "Future buffer size
should be greater than 0.");
+ Preconditions.checkNotNull(output, "Output should not be
NULL.");
+ Preconditions.checkNotNull(collector, "TimestampedCollector
should not be NULL.");
+ Preconditions.checkNotNull(lock, "Checkpoint lock should not be
NULL.");
+ Preconditions.checkNotNull(operator, "Reference to
AsyncWaitOperator should not be NULL.");
+
+ this.bufferSize = bufferSize;
+ this.mode = mode;
+ this.output = output;
+ this.timestampedCollector = collector;
+ this.operator = operator;
+ this.lock = lock;
+
+ this.emitter = new Emitter();
+ this.emitThread = new Thread(emitter);
+ }
+
+ /**
+ * Add a {@link StreamRecord} into the buffer. A new {@link
AsyncCollector} will be created and returned
+ * corresponding to the input StreamRecord.
+ * <p>
+ * If buffer is full, caller will wait until a new space is available.
+ *
+ * @param record StreamRecord
+ * @return An AsyncCollector
+ * @throws Exception InterruptedException or IOException from
AsyncCollector.
+ */
+ public AsyncCollector<IN, OUT> addStreamRecord(StreamRecord<IN> record)
throws InterruptedException, IOException {
+ while (queue.size() >= bufferSize) {
+ // hold the input StreamRecord until it is placed in
the buffer
+ extraStreamElement = record;
+
+ lock.wait();
+ }
+
+ if (error != null) {
+ throw error;
+ }
+
+ AsyncCollector<IN, OUT> collector = new AsyncCollector(this);
+
+ queue.put(collector, record);
+
+ extraStreamElement = null;
+
+ return collector;
+ }
+
+ /**
+ * Add a {@link Watermark} into queue. A new AsyncCollector will be
created and returned.
+ * <p>
+ * If queue is full, caller will wait here.
+ *
+ * @param watermark Watermark
+ * @return AsyncCollector
+ * @throws Exception InterruptedException or IOException from
AsyncCollector.
+ */
+ public AsyncCollector<IN, OUT> addWatermark(Watermark watermark) throws
InterruptedException, IOException {
+ return processMark(watermark);
+ }
+
+ /**
+ * Add a {@link LatencyMarker} into queue. A new AsyncCollector will be
created and returned.
+ * <p>
+ * If queue is full, caller will wait here.
+ *
+ * @param latencyMarker LatencyMarker
+ * @return AsyncCollector
+ * @throws Exception InterruptedException or IOException from
AsyncCollector.
+ */
+ public AsyncCollector<IN, OUT> addLatencyMarker(LatencyMarker
latencyMarker) throws InterruptedException, IOException {
+ return processMark(latencyMarker);
+ }
+
+ private AsyncCollector<IN, OUT> processMark(StreamElement mark) throws
InterruptedException, IOException {
+ while (queue.size() >= bufferSize) {
+ // hold the input StreamElement until it is placed in
the buffer
+ extraStreamElement = mark;
+
+ lock.wait();
+ }
+
+ if (error != null) {
+ throw error;
+ }
+
+ AsyncCollector<IN, OUT> collector = new AsyncCollector(this,
true);
+
+ queue.put(collector, mark);
+
+ extraStreamElement = null;
+
+ return collector;
+ }
+
+ /**
+ * Notify the emitter thread and main thread that an AsyncCollector has
completed.
+ *
+ * @param collector Completed AsyncCollector
+ */
+ void markCollectorCompleted(AsyncCollector<IN, OUT> collector) {
+ synchronized (lock) {
+ collector.markDone();
+
+ // notify main thread to keep working
+ lock.notifyAll();
+ }
+ }
+
+ /**
+ * Caller will wait here if buffer is not empty, meaning that not all
async i/o tasks have returned yet.
+ *
+ * @throws Exception InterruptedException or IOException from
AsyncCollector.
+ */
+ void waitEmpty() throws InterruptedException, IOException {
+ while (queue.size() != 0) {
+ if (error != null) {
+ throw error;
+ }
+
+ lock.wait();
+ }
+ }
+
+ public void startEmitterThread() {
+ this.emitThread.start();
+ }
+
+ public void stopEmitterThread() {
+ emitter.stop();
+
+ emitThread.interrupt();
+ }
+
+ /**
+ * Get all StreamElements in the AsyncCollector queue.
+ * <p>
+ * Emitter Thread cannot output records and will wait for a while due
to the checkpointing procedure
+ * holding the checkpoint lock.
+ *
+ * @return A List containing StreamElements.
+ */
+ public List<StreamElement> getStreamElementsInBuffer() {
+ List<StreamElement> ret = new ArrayList<>(queue.size());
+ for (Map.Entry<AsyncCollector<IN, OUT>, StreamElement> entry :
queue.entrySet()) {
+ ret.add(entry.getValue());
+ }
+
+ // add the lonely input outside of queue into return set.
+ if (extraStreamElement != null) {
+ ret.add(extraStreamElement);
+ }
+
+ return ret;
+ }
+
+ @VisibleForTesting
+ public Map<AsyncCollector<IN, OUT>, StreamElement> getQueue() {
+ return this.queue;
+ }
+
+ /**
+ * A working thread to output results from {@link AsyncCollector} to
the next operator.
+ */
+ private class Emitter implements Runnable {
+ private volatile boolean running = true;
+
+ private void output(AsyncCollector collector, StreamElement
element) throws Exception {
+ List<OUT> result = collector.getResult();
+
+ // update timestamp for output stream records based on
the input stream record.
+ if (element == null) {
+ throw new Exception("No input stream element
for current AsyncCollector");
+ }
+
+ if (element.isRecord()) {
+ if (result == null) {
+ throw new Exception("Result for stream
record "+element+" is null");
+ }
+
+
timestampedCollector.setTimestamp(element.asRecord());
+ for (OUT val : result) {
+ timestampedCollector.collect(val);
+ }
+ }
+ else if (element.isWatermark()) {
+ output.emitWatermark(element.asWatermark());
+ }
+ else if (element.isLatencyMarker()) {
+
operator.sendLatencyMarker(element.asLatencyMarker());
+ }
+ else {
+ throw new IOException("Unknown input record:
"+element);
+ }
+ }
+
+ /**
+ * Emit results from the finished head collector and its
following finished ones.
+ */
+ private void orderedProcess() throws IOException {
+ Iterator<Map.Entry<AsyncCollector<IN, OUT>,
StreamElement>> iterator = queue.entrySet().iterator();
+
+ while (iterator.hasNext()) {
+ try {
+ Map.Entry<AsyncCollector<IN, OUT>,
StreamElement> entry = iterator.next();
+
+ AsyncCollector<IN, OUT> collector =
entry.getKey();
+
+ if (!collector.isDone()) {
+ break;
+ }
+
+ output(collector, entry.getValue());
+
+ iterator.remove();
+
+ lock.notifyAll();
+ }
+ catch (Exception e) {
+ throw new IOException(e);
--- End diff --
Error message would be quite helpful.
> Provide support for asynchronous operations over streams
> --------------------------------------------------------
>
> Key: FLINK-4391
> URL: https://issues.apache.org/jira/browse/FLINK-4391
> Project: Flink
> Issue Type: New Feature
> Components: DataStream API
> Reporter: Jamie Grier
> Assignee: david.wang
>
> Many Flink users need to do asynchronous processing driven by data from a
> DataStream. The classic example would be joining against an external
> database in order to enrich a stream with extra information.
> It would be nice to add general support for this type of operation in the
> Flink API. Ideally this could simply take the form of a new operator that
> manages async operations, keeps so many of them in flight, and then emits
> results to downstream operators as the async operations complete.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)