lindong28 commented on a change in pull request #24: URL: https://github.com/apache/flink-ml/pull/24#discussion_r747369365
########## File path: flink-ml-lib/src/main/java/org/apache/flink/ml/algo/batch/knn/KnnTrainBatchOp.java ########## @@ -0,0 +1,230 @@ +package org.apache.flink.ml.algo.batch.knn; + +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.functions.RichMapPartitionFunction; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.ml.algo.batch.knn.distance.BaseFastDistance; +import org.apache.flink.ml.algo.batch.knn.distance.BaseFastDistanceData; +import org.apache.flink.ml.algo.batch.knn.distance.FastDistanceMatrixData; +import org.apache.flink.ml.algo.batch.knn.distance.FastDistanceSparseData; +import org.apache.flink.ml.algo.batch.knn.distance.FastDistanceVectorData; +import org.apache.flink.ml.common.BatchOperator; +import org.apache.flink.ml.common.MapPartitionFunctionWrapper; +import org.apache.flink.ml.common.linalg.DenseVector; +import org.apache.flink.ml.common.linalg.VectorUtil; +import org.apache.flink.ml.param.Param; +import org.apache.flink.ml.param.StringParam; +import org.apache.flink.ml.params.knn.HasKnnDistanceType; +import org.apache.flink.ml.params.knn.KnnTrainParams; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.internal.TableImpl; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.types.DataType; +import org.apache.flink.types.Row; +import org.apache.flink.util.Collector; +import org.apache.flink.util.Preconditions; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.flink.ml.algo.batch.knn.distance.BaseFastDistanceData.pGson; + +/** + * KNN is 
to classify unlabeled observations by assigning them to the class of the most similar + * labeled examples. Note that though there is no ``training process`` in KNN, we create a ``fake + * one`` to use in pipeline model. In this operator, we do some preparation to speed up the + * inference process. + */ +public final class KnnTrainBatchOp extends BatchOperator<KnnTrainBatchOp> Review comment: I see. I guess we probably have a different understanding regarding whether having a class with a `Batch` or `Stream` in the name could help improve developer experience. The pros/cons here seem to be very similar to the pros/cons we discussed today for https://github.com/apache/flink-ml/pull/29. Maybe we can try to reach agreement on https://github.com/apache/flink-ml/pull/29 first before revisiting this PR. ########## File path: flink-ml-lib/src/main/java/org/apache/flink/ml/algo/batch/knn/KnnTrainBatchOp.java ########## @@ -0,0 +1,230 @@ +package org.apache.flink.ml.algo.batch.knn; + +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.functions.RichMapPartitionFunction; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.ml.algo.batch.knn.distance.BaseFastDistance; +import org.apache.flink.ml.algo.batch.knn.distance.BaseFastDistanceData; +import org.apache.flink.ml.algo.batch.knn.distance.FastDistanceMatrixData; +import org.apache.flink.ml.algo.batch.knn.distance.FastDistanceSparseData; +import org.apache.flink.ml.algo.batch.knn.distance.FastDistanceVectorData; +import org.apache.flink.ml.common.BatchOperator; +import org.apache.flink.ml.common.MapPartitionFunctionWrapper; +import org.apache.flink.ml.common.linalg.DenseVector; +import org.apache.flink.ml.common.linalg.VectorUtil; +import org.apache.flink.ml.param.Param; +import 
org.apache.flink.ml.param.StringParam; +import org.apache.flink.ml.params.knn.HasKnnDistanceType; +import org.apache.flink.ml.params.knn.KnnTrainParams; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.internal.TableImpl; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.types.DataType; +import org.apache.flink.types.Row; +import org.apache.flink.util.Collector; +import org.apache.flink.util.Preconditions; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.flink.ml.algo.batch.knn.distance.BaseFastDistanceData.pGson; + +/** + * KNN is to classify unlabeled observations by assigning them to the class of the most similar + * labeled examples. Note that though there is no ``training process`` in KNN, we create a ``fake + * one`` to use in pipeline model. In this operator, we do some preparation to speed up the + * inference process. + */ +public final class KnnTrainBatchOp extends BatchOperator<KnnTrainBatchOp> Review comment: I see. I guess we probably have a different understanding regarding whether having a class with a `Batch` or `Stream` in the name could help improve developer experience. The pros/cons here seem to be very similar to the pros/cons we discussed today for https://github.com/apache/flink-ml/pull/29. Maybe we can try to reach agreement on https://github.com/apache/flink-ml/pull/29 first before revisiting this PR. Does this sound good? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org