This is an automated email from the ASF dual-hosted git repository. baunsgaard pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 0e79c9b9c3e3af2525c44e4f7417f7703d6a7c72 Author: Sebastian Baunsgaard <[email protected]> AuthorDate: Wed Sep 24 15:01:06 2025 +0200 [MINOR] Correct dedub builtin docs --- scripts/builtin/dedup.dml | 10 ++-- .../python/systemds/operator/algorithm/__init__.py | 2 + .../systemds/operator/algorithm/builtin/dedup.py | 68 ++++++++++++++++++++++ .../systemds/operator/algorithm/builtin/glove.py | 1 + 4 files changed, 76 insertions(+), 5 deletions(-) diff --git a/scripts/builtin/dedup.dml b/scripts/builtin/dedup.dml index 1ec2e29c39..af2ecafcdc 100644 --- a/scripts/builtin/dedup.dml +++ b/scripts/builtin/dedup.dml @@ -28,11 +28,11 @@ # # INPUT: # -------------------------------------------------------------------------------------- -# X Input Frame[String] with n rows and d columns (raw tuples) -# gloveMatrix Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimesnion -# vocab Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix) -# similarityMeasure (optional) String specifying similarity metric: "cosine", "euclidean" -# threshold (optional) Double: threshold value above which tuples are considered duplicates +# X Input Frame[String] with n rows and d columns (raw tuples) +# gloveMatrix Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimesnion +# vocab Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix) +# similarityMeasure (optional) String specifying similarity metric: "cosine", "euclidean" +# threshold (optional) Double: threshold value above which tuples are considered duplicates # -------------------------------------------------------------------------------------- # # OUTPUT: diff --git a/src/main/python/systemds/operator/algorithm/__init__.py b/src/main/python/systemds/operator/algorithm/__init__.py index c198a7138e..e8cb4c04e9 100644 --- a/src/main/python/systemds/operator/algorithm/__init__.py +++ b/src/main/python/systemds/operator/algorithm/__init__.py @@ -53,6 +53,7 @@ from .builtin.dbscan import dbscan from .builtin.dbscanApply import dbscanApply from .builtin.decisionTree import decisionTree from .builtin.decisionTreePredict import decisionTreePredict +from .builtin.dedup import dedup from .builtin.deepWalk import deepWalk from .builtin.denialConstraints import denialConstraints from .builtin.differenceStatistics import differenceStatistics @@ -251,6 +252,7 @@ __all__ = ['WoE', 'dbscanApply', 'decisionTree', 'decisionTreePredict', + 'dedup', 'deepWalk', 'denialConstraints', 'differenceStatistics', diff --git a/src/main/python/systemds/operator/algorithm/builtin/dedup.py b/src/main/python/systemds/operator/algorithm/builtin/dedup.py new file mode 100644 index 0000000000..13d5c35a41 --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/dedup.py @@ -0,0 +1,68 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/dedup.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def dedup(X: Frame, + gloveMatrix: Matrix, + vocab: Frame, + **kwargs: Dict[str, VALID_INPUT_TYPES]): + """ + Builtin for deduplication using distributed representations (DRs) and + locality-sensitive hashing (LSH) based blocking. + + The function encodes each input tuple as a dense vector using pre-trained GloVe embeddings (simple averaging), + groups semantically similar tuples via LSH into buckets, and compares only those pairs for deduplication. + + + + + :param X: Input Frame[String] with n rows and d columns (raw tuples) + :param gloveMatrix: Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimesnion + :param vocab: Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix) + :param similarityMeasure: (optional) String specifying similarity metric: "cosine", "euclidean" + :param threshold: (optional) Double: threshold value above which tuples are considered duplicates + :return: Frame[String] with deduplicated tuples + (first occurrence of each duplicate group is retained) + :return: Frame[String] with all detected duplicates + (i.e., tuples removed from the input) + """ + + params_dict = {'X': X, 'gloveMatrix': gloveMatrix, 'vocab': vocab} + params_dict.update(kwargs) + + vX_0 = Frame(X.sds_context, '') + vX_1 = Frame(X.sds_context, '') + output_nodes = [vX_0, vX_1, ] + + op = MultiReturn(X.sds_context, 'dedup', output_nodes, named_input_nodes=params_dict) + + vX_0._unnamed_input_nodes = [op] + vX_1._unnamed_input_nodes = [op] + + return op diff --git a/src/main/python/systemds/operator/algorithm/builtin/glove.py b/src/main/python/systemds/operator/algorithm/builtin/glove.py index cbf9c421c4..3df38dfbfb 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/glove.py +++ b/src/main/python/systemds/operator/algorithm/builtin/glove.py @@ -42,6 +42,7 @@ def glove(input: Frame, distanceWeighting: bool, symmetric: bool): """ + Computes the vector embeddings for words in a large text corpus.
