(systemds) 02/03: [MINOR] Correct dedub builtin docs

baunsgaard Wed, 24 Sep 2025 06:26:32 -0700

This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


commit 0e79c9b9c3e3af2525c44e4f7417f7703d6a7c72
Author: Sebastian Baunsgaard <[email protected]>
AuthorDate: Wed Sep 24 15:01:06 2025 +0200

    [MINOR] Correct dedub builtin docs
---
 scripts/builtin/dedup.dml                          | 10 ++--
 .../python/systemds/operator/algorithm/__init__.py |  2 +
 .../systemds/operator/algorithm/builtin/dedup.py   | 68 ++++++++++++++++++++++
 .../systemds/operator/algorithm/builtin/glove.py   |  1 +
 4 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/scripts/builtin/dedup.dml b/scripts/builtin/dedup.dml
index 1ec2e29c39..af2ecafcdc 100644
--- a/scripts/builtin/dedup.dml
+++ b/scripts/builtin/dedup.dml
@@ -28,11 +28,11 @@
 #
 # INPUT:
 # 
--------------------------------------------------------------------------------------
-# X                 Input Frame[String] with n rows and d columns (raw tuples)
-# gloveMatrix       Matrix[Double] of size |V| × e (pretrained GloVe 
embeddings) -> |V| number of words and e = embedding dimesnion
-# vocab             Frame[String] of size |V| × 1 (vocabulary aligned with 
gloveMatrix)
-# similarityMeasure (optional) String specifying similarity metric: "cosine", 
"euclidean"
-# threshold         (optional) Double: threshold value above which tuples are 
considered duplicates
+# X                  Input Frame[String] with n rows and d columns (raw tuples)
+# gloveMatrix        Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimension
+# vocab              Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix)
+# similarityMeasure  (optional) String specifying similarity metric: "cosine", "euclidean"
+# threshold          (optional) Double: threshold value above which tuples are considered duplicates
 # 
--------------------------------------------------------------------------------------
 #
 # OUTPUT:
diff --git a/src/main/python/systemds/operator/algorithm/__init__.py 
b/src/main/python/systemds/operator/algorithm/__init__.py
index c198a7138e..e8cb4c04e9 100644
--- a/src/main/python/systemds/operator/algorithm/__init__.py
+++ b/src/main/python/systemds/operator/algorithm/__init__.py
@@ -53,6 +53,7 @@ from .builtin.dbscan import dbscan
 from .builtin.dbscanApply import dbscanApply 
 from .builtin.decisionTree import decisionTree 
 from .builtin.decisionTreePredict import decisionTreePredict 
+from .builtin.dedup import dedup 
 from .builtin.deepWalk import deepWalk 
 from .builtin.denialConstraints import denialConstraints 
 from .builtin.differenceStatistics import differenceStatistics 
@@ -251,6 +252,7 @@ __all__ = ['WoE',
  'dbscanApply',
  'decisionTree',
  'decisionTreePredict',
+ 'dedup',
  'deepWalk',
  'denialConstraints',
  'differenceStatistics',
diff --git a/src/main/python/systemds/operator/algorithm/builtin/dedup.py 
b/src/main/python/systemds/operator/algorithm/builtin/dedup.py
new file mode 100644
index 0000000000..13d5c35a41
--- /dev/null
+++ b/src/main/python/systemds/operator/algorithm/builtin/dedup.py
@@ -0,0 +1,68 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# Autogenerated By   : src/main/python/generator/generator.py
+# Autogenerated From : scripts/builtin/dedup.dml
+
+from typing import Dict, Iterable
+
+from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, 
Scalar
+from systemds.utils.consts import VALID_INPUT_TYPES
+
+
+def dedup(X: Frame,
+          gloveMatrix: Matrix,
+          vocab: Frame,
+          **kwargs: Dict[str, VALID_INPUT_TYPES]):
+    """
+     Builtin for deduplication using distributed representations (DRs) and
+     locality-sensitive hashing (LSH) based blocking.
+    
+     The function encodes each input tuple as a dense vector using pre-trained GloVe embeddings (simple averaging),
+     groups semantically similar tuples via LSH into buckets, and compares only those pairs for deduplication.
+     
+    
+    
+    
+    :param X: Input Frame[String] with n rows and d columns (raw tuples)
+    :param gloveMatrix: Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimension
+    :param vocab: Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix)
+    :param similarityMeasure: (optional) String specifying similarity metric: "cosine", "euclidean"
+    :param threshold: (optional) Double: threshold value above which tuples are considered duplicates
+    :return: Frame[String] with deduplicated tuples
+        (first occurrence of each duplicate group is retained)
+    :return: Frame[String] with all detected duplicates
+        (i.e., tuples removed from the input)
+    """
+
+    params_dict = {'X': X, 'gloveMatrix': gloveMatrix, 'vocab': vocab}
+    params_dict.update(kwargs)
+    
+    vX_0 = Frame(X.sds_context, '')
+    vX_1 = Frame(X.sds_context, '')
+    output_nodes = [vX_0, vX_1, ]
+
+    op = MultiReturn(X.sds_context, 'dedup', output_nodes, 
named_input_nodes=params_dict)
+
+    vX_0._unnamed_input_nodes = [op]
+    vX_1._unnamed_input_nodes = [op]
+
+    return op
diff --git a/src/main/python/systemds/operator/algorithm/builtin/glove.py 
b/src/main/python/systemds/operator/algorithm/builtin/glove.py
index cbf9c421c4..3df38dfbfb 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/glove.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/glove.py
@@ -42,6 +42,7 @@ def glove(input: Frame,
           distanceWeighting: bool,
           symmetric: bool):
     """
+     Computes the vector embeddings for words in a large text corpus.

(systemds) 02/03: [MINOR] Correct dedub builtin docs

Reply via email to