This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new b2f3966991 [SYSTEMDS-3701] Add additional text representations to Scuro
b2f3966991 is described below

commit b2f3966991806a4d4a975c05e5808af8ac2f0669
Author: Christina Dionysio <[email protected]>
AuthorDate: Mon Dec 2 17:19:00 2024 +0100

    [SYSTEMDS-3701] Add additional text representations to Scuro
    
    Closes #2146.
---
 .github/workflows/python.yml                       |  2 +
 .../python/systemds/scuro/representations/bert.py  | 39 ++++---------
 .../python/systemds/scuro/representations/bow.py   | 51 +++++++++++++++++
 .../python/systemds/scuro/representations/glove.py | 66 ++++++++++++++++++++++
 .../scuro/representations/{utils.py => tfidf.py}   | 31 +++++++---
 .../python/systemds/scuro/representations/utils.py | 38 +++++++++++++
 .../systemds/scuro/representations/word2vec.py     | 65 +++++++++++++++++++++
 src/main/python/tests/scuro/data_generator.py      |  2 +-
 8 files changed, 255 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index e5d8f1539d..d448645d94 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -114,6 +114,8 @@ jobs:
           torch \
           librosa \
           h5py \
+          nltk \
+          gensim \
           black
 
     - name: Build Python Package
diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py
index d68729a97e..85d0b1ad65 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -19,20 +19,12 @@
 #
 # -------------------------------------------------------------
 
-import pickle
-
 import numpy as np
 
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 import torch
 from transformers import BertTokenizer, BertModel
-import os
-
-
-def read_text_file(file_path):
-    with open(file_path, "r", encoding="utf-8") as file:
-        text = file.read()
-    return text
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
 
 
 class Bert(UnimodalRepresentation):
@@ -42,18 +34,8 @@ class Bert(UnimodalRepresentation):
         self.avg_layers = avg_layers
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices, get_sequences=False):
-        # Assumes text is stored in .txt files
-        data = []
-        if os.path.isdir(filepath):
-            for filename in os.listdir(filepath):
-                f = os.path.join(filepath, filename)
-                if os.path.isfile(f):
-                    with open(f, "r") as file:
-                        data.append(file.readlines()[0])
-        else:
-            with open(filepath, "r") as file:
-                data = file.readlines()
+    def parse_all(self, filepath, indices):
+        data = read_data_from_file(filepath, indices)
 
         model_name = "bert-base-uncased"
         tokenizer = BertTokenizer.from_pretrained(
@@ -65,13 +47,13 @@ class Bert(UnimodalRepresentation):
         else:
             model = BertModel.from_pretrained(model_name)
 
-        embeddings = self.create_embeddings(data, model, tokenizer)
+        embeddings = self.create_embeddings(list(data.values()), model, tokenizer)
 
         if self.output_file is not None:
             data = {}
             for i in range(0, embeddings.shape[0]):
                 data[indices[i]] = embeddings[i]
-            self.save_embeddings(data)
+            save_embeddings(data, self.output_file)
 
         return embeddings
 
@@ -88,14 +70,13 @@ class Bert(UnimodalRepresentation):
                     outputs.hidden_states[i][:, 0, :]
                     for i in range(-self.avg_layers, 0)
                 ]
-                cls_embedding = torch.mean(torch.stack(cls_embedding), dim=0)
+                cls_embedding = torch.mean(torch.stack(cls_embedding), dim=0).numpy()
             else:
                 cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
-            embeddings.append(cls_embedding.numpy())
+            embeddings.append(cls_embedding)
+
+        if self.output_file is not None:
+            save_embeddings(embeddings, self.output_file)
 
         embeddings = np.array(embeddings)
         return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1]))
-
-    def save_embeddings(self, data):
-        with open(self.output_file, "wb") as file:
-            pickle.dump(data, file)
diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py
new file mode 100644
index 0000000000..dc5013b354
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -0,0 +1,51 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+
+
+class BoW(UnimodalRepresentation):
+    def __init__(self, ngram_range, min_df, output_file=None):
+        super().__init__("BoW")
+        self.ngram_range = ngram_range
+        self.min_df = min_df
+        self.output_file = output_file
+
+    def parse_all(self, filepath, indices):
+        vectorizer = CountVectorizer(
+            ngram_range=(1, self.ngram_range), min_df=self.min_df
+        )
+
+        segments = read_data_from_file(filepath, indices)
+        X = vectorizer.fit_transform(segments.values())
+        X = X.toarray()
+
+        if self.output_file is not None:
+            df = pd.DataFrame(X)
+            df.index = segments.keys()
+
+            save_embeddings(df, self.output_file)
+
+        return X
diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py
new file mode 100644
index 0000000000..840360540e
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -0,0 +1,66 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import nltk
+import numpy as np
+from nltk import word_tokenize
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+
+
+def load_glove_embeddings(file_path):
+    embeddings = {}
+    with open(file_path, "r", encoding="utf-8") as f:
+        for line in f:
+            values = line.split()
+            word = values[0]
+            vector = np.asarray(values[1:], dtype="float32")
+            embeddings[word] = vector
+    return embeddings
+
+
+class GloVe(UnimodalRepresentation):
+    def __init__(self, glove_path, output_file=None):
+        super().__init__("GloVe")
+        self.glove_path = glove_path
+        self.output_file = output_file
+
+    def parse_all(self, filepath, indices):
+        glove_embeddings = load_glove_embeddings(self.glove_path)
+        segments = read_data_from_file(filepath, indices)
+
+        embeddings = {}
+        for k, v in segments.items():
+            tokens = word_tokenize(v.lower())
+            embeddings[k] = np.mean(
+                [
+                    glove_embeddings[token]
+                    for token in tokens
+                    if token in glove_embeddings
+                ],
+                axis=0,
+            )
+
+        if self.output_file is not None:
+            save_embeddings(embeddings, self.output_file)
+
+        embeddings = np.array(list(embeddings.values()))
+        return embeddings
diff --git a/src/main/python/systemds/scuro/representations/utils.py b/src/main/python/systemds/scuro/representations/tfidf.py
similarity index 51%
copy from src/main/python/systemds/scuro/representations/utils.py
copy to src/main/python/systemds/scuro/representations/tfidf.py
index e23dd89dd7..15515dd538 100644
--- a/src/main/python/systemds/scuro/representations/utils.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -19,17 +19,30 @@
 #
 # -------------------------------------------------------------
 
-import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
 
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
 
-def pad_sequences(sequences, maxlen=None, dtype="float32", value=0):
-    if maxlen is None:
-        maxlen = max([len(seq) for seq in sequences])
 
-    result = np.full((len(sequences), maxlen), value, dtype=dtype)
+class TfIdf(UnimodalRepresentation):
+    def __init__(self, min_df, output_file=None):
+        super().__init__("TF-IDF")
+        self.min_df = min_df
+        self.output_file = output_file
 
-    for i, seq in enumerate(sequences):
-        data = seq[:maxlen]
-        result[i, : len(data)] = data
+    def parse_all(self, filepath, indices):
+        vectorizer = TfidfVectorizer(min_df=self.min_df)
 
-    return result
+        segments = read_data_from_file(filepath, indices)
+        X = vectorizer.fit_transform(segments.values())
+        X = X.toarray()
+
+        if self.output_file is not None:
+            df = pd.DataFrame(X)
+            df.index = segments.keys()
+
+            save_embeddings(df, self.output_file)
+
+        return X
diff --git a/src/main/python/systemds/scuro/representations/utils.py b/src/main/python/systemds/scuro/representations/utils.py
index e23dd89dd7..7551c6cb2b 100644
--- a/src/main/python/systemds/scuro/representations/utils.py
+++ b/src/main/python/systemds/scuro/representations/utils.py
@@ -18,6 +18,8 @@
 # under the License.
 #
 # -------------------------------------------------------------
+import os
+import pickle
 
 import numpy as np
 
@@ -33,3 +35,39 @@ def pad_sequences(sequences, maxlen=None, dtype="float32", value=0):
         result[i, : len(data)] = data
 
     return result
+
+
+def get_segments(data, key_prefix):
+    segments = {}
+    counter = 1
+    for line in data:
+        line = line.replace("\n", "")
+        segments[key_prefix + str(counter)] = line
+        counter += 1
+
+    return segments
+
+
+def read_data_from_file(filepath, indices):
+    data = {}
+
+    is_dir = True if os.path.isdir(filepath) else False
+
+    if is_dir:
+        files = os.listdir(filepath)
+
+        # get file extension
+        _, ext = os.path.splitext(files[0])
+        for key in indices:
+            with open(filepath + key + ext) as segm:
+                data.update(get_segments(segm, key + "_"))
+    else:
+        with open(filepath) as file:
+            data.update(get_segments(file, ""))
+
+    return data
+
+
+def save_embeddings(data, file_name):
+    with open(file_name, "wb") as file:
+        pickle.dump(data, file)
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
new file mode 100644
index 0000000000..cc8a180889
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -0,0 +1,65 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import numpy as np
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+from gensim.models import Word2Vec
+from nltk.tokenize import word_tokenize
+import nltk
+
+
+def get_embedding(sentence, model):
+    vectors = []
+    for word in sentence:
+        if word in model.wv:
+            vectors.append(model.wv[word])
+
+    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
+
+
+class W2V(UnimodalRepresentation):
+    def __init__(self, vector_size, min_count, window, output_file=None):
+        super().__init__("Word2Vec")
+        self.vector_size = vector_size
+        self.min_count = min_count
+        self.window = window
+        self.output_file = output_file
+
+    def parse_all(self, filepath, indices):
+        segments = read_data_from_file(filepath, indices)
+        embeddings = {}
+        t = [word_tokenize(s.lower()) for s in segments.values()]
+        model = Word2Vec(
+            sentences=t,
+            vector_size=self.vector_size,
+            window=self.window,
+            min_count=self.min_count,
+        )
+
+        for k, v in segments.items():
+            tokenized_words = word_tokenize(v.lower())
+            embeddings[k] = get_embedding(tokenized_words, model)
+
+        if self.output_file is not None:
+            save_embeddings(embeddings, self.output_file)
+
+        return np.array(list(embeddings.values()))
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 9ded5316d9..9f5b8dd2d7 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -36,7 +36,7 @@ class TestDataGenerator:
         self.balanced = balanced
 
         for modality in modalities:
-            mod_path = f"{self.path}/{modality.name.lower()}"
+            mod_path = f"{self.path}/{modality.name.lower()}/"
             os.mkdir(mod_path)
             modality.file_path = mod_path
         self.labels = []

Reply via email to