This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new b2f3966991 [SYSTEMDS-3701] Add additional text representations to Scuro
b2f3966991 is described below
commit b2f3966991806a4d4a975c05e5808af8ac2f0669
Author: Christina Dionysio <[email protected]>
AuthorDate: Mon Dec 2 17:19:00 2024 +0100
[SYSTEMDS-3701] Add additional text representations to Scuro
Closes #2146.
---
.github/workflows/python.yml | 2 +
.../python/systemds/scuro/representations/bert.py | 39 ++++---------
.../python/systemds/scuro/representations/bow.py | 51 +++++++++++++++++
.../python/systemds/scuro/representations/glove.py | 66 ++++++++++++++++++++++
.../scuro/representations/{utils.py => tfidf.py} | 31 +++++++---
.../python/systemds/scuro/representations/utils.py | 38 +++++++++++++
.../systemds/scuro/representations/word2vec.py | 65 +++++++++++++++++++++
src/main/python/tests/scuro/data_generator.py | 2 +-
8 files changed, 255 insertions(+), 39 deletions(-)
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index e5d8f1539d..d448645d94 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -114,6 +114,8 @@ jobs:
torch \
librosa \
h5py \
+ nltk \
+ gensim \
black
- name: Build Python Package
diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py
index d68729a97e..85d0b1ad65 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -19,20 +19,12 @@
#
# -------------------------------------------------------------
-import pickle
-
import numpy as np
from systemds.scuro.representations.unimodal import UnimodalRepresentation
import torch
from transformers import BertTokenizer, BertModel
-import os
-
-
-def read_text_file(file_path):
- with open(file_path, "r", encoding="utf-8") as file:
- text = file.read()
- return text
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
class Bert(UnimodalRepresentation):
@@ -42,18 +34,8 @@ class Bert(UnimodalRepresentation):
self.avg_layers = avg_layers
self.output_file = output_file
- def parse_all(self, filepath, indices, get_sequences=False):
- # Assumes text is stored in .txt files
- data = []
- if os.path.isdir(filepath):
- for filename in os.listdir(filepath):
- f = os.path.join(filepath, filename)
- if os.path.isfile(f):
- with open(f, "r") as file:
- data.append(file.readlines()[0])
- else:
- with open(filepath, "r") as file:
- data = file.readlines()
+ def parse_all(self, filepath, indices):
+ data = read_data_from_file(filepath, indices)
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(
@@ -65,13 +47,13 @@ class Bert(UnimodalRepresentation):
else:
model = BertModel.from_pretrained(model_name)
- embeddings = self.create_embeddings(data, model, tokenizer)
+ embeddings = self.create_embeddings(list(data.values()), model, tokenizer)
if self.output_file is not None:
data = {}
for i in range(0, embeddings.shape[0]):
data[indices[i]] = embeddings[i]
- self.save_embeddings(data)
+ save_embeddings(data, self.output_file)
return embeddings
@@ -88,14 +70,13 @@ class Bert(UnimodalRepresentation):
outputs.hidden_states[i][:, 0, :]
for i in range(-self.avg_layers, 0)
]
- cls_embedding = torch.mean(torch.stack(cls_embedding), dim=0)
+ cls_embedding = torch.mean(torch.stack(cls_embedding), dim=0).numpy()
else:
cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
- embeddings.append(cls_embedding.numpy())
+ embeddings.append(cls_embedding)
+
+ if self.output_file is not None:
+ save_embeddings(embeddings, self.output_file)
embeddings = np.array(embeddings)
return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1]))
-
- def save_embeddings(self, data):
- with open(self.output_file, "wb") as file:
- pickle.dump(data, file)
diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py
new file mode 100644
index 0000000000..dc5013b354
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -0,0 +1,51 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+
+
+class BoW(UnimodalRepresentation):
+ def __init__(self, ngram_range, min_df, output_file=None):
+ super().__init__("BoW")
+ self.ngram_range = ngram_range
+ self.min_df = min_df
+ self.output_file = output_file
+
+ def parse_all(self, filepath, indices):
+ vectorizer = CountVectorizer(
+ ngram_range=(1, self.ngram_range), min_df=self.min_df
+ )
+
+ segments = read_data_from_file(filepath, indices)
+ X = vectorizer.fit_transform(segments.values())
+ X = X.toarray()
+
+ if self.output_file is not None:
+ df = pd.DataFrame(X)
+ df.index = segments.keys()
+
+ save_embeddings(df, self.output_file)
+
+ return X
diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py
new file mode 100644
index 0000000000..840360540e
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -0,0 +1,66 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import nltk
+import numpy as np
+from nltk import word_tokenize
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+
+
+def load_glove_embeddings(file_path):
+ embeddings = {}
+ with open(file_path, "r", encoding="utf-8") as f:
+ for line in f:
+ values = line.split()
+ word = values[0]
+ vector = np.asarray(values[1:], dtype="float32")
+ embeddings[word] = vector
+ return embeddings
+
+
+class GloVe(UnimodalRepresentation):
+ def __init__(self, glove_path, output_file=None):
+ super().__init__("GloVe")
+ self.glove_path = glove_path
+ self.output_file = output_file
+
+ def parse_all(self, filepath, indices):
+ glove_embeddings = load_glove_embeddings(self.glove_path)
+ segments = read_data_from_file(filepath, indices)
+
+ embeddings = {}
+ for k, v in segments.items():
+ tokens = word_tokenize(v.lower())
+ embeddings[k] = np.mean(
+ [
+ glove_embeddings[token]
+ for token in tokens
+ if token in glove_embeddings
+ ],
+ axis=0,
+ )
+
+ if self.output_file is not None:
+ save_embeddings(embeddings, self.output_file)
+
+ embeddings = np.array(list(embeddings.values()))
+ return embeddings
diff --git a/src/main/python/systemds/scuro/representations/utils.py b/src/main/python/systemds/scuro/representations/tfidf.py
similarity index 51%
copy from src/main/python/systemds/scuro/representations/utils.py
copy to src/main/python/systemds/scuro/representations/tfidf.py
index e23dd89dd7..15515dd538 100644
--- a/src/main/python/systemds/scuro/representations/utils.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -19,17 +19,30 @@
#
# -------------------------------------------------------------
-import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
-def pad_sequences(sequences, maxlen=None, dtype="float32", value=0):
- if maxlen is None:
- maxlen = max([len(seq) for seq in sequences])
- result = np.full((len(sequences), maxlen), value, dtype=dtype)
+class TfIdf(UnimodalRepresentation):
+ def __init__(self, min_df, output_file=None):
+ super().__init__("TF-IDF")
+ self.min_df = min_df
+ self.output_file = output_file
- for i, seq in enumerate(sequences):
- data = seq[:maxlen]
- result[i, : len(data)] = data
+ def parse_all(self, filepath, indices):
+ vectorizer = TfidfVectorizer(min_df=self.min_df)
- return result
+ segments = read_data_from_file(filepath, indices)
+ X = vectorizer.fit_transform(segments.values())
+ X = X.toarray()
+
+ if self.output_file is not None:
+ df = pd.DataFrame(X)
+ df.index = segments.keys()
+
+ save_embeddings(df, self.output_file)
+
+ return X
diff --git a/src/main/python/systemds/scuro/representations/utils.py b/src/main/python/systemds/scuro/representations/utils.py
index e23dd89dd7..7551c6cb2b 100644
--- a/src/main/python/systemds/scuro/representations/utils.py
+++ b/src/main/python/systemds/scuro/representations/utils.py
@@ -18,6 +18,8 @@
# under the License.
#
# -------------------------------------------------------------
+import os
+import pickle
import numpy as np
@@ -33,3 +35,39 @@ def pad_sequences(sequences, maxlen=None, dtype="float32", value=0):
result[i, : len(data)] = data
return result
+
+
+def get_segments(data, key_prefix):
+ segments = {}
+ counter = 1
+ for line in data:
+ line = line.replace("\n", "")
+ segments[key_prefix + str(counter)] = line
+ counter += 1
+
+ return segments
+
+
+def read_data_from_file(filepath, indices):
+ data = {}
+
+ is_dir = True if os.path.isdir(filepath) else False
+
+ if is_dir:
+ files = os.listdir(filepath)
+
+ # get file extension
+ _, ext = os.path.splitext(files[0])
+ for key in indices:
+ with open(filepath + key + ext) as segm:
+ data.update(get_segments(segm, key + "_"))
+ else:
+ with open(filepath) as file:
+ data.update(get_segments(file, ""))
+
+ return data
+
+
+def save_embeddings(data, file_name):
+ with open(file_name, "wb") as file:
+ pickle.dump(data, file)
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
new file mode 100644
index 0000000000..cc8a180889
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -0,0 +1,65 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import numpy as np
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+from gensim.models import Word2Vec
+from nltk.tokenize import word_tokenize
+import nltk
+
+
+def get_embedding(sentence, model):
+ vectors = []
+ for word in sentence:
+ if word in model.wv:
+ vectors.append(model.wv[word])
+
+ return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
+
+
+class W2V(UnimodalRepresentation):
+ def __init__(self, vector_size, min_count, window, output_file=None):
+ super().__init__("Word2Vec")
+ self.vector_size = vector_size
+ self.min_count = min_count
+ self.window = window
+ self.output_file = output_file
+
+ def parse_all(self, filepath, indices):
+ segments = read_data_from_file(filepath, indices)
+ embeddings = {}
+ t = [word_tokenize(s.lower()) for s in segments.values()]
+ model = Word2Vec(
+ sentences=t,
+ vector_size=self.vector_size,
+ window=self.window,
+ min_count=self.min_count,
+ )
+
+ for k, v in segments.items():
+ tokenized_words = word_tokenize(v.lower())
+ embeddings[k] = get_embedding(tokenized_words, model)
+
+ if self.output_file is not None:
+ save_embeddings(embeddings, self.output_file)
+
+ return np.array(list(embeddings.values()))
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 9ded5316d9..9f5b8dd2d7 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -36,7 +36,7 @@ class TestDataGenerator:
self.balanced = balanced
for modality in modalities:
- mod_path = f"{self.path}/{modality.name.lower()}"
+ mod_path = f"{self.path}/{modality.name.lower()}/"
os.mkdir(mod_path)
modality.file_path = mod_path
self.labels = []