This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 704b6fbbca [SYSTEMDS-3701] Rework Scuro modalities, update python 3.8
704b6fbbca is described below
commit 704b6fbbca709907e13539e6693e009bf86a0d31
Author: Christina Dionysio <[email protected]>
AuthorDate: Fri Jan 10 14:31:36 2025 +0100
[SYSTEMDS-3701] Rework Scuro modalities, update python 3.8
Closes #2155.
---
src/main/python/systemds/scuro/__init__.py | 45 +++----
.../python/systemds/scuro/aligner/alignment.py | 3 +-
.../python/systemds/scuro/aligner/dr_search.py | 35 +++---
.../unimodal.py => dataloader/__init__.py} | 18 ---
.../unimodal.py => dataloader/audio_loader.py} | 29 ++---
.../systemds/scuro/dataloader/base_loader.py | 92 ++++++++++++++
.../unimodal.py => dataloader/json_loader.py} | 33 ++---
.../sum.py => dataloader/text_loader.py} | 52 ++++----
.../sum.py => dataloader/video_loader.py} | 44 ++++---
src/main/python/systemds/scuro/main.py | 43 ++++---
.../systemds/scuro/modality/aligned_modality.py | 51 --------
.../systemds/scuro/modality/audio_modality.py | 61 ---------
.../python/systemds/scuro/modality/modality.py | 40 ++----
.../systemds/scuro/modality/text_modality.py | 61 ---------
.../python/systemds/scuro/modality/transformed.py | 52 ++++++++
.../unimodal.py => modality/type.py} | 23 ++--
.../systemds/scuro/modality/unimodal_modality.py | 59 +++++++++
.../systemds/scuro/modality/video_modality.py | 61 ---------
.../systemds/scuro/representations/average.py | 2 +-
.../python/systemds/scuro/representations/bert.py | 15 +--
.../python/systemds/scuro/representations/bow.py | 14 +--
.../scuro/representations/concatenation.py | 2 +-
.../systemds/scuro/representations/fusion.py | 2 +-
.../python/systemds/scuro/representations/glove.py | 31 +++--
.../python/systemds/scuro/representations/lstm.py | 8 +-
.../python/systemds/scuro/representations/max.py | 2 +-
.../scuro/representations/mel_spectrogram.py | 23 ++--
.../scuro/representations/multiplication.py | 2 +-
.../systemds/scuro/representations/resnet.py | 137 ++++++++-------------
.../systemds/scuro/representations/rowmax.py | 9 +-
.../python/systemds/scuro/representations/sum.py | 2 +-
.../python/systemds/scuro/representations/tfidf.py | 11 +-
.../systemds/scuro/representations/unimodal.py | 2 +-
.../systemds/scuro/representations/word2vec.py | 21 ++--
src/main/python/tests/scuro/data_generator.py | 19 ++-
src/main/python/tests/scuro/test_data_loaders.py | 82 ++++++------
src/main/python/tests/scuro/test_dr_search.py | 43 +++----
37 files changed, 553 insertions(+), 676 deletions(-)
diff --git a/src/main/python/systemds/scuro/__init__.py
b/src/main/python/systemds/scuro/__init__.py
index 84494a158e..53b68d430f 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -18,59 +18,60 @@
# under the License.
#
# -------------------------------------------------------------
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.dataloader.json_loader import JSONLoader
from systemds.scuro.representations.representation import Representation
from systemds.scuro.representations.average import Average
from systemds.scuro.representations.concatenation import Concatenation
-from systemds.scuro.representations.fusion import Fusion
from systemds.scuro.representations.sum import Sum
from systemds.scuro.representations.max import RowMax
from systemds.scuro.representations.multiplication import Multiplication
from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
from systemds.scuro.representations.resnet import ResNet
from systemds.scuro.representations.bert import Bert
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
from systemds.scuro.representations.lstm import LSTM
-from systemds.scuro.representations.representation_dataloader import (
- NPY,
- Pickle,
- HDF5,
- JSON,
-)
+from systemds.scuro.representations.bow import BoW
+from systemds.scuro.representations.glove import GloVe
+from systemds.scuro.representations.tfidf import TfIdf
+from systemds.scuro.representations.word2vec import W2V
from systemds.scuro.models.model import Model
from systemds.scuro.models.discrete_model import DiscreteModel
-from systemds.scuro.modality.aligned_modality import AlignedModality
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.video_modality import VideoModality
-from systemds.scuro.modality.text_modality import TextModality
from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.modality.type import ModalityType
from systemds.scuro.aligner.dr_search import DRSearch
from systemds.scuro.aligner.task import Task
__all__ = [
+ "BaseLoader",
+ "AudioLoader",
+ "VideoLoader",
+ "TextLoader",
"Representation",
"Average",
"Concatenation",
- "Fusion",
"Sum",
"RowMax",
"Multiplication",
"MelSpectrogram",
"ResNet",
"Bert",
- "UnimodalRepresentation",
"LSTM",
- "NPY",
- "Pickle",
- "HDF5",
- "JSON",
+ "BoW",
+ "GloVe",
+ "TfIdf",
+ "W2V",
"Model",
"DiscreteModel",
- "AlignedModality",
- "AudioModality",
- "VideoModality",
- "TextModality",
"Modality",
+ "UnimodalModality",
+ "TransformedModality",
+ "ModalityType",
"DRSearch",
"Task",
]
diff --git a/src/main/python/systemds/scuro/aligner/alignment.py
b/src/main/python/systemds/scuro/aligner/alignment.py
index e341e1b76b..62f88a272b 100644
--- a/src/main/python/systemds/scuro/aligner/alignment.py
+++ b/src/main/python/systemds/scuro/aligner/alignment.py
@@ -19,7 +19,6 @@
#
# -------------------------------------------------------------
from aligner.alignment_strategy import AlignmentStrategy
-from modality.aligned_modality import AlignedModality
from modality.modality import Modality
from modality.representation import Representation
from aligner.similarity_measures import Measure
@@ -46,4 +45,4 @@ class Alignment:
self.similarity_measure = similarity_measure
def align_modalities(self) -> Modality:
- return AlignedModality(Representation())
+ return Modality(Representation())
diff --git a/src/main/python/systemds/scuro/aligner/dr_search.py
b/src/main/python/systemds/scuro/aligner/dr_search.py
index 24f3c3236f..b46139dff3 100644
--- a/src/main/python/systemds/scuro/aligner/dr_search.py
+++ b/src/main/python/systemds/scuro/aligner/dr_search.py
@@ -23,7 +23,6 @@ import random
from typing import List
from systemds.scuro.aligner.task import Task
-from systemds.scuro.modality.aligned_modality import AlignedModality
from systemds.scuro.modality.modality import Modality
from systemds.scuro.representations.representation import Representation
@@ -64,27 +63,25 @@ class DRSearch:
def set_best_params(
self,
- modality_name: str,
representation: Representation,
scores: List[float],
modality_names: List[str],
):
"""
Updates the best parameters for given modalities, representation, and
score
- :param modality_name: The name of the aligned modality
:param representation: The representation used to retrieve the current
score
- :param score: achieved score for the set of modalities and
representation
+ :param scores: achieved train/test scores for the set of modalities
and representation
:param modality_names: List of modality names used in this setting
:return:
"""
# check if modality name is already in dictionary
- if modality_name not in self.scores.keys():
+ if "_".join(modality_names) not in self.scores.keys():
# if not add it to dictionary
- self.scores[modality_name] = {}
+ self.scores["_".join(modality_names)] = {}
# set score for representation
- self.scores[modality_name][representation] = scores
+ self.scores["_".join(modality_names)][representation] = scores
# compare current score with best score
if scores[1] > self.best_score:
@@ -113,13 +110,12 @@ class DRSearch:
modality_combination = random.choice(modalities)
representation = random.choice(self.representations)
- modality = AlignedModality(representation, list(modality_combination))
# noqa
- modality.combine()
+ modality = modality_combination[0].combine(
+ modality_combination[1:], representation
+ )
scores = self.task.run(modality.data)
- self.set_best_params(
- modality.name, representation, scores,
modality.get_modality_names()
- )
+ self.set_best_params(representation, scores,
modality.get_modality_names())
return self.best_representation, self.best_score, self.best_modalities
@@ -133,14 +129,14 @@ class DRSearch:
for M in range(1, len(self.modalities) + 1):
for combination in itertools.combinations(self.modalities, M):
for representation in self.representations:
- modality = AlignedModality(
- representation, list(combination)
- ) # noqa
- modality.combine()
+ modality = combination[0]
+ if len(combination) > 1:
+ modality = combination[0].combine(
+ list(combination[1:]), representation
+ )
scores = self.task.run(modality.data)
self.set_best_params(
- modality.name,
representation,
scores,
modality.get_modality_names(),
@@ -164,7 +160,8 @@ class DRSearch:
for modality_name in self.best_modalities:
used_modalities.append(get_modalities_by_name(modalities,
modality_name))
- modality = AlignedModality(self.best_representation, used_modalities)
# noqa
- modality.combine(self.task.train_indices)
+ modality = used_modalities[0].combine(
+ used_modalities[1:], self.best_representation
+ )
return modality.data
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py
b/src/main/python/systemds/scuro/dataloader/__init__.py
similarity index 63%
copy from src/main/python/systemds/scuro/representations/unimodal.py
copy to src/main/python/systemds/scuro/dataloader/__init__.py
index ccd6197765..e66abb4646 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/dataloader/__init__.py
@@ -18,21 +18,3 @@
# under the License.
#
# -------------------------------------------------------------
-from systemds.scuro.representations.representation import Representation
-
-
-class UnimodalRepresentation(Representation):
- def __init__(self, name):
- """
- Parent class for all unimodal representation types
- :param name: name of the representation
- """
- super().__init__(name)
-
- def parse_all(self, file_path, indices):
- raise f"Not implemented for {self.name}"
-
-
-class PixelRepresentation(UnimodalRepresentation):
- def __init__(self):
- super().__init__("Pixel")
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py
b/src/main/python/systemds/scuro/dataloader/audio_loader.py
similarity index 64%
copy from src/main/python/systemds/scuro/representations/unimodal.py
copy to src/main/python/systemds/scuro/dataloader/audio_loader.py
index ccd6197765..f85b1b80fa 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -18,21 +18,22 @@
# under the License.
#
# -------------------------------------------------------------
-from systemds.scuro.representations.representation import Representation
+from typing import List, Optional
+import librosa
+from systemds.scuro.dataloader.base_loader import BaseLoader
-class UnimodalRepresentation(Representation):
- def __init__(self, name):
- """
- Parent class for all unimodal representation types
- :param name: name of the representation
- """
- super().__init__(name)
- def parse_all(self, file_path, indices):
- raise f"Not implemented for {self.name}"
+class AudioLoader(BaseLoader):
+ def __init__(
+ self,
+ source_path: str,
+ indices: List[str],
+ chunk_size: Optional[int] = None,
+ ):
+ super().__init__(source_path, indices, chunk_size)
-
-class PixelRepresentation(UnimodalRepresentation):
- def __init__(self):
- super().__init__("Pixel")
+ def extract(self, file: str):
+ self.file_sanity_check(file)
+ audio, sr = librosa.load(file)
+ self.data.append(audio)
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py
b/src/main/python/systemds/scuro/dataloader/base_loader.py
new file mode 100644
index 0000000000..2ef60677c6
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -0,0 +1,92 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import os
+from abc import ABC, abstractmethod
+from typing import List, Optional, Union
+
+
+class BaseLoader(ABC):
+ def __init__(
+ self, source_path: str, indices: List[str], chunk_size: Optional[int]
= None
+ ):
+ """
+ Base class that loads raw data for a given list of indices and stores
it in the data object
+ :param source_path: The location where the raw data lies
+ :param indices: A list of indices, given as strings, that correspond to
the file names
+ :param chunk_size: An optional argument to load the data in chunks
instead of all at once
+ (otherwise please provide your own Dataloader that knows about the
file name convention)
+ """
+ self.data = []
+ self.source_path = source_path
+ self.indices = indices
+ self.chunk_size = chunk_size
+ self.next_chunk = 0
+
+ if self.chunk_size:
+ self.num_chunks = int(len(self.indices) / self.chunk_size)
+
+ def load(self):
+ """
+ Takes care of loading the raw data either chunk wise (if chunk size is
defined) or all at once
+ """
+ if self.chunk_size:
+ return self._load_next_chunk()
+
+ return self._load(self.indices)
+
+ def _load_next_chunk(self):
+ """
+ Loads the next chunk of data
+ """
+ self.data = []
+ next_chunk_indices = self.indices[
+ self.next_chunk * self.chunk_size : (self.next_chunk + 1) *
self.chunk_size
+ ]
+ self.next_chunk += 1
+ return self._load(next_chunk_indices)
+
+ def _load(self, indices: List[str]):
+ is_dir = True if os.path.isdir(self.source_path) else False
+
+ if is_dir:
+ _, ext = os.path.splitext(os.listdir(self.source_path)[0])
+ for index in indices:
+ self.extract(self.source_path + index + ext)
+ else:
+ self.extract(self.source_path, indices)
+
+ return self.data
+
+ @abstractmethod
+ def extract(self, file: str, index: Optional[Union[str, List[str]]] =
None):
+ pass
+
+ def file_sanity_check(self, file):
+ """
+ Checks that the file can be found and is not empty
+ """
+ try:
+ file_size = os.path.getsize(file)
+ except:
+ raise (f"Error: File {0} not found!".format(file))
+
+ if file_size == 0:
+ raise ("File {0} is empty".format(file))
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py
b/src/main/python/systemds/scuro/dataloader/json_loader.py
similarity index 59%
copy from src/main/python/systemds/scuro/representations/unimodal.py
copy to src/main/python/systemds/scuro/dataloader/json_loader.py
index ccd6197765..c4e3b95611 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -18,21 +18,26 @@
# under the License.
#
# -------------------------------------------------------------
-from systemds.scuro.representations.representation import Representation
+import json
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from typing import Optional, List
-class UnimodalRepresentation(Representation):
- def __init__(self, name):
- """
- Parent class for all unimodal representation types
- :param name: name of the representation
- """
- super().__init__(name)
- def parse_all(self, file_path, indices):
- raise f"Not implemented for {self.name}"
+class JSONLoader(BaseLoader):
+ def __init__(
+ self,
+ source_path: str,
+ indices: List[str],
+ field: str,
+ chunk_size: Optional[int] = None,
+ ):
+ super().__init__(source_path, indices, chunk_size)
+ self.field = field
-
-class PixelRepresentation(UnimodalRepresentation):
- def __init__(self):
- super().__init__("Pixel")
+ def extract(self, file: str, indices: List[str]):
+ self.file_sanity_check(file)
+ with open(file) as f:
+ json_file = json.load(f)
+ for idx in indices:
+ self.data.append(json_file[idx][self.field])
diff --git a/src/main/python/systemds/scuro/representations/sum.py
b/src/main/python/systemds/scuro/dataloader/text_loader.py
similarity index 55%
copy from src/main/python/systemds/scuro/representations/sum.py
copy to src/main/python/systemds/scuro/dataloader/text_loader.py
index bfb19d4f7d..f614472bce 100644
--- a/src/main/python/systemds/scuro/representations/sum.py
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -18,31 +18,27 @@
# under the License.
#
# -------------------------------------------------------------
-
-from typing import List
-
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.utils import pad_sequences
-
-from systemds.scuro.representations.fusion import Fusion
-
-
-class Sum(Fusion):
- def __init__(self):
- """
- Combines modalities using colum-wise sum
- """
- super().__init__("Sum")
-
- def fuse(self, modalities: List[Modality]):
- max_emb_size = self.get_max_embedding_size(modalities)
-
- data = pad_sequences(modalities[0].data, maxlen=max_emb_size,
dtype="float32")
-
- for m in range(1, len(modalities)):
- data += pad_sequences(
- modalities[m].data, maxlen=max_emb_size, dtype="float32"
- )
-
- return data
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from typing import Optional, Pattern, List
+import re
+
+
+class TextLoader(BaseLoader):
+ def __init__(
+ self,
+ source_path: str,
+ indices: List[str],
+ chunk_size: Optional[int] = None,
+ prefix: Optional[Pattern[str]] = None,
+ ):
+ super().__init__(source_path, indices, chunk_size)
+ self.prefix = prefix
+
+ def extract(self, file: str):
+ self.file_sanity_check(file)
+ with open(file) as text_file:
+ for i, line in enumerate(text_file):
+ if self.prefix:
+ line = re.sub(self.prefix, "", line)
+ line = line.replace("\n", "")
+ self.data.append(line)
diff --git a/src/main/python/systemds/scuro/representations/sum.py
b/src/main/python/systemds/scuro/dataloader/video_loader.py
similarity index 54%
copy from src/main/python/systemds/scuro/representations/sum.py
copy to src/main/python/systemds/scuro/dataloader/video_loader.py
index bfb19d4f7d..6da20b3475 100644
--- a/src/main/python/systemds/scuro/representations/sum.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -18,31 +18,35 @@
# under the License.
#
# -------------------------------------------------------------
+from typing import List, Optional
-from typing import List
+import numpy as np
+from systemds.scuro.dataloader.base_loader import BaseLoader
+import cv2
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.utils import pad_sequences
-from systemds.scuro.representations.fusion import Fusion
+class VideoLoader(BaseLoader):
+ def __init__(
+ self,
+ source_path: str,
+ indices: List[str],
+ chunk_size: Optional[int] = None,
+ ):
+ super().__init__(source_path, indices, chunk_size)
+ def extract(self, file: str):
+ self.file_sanity_check(file)
+ cap = cv2.VideoCapture(file)
+ frames = []
+ while cap.isOpened():
+ ret, frame = cap.read()
-class Sum(Fusion):
- def __init__(self):
- """
- Combines modalities using colum-wise sum
- """
- super().__init__("Sum")
+ if not ret:
+ break
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+ frame = frame.astype(np.float32) / 255.0
- def fuse(self, modalities: List[Modality]):
- max_emb_size = self.get_max_embedding_size(modalities)
+ frames.append(frame)
- data = pad_sequences(modalities[0].data, maxlen=max_emb_size,
dtype="float32")
-
- for m in range(1, len(modalities)):
- data += pad_sequences(
- modalities[m].data, maxlen=max_emb_size, dtype="float32"
- )
-
- return data
+ self.data.append(frames)
diff --git a/src/main/python/systemds/scuro/main.py
b/src/main/python/systemds/scuro/main.py
index f28b271b97..8a51e098cc 100644
--- a/src/main/python/systemds/scuro/main.py
+++ b/src/main/python/systemds/scuro/main.py
@@ -18,21 +18,20 @@
# under the License.
#
# -------------------------------------------------------------
-import collections
-import json
-from datetime import datetime
-
+from systemds.scuro.representations.bert import Bert
+from systemds.scuro.representations.resnet import ResNet
+from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
from systemds.scuro.representations.average import Average
from systemds.scuro.representations.concatenation import Concatenation
-from systemds.scuro.modality.aligned_modality import AlignedModality
-from systemds.scuro.modality.text_modality import TextModality
-from systemds.scuro.modality.video_modality import VideoModality
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.representations.unimodal import Pickle, JSON, HDF5, NPY
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.models.discrete_model import DiscreteModel
from systemds.scuro.aligner.task import Task
from systemds.scuro.aligner.dr_search import DRSearch
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
+
class CustomTask(Task):
def __init__(self, model, labels, train_indices, val_indices):
@@ -49,18 +48,32 @@ labels = []
train_indices = []
val_indices = []
+all_indices = []
+
video_path = ""
audio_path = ""
text_path = ""
+
+# Define dataloaders
+video_data_loader = VideoLoader(video_path, all_indices, chunk_size=10)
+text_data_loader = TextLoader(text_path, all_indices)
+audio_data_loader = AudioLoader(audio_path, all_indices)
+
# Load modalities (audio, video, text)
-video = VideoModality(video_path, HDF5(), train_indices)
-audio = AudioModality(audio_path, Pickle(), train_indices)
-text = TextModality(text_path, NPY(), train_indices)
+video = UnimodalModality(video_data_loader, "VIDEO")
+audio = UnimodalModality(audio_data_loader, "AUDIO")
+text = UnimodalModality(text_data_loader, "TEXT")
+
+# Define unimodal representations
+r_v = ResNet()
+r_a = MelSpectrogram()
+r_t = Bert()
-video.read_all()
-audio.read_all()
-text.read_all()
+# Transform raw unimodal data
+video.apply_representation(r_v)
+audio.apply_representation(r_a)
+text.apply_representation(r_t)
modalities = [text, audio, video]
diff --git a/src/main/python/systemds/scuro/modality/aligned_modality.py
b/src/main/python/systemds/scuro/modality/aligned_modality.py
deleted file mode 100644
index 839b9d296f..0000000000
--- a/src/main/python/systemds/scuro/modality/aligned_modality.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-from typing import List
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.fusion import Fusion
-
-
-class AlignedModality(Modality):
- def __init__(self, representation: Fusion, modalities: List[Modality]):
- """
- Defines the modality that is created during the fusion process
- :param representation: The representation for the aligned modality
- :param modalities: List of modalities to be combined
- """
- name = ""
- for modality in modalities:
- name += modality.name
- super().__init__(representation, modality_name=name)
- self.modalities = modalities
-
- def combine(self):
- """
- Initiates the call to fuse the given modalities depending on the
Fusion type
- """
- self.data = self.representation.fuse(self.modalities) # noqa
-
- def get_modality_names(self):
- names = []
- for modality in self.modalities:
- names.append(modality.name)
-
- return names
diff --git a/src/main/python/systemds/scuro/modality/audio_modality.py
b/src/main/python/systemds/scuro/modality/audio_modality.py
deleted file mode 100644
index ba84962226..0000000000
--- a/src/main/python/systemds/scuro/modality/audio_modality.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import os
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
-
-
-class AudioModality(Modality):
- def __init__(
- self,
- file_path: str,
- representation: UnimodalRepresentation,
- train_indices=None,
- start_index: int = 0,
- ):
- """
- Creates an audio modality
- :param file_path: path to file where the audio embeddings are stored
- :param representation: Unimodal representation that indicates how to
extract the data from the file
- """
- super().__init__(representation, start_index, "Audio", train_indices)
- self.file_path = file_path
-
- def file_sanity_check(self):
- """
- Checks if the file can be found is not empty
- """
- try:
- file_size = os.path.getsize(self.file_path)
- except:
- raise (f"Error: File {0} not found!".format(self.file_path))
-
- if file_size == 0:
- raise ("File {0} is empty".format(self.file_path))
-
- def read_chunk(self):
- pass
-
- def read_all(self, indices=None):
- self.data = self.representation.parse_all(
- self.file_path, indices=indices
- ) # noqa
diff --git a/src/main/python/systemds/scuro/modality/modality.py
b/src/main/python/systemds/scuro/modality/modality.py
index a899576d5b..9a3d1b148d 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -18,41 +18,27 @@
# under the License.
#
# -------------------------------------------------------------
+from typing import List
-from systemds.scuro.representations.representation import Representation
+from systemds.scuro.modality.type import ModalityType
class Modality:
- def __init__(
- self,
- representation: Representation,
- start_index: int = 0,
- modality_name="",
- train_indices=None,
- ):
+ def __init__(self, modality_type: ModalityType):
"""
- Parent class of the different Modalities
- :param representation: Specifies how the data should be represented
for a specific modality
- :param start_index: Defines the first index used for the alignment
- :param modality_name: Name of the modality
- :param train_indices: List of indices used for train-test split
+ Parent class of the different Modalities (unimodal & multimodal)
+ :param modality_type: Type of the modality
"""
- self.representation = representation
- self.start_index = start_index
- self.name = modality_name
+ self.type = modality_type
self.data = None
- self.train_indices = train_indices
+ self.data_type = None
+ self.cost = None
+ self.shape = None
+ self.schema = {}
- def read_chunk(self):
+ def get_modality_names(self) -> List[str]:
"""
- Extracts a data chunk of the modality according to the window size
specified in params
+ Extracts the individual unimodal modalities for a given transformed
modality.
"""
- raise NotImplementedError
-
- def read_all(self, indices):
- """
- Implemented for every unique modality to read all samples from a
specified format
- :param indices: List of indices to be read
- """
- pass
+ return [modality.name for modality in ModalityType if modality in
self.type]
diff --git a/src/main/python/systemds/scuro/modality/text_modality.py
b/src/main/python/systemds/scuro/modality/text_modality.py
deleted file mode 100644
index c636de7167..0000000000
--- a/src/main/python/systemds/scuro/modality/text_modality.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import os
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
-
-
-class TextModality(Modality):
- def __init__(
- self,
- file_path: str,
- representation: UnimodalRepresentation,
- train_indices=None,
- start_index: int = 0,
- ):
- """
- Creates a text modality
- :param file_path: path to file(s) where the text data is stored
- :param representation: Unimodal representation that indicates how to
extract the data from the file
- """
- super().__init__(representation, start_index, "Text", train_indices)
- self.file_path = file_path
-
- def file_sanity_check(self):
- """
- Checks if the file can be found is not empty
- """
- try:
- file_size = os.path.getsize(self.file_path)
- except:
- raise (f"Error: File {0} not found!".format(self.file_path))
-
- if file_size == 0:
- raise ("File {0} is empty".format(self.file_path))
-
- def read_chunk(self):
- pass
-
- def read_all(self, indices=None):
- self.data = self.representation.parse_all(
- self.file_path, indices=indices
- ) # noqa
diff --git a/src/main/python/systemds/scuro/modality/transformed.py
b/src/main/python/systemds/scuro/modality/transformed.py
new file mode 100644
index 0000000000..61c327e469
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -0,0 +1,52 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from functools import reduce
+from operator import or_
+
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.type import ModalityType
+
+
+class TransformedModality(Modality):
+    """
+    Modality whose data is the result of a transformation, i.e. the output
+    of a representation or of a fusion method.
+    """
+
+    def __init__(self, modality_type: ModalityType, transformation):
+        """
+        Creates a modality that holds transformed data
+        :param modality_type: Type of the original modality(ies)
+        :param transformation: Representation to be applied on the modality
+        """
+        super().__init__(modality_type)
+        self.transformation = transformation
+
+    def combine(self, other, fusion_method):
+        """
+        Combines this modality with one or more other modalities using a
+        dedicated fusion method
+        :param other: A single modality or an iterable of modalities to be
+        combined with this one
+        :param fusion_method: The fusion method to be used to combine
+        modalities; its transform() receives the list [self, *other]
+        :return: A new TransformedModality holding the fused data, typed
+        with the bitwise OR of the types of all combined modalities
+        """
+        # Accept a lone modality for convenience and materialize iterables
+        # once: the operands are traversed twice below (type union, fusion),
+        # which would silently exhaust a generator.
+        others = [other] if isinstance(other, Modality) else list(other)
+
+        fused_modality = TransformedModality(
+            reduce(or_, (o.type for o in others), self.type), fusion_method
+        )
+        fused_modality.data = fusion_method.transform([self, *others])
+
+        return fused_modality
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py
b/src/main/python/systemds/scuro/modality/type.py
similarity index 64%
copy from src/main/python/systemds/scuro/representations/unimodal.py
copy to src/main/python/systemds/scuro/modality/type.py
index ccd6197765..c451eea6f1 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -18,21 +18,14 @@
# under the License.
#
# -------------------------------------------------------------
-from systemds.scuro.representations.representation import Representation
+from enum import Enum, Flag, auto
-class UnimodalRepresentation(Representation):
- def __init__(self, name):
- """
- Parent class for all unimodal representation types
- :param name: name of the representation
- """
- super().__init__(name)
+class ModalityType(Flag):
+ """
+ Kinds of modalities. Being a Flag, members can be combined with the
+ bitwise operators (e.g. TEXT | AUDIO) to describe a multimodal type.
+ """
+ TEXT = auto()
+ AUDIO = auto()
+ VIDEO = auto()
- def parse_all(self, file_path, indices):
- raise f"Not implemented for {self.name}"
-
-
-class PixelRepresentation(UnimodalRepresentation):
- def __init__(self):
- super().__init__("Pixel")
+ # def __init__(self, value, name):
+ # self._value_ = value
+ # self.name = name
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py
b/src/main/python/systemds/scuro/modality/unimodal_modality.py
new file mode 100644
index 0000000000..976d4194d4
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -0,0 +1,59 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.modality.type import ModalityType
+
+
+class UnimodalModality(Modality):
+    """
+    A single modality whose raw data is obtained through a data loader.
+    """
+
+    def __init__(self, data_loader: BaseLoader, modality_type: ModalityType):
+        """
+        This class represents a unimodal modality.
+        :param data_loader: Defines how the raw data should be loaded
+        :param modality_type: Type of the modality
+        """
+        super().__init__(modality_type)
+        self.data_loader = data_loader
+
+    def extract_raw_data(self):
+        """
+        Uses the data loader to read the raw data from a specified location
+        and stores the result in self.data.
+        TODO: schema
+        """
+        self.data = self.data_loader.load()
+
+    def _has_raw_data(self):
+        """
+        Checks whether raw data has already been loaded
+        :return: False if self.data is None or empty, True otherwise
+        """
+        if self.data is None:
+            return False
+        try:
+            # Prefer len() over truthiness: the truth value of a
+            # multi-element numpy array is ambiguous and raises ValueError.
+            return len(self.data) > 0
+        except TypeError:
+            # Unsized objects keep the plain truthiness test.
+            return bool(self.data)
+
+    def apply_representation(self, representation):
+        """
+        Applies a representation to the raw data of this modality
+        :param representation: Object whose transform() is called on the
+        raw data
+        :return: A new TransformedModality of the same type that holds the
+        transformed data
+        """
+        new_modality = TransformedModality(self.type, representation)
+
+        if self.data_loader.chunk_size:
+            # Chunked mode: load and transform one chunk at a time and
+            # collect the results in a flat list.
+            # NOTE(review): relies on load() advancing next_chunk; if the
+            # loader is already exhausted the result is an empty list.
+            # Confirm against BaseLoader.
+            new_modality.data = []
+            while self.data_loader.next_chunk < self.data_loader.num_chunks:
+                self.extract_raw_data()
+                new_modality.data.extend(representation.transform(self.data))
+        else:
+            # Load lazily, only if nothing has been read yet.
+            if not self._has_raw_data():
+                self.extract_raw_data()
+            new_modality.data = representation.transform(self.data)
+
+        return new_modality
diff --git a/src/main/python/systemds/scuro/modality/video_modality.py
b/src/main/python/systemds/scuro/modality/video_modality.py
deleted file mode 100644
index a6cedf6c86..0000000000
--- a/src/main/python/systemds/scuro/modality/video_modality.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import os
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
-
-
-class VideoModality(Modality):
- def __init__(
- self,
- file_path: str,
- representation: UnimodalRepresentation,
- train_indices=None,
- start_index: int = 0,
- ):
- """
- Creates a video modality
- :param file_path: path to file where the video embeddings (for now)
are stored
- :param representation: Unimodal representation that indicates how to
extract the data from the file
- """
- super().__init__(representation, start_index, "Video", train_indices)
- self.file_path = file_path
-
- def file_sanity_check(self):
- """
- Checks if the file can be found is not empty
- """
- try:
- file_size = os.path.getsize(self.file_path)
- except:
- raise (f"Error: File {0} not found!".format(self.file_path))
-
- if file_size == 0:
- raise ("File {0} is empty".format(self.file_path))
-
- def read_chunk(self):
- pass
-
- def read_all(self, indices=None):
- self.data = self.representation.parse_all(
- self.file_path, indices=indices
- ) # noqa
diff --git a/src/main/python/systemds/scuro/representations/average.py
b/src/main/python/systemds/scuro/representations/average.py
index d10778f113..db44050e9e 100644
--- a/src/main/python/systemds/scuro/representations/average.py
+++ b/src/main/python/systemds/scuro/representations/average.py
@@ -36,7 +36,7 @@ class Average(Fusion):
"""
super().__init__("Average")
- def fuse(self, modalities: List[Modality]):
+ def transform(self, modalities: List[Modality]):
max_emb_size = self.get_max_embedding_size(modalities)
padded_modalities = []
diff --git a/src/main/python/systemds/scuro/representations/bert.py
b/src/main/python/systemds/scuro/representations/bert.py
index 85d0b1ad65..0fcf1e8d28 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -24,7 +24,7 @@ import numpy as np
from systemds.scuro.representations.unimodal import UnimodalRepresentation
import torch
from transformers import BertTokenizer, BertModel
-from systemds.scuro.representations.utils import read_data_from_file,
save_embeddings
+from systemds.scuro.representations.utils import save_embeddings
class Bert(UnimodalRepresentation):
@@ -34,8 +34,7 @@ class Bert(UnimodalRepresentation):
self.avg_layers = avg_layers
self.output_file = output_file
- def parse_all(self, filepath, indices):
- data = read_data_from_file(filepath, indices)
+ def transform(self, data):
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(
@@ -47,13 +46,10 @@ class Bert(UnimodalRepresentation):
else:
model = BertModel.from_pretrained(model_name)
- embeddings = self.create_embeddings(list(data.values()), model,
tokenizer)
+ embeddings = self.create_embeddings(data, model, tokenizer)
if self.output_file is not None:
- data = {}
- for i in range(0, embeddings.shape[0]):
- data[indices[i]] = embeddings[i]
- save_embeddings(data, self.output_file)
+ save_embeddings(embeddings, self.output_file)
return embeddings
@@ -75,8 +71,5 @@ class Bert(UnimodalRepresentation):
cls_embedding = outputs.last_hidden_state[:, 0,
:].squeeze().numpy()
embeddings.append(cls_embedding)
- if self.output_file is not None:
- save_embeddings(embeddings, self.output_file)
-
embeddings = np.array(embeddings)
return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1]))
diff --git a/src/main/python/systemds/scuro/representations/bow.py
b/src/main/python/systemds/scuro/representations/bow.py
index dc5013b354..bd54654a5c 100644
--- a/src/main/python/systemds/scuro/representations/bow.py
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -19,11 +19,10 @@
#
# -------------------------------------------------------------
-import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from systemds.scuro.representations.unimodal import UnimodalRepresentation
-from systemds.scuro.representations.utils import read_data_from_file,
save_embeddings
+from systemds.scuro.representations.utils import save_embeddings
class BoW(UnimodalRepresentation):
@@ -33,19 +32,14 @@ class BoW(UnimodalRepresentation):
self.min_df = min_df
self.output_file = output_file
- def parse_all(self, filepath, indices):
+ def transform(self, data):
vectorizer = CountVectorizer(
ngram_range=(1, self.ngram_range), min_df=self.min_df
)
- segments = read_data_from_file(filepath, indices)
- X = vectorizer.fit_transform(segments.values())
- X = X.toarray()
+ X = vectorizer.fit_transform(data).toarray()
if self.output_file is not None:
- df = pd.DataFrame(X)
- df.index = segments.keys()
-
- save_embeddings(df, self.output_file)
+ save_embeddings(X, self.output_file)
return X
diff --git a/src/main/python/systemds/scuro/representations/concatenation.py
b/src/main/python/systemds/scuro/representations/concatenation.py
index 7694fa6897..fd9293d399 100644
--- a/src/main/python/systemds/scuro/representations/concatenation.py
+++ b/src/main/python/systemds/scuro/representations/concatenation.py
@@ -37,7 +37,7 @@ class Concatenation(Fusion):
super().__init__("Concatenation")
self.padding = padding
- def fuse(self, modalities: List[Modality]):
+ def transform(self, modalities: List[Modality]):
if len(modalities) == 1:
return np.array(modalities[0].data)
diff --git a/src/main/python/systemds/scuro/representations/fusion.py
b/src/main/python/systemds/scuro/representations/fusion.py
index e84e59f666..623979dd05 100644
--- a/src/main/python/systemds/scuro/representations/fusion.py
+++ b/src/main/python/systemds/scuro/representations/fusion.py
@@ -32,7 +32,7 @@ class Fusion(Representation):
"""
super().__init__(name)
- def fuse(self, modalities: List[Modality]):
+ def transform(self, modalities: List[Modality]):
"""
Implemented for every child class and creates a fused representation
out of
multiple modalities
diff --git a/src/main/python/systemds/scuro/representations/glove.py
b/src/main/python/systemds/scuro/representations/glove.py
index 840360540e..cf13c717d2 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -18,7 +18,6 @@
# under the License.
#
# -------------------------------------------------------------
-import nltk
import numpy as np
from nltk import word_tokenize
@@ -43,24 +42,24 @@ class GloVe(UnimodalRepresentation):
self.glove_path = glove_path
self.output_file = output_file
- def parse_all(self, filepath, indices):
+ def transform(self, data):
glove_embeddings = load_glove_embeddings(self.glove_path)
- segments = read_data_from_file(filepath, indices)
- embeddings = {}
- for k, v in segments.items():
- tokens = word_tokenize(v.lower())
- embeddings[k] = np.mean(
- [
- glove_embeddings[token]
- for token in tokens
- if token in glove_embeddings
- ],
- axis=0,
+ embeddings = []
+ for sentences in data:
+ tokens = word_tokenize(sentences.lower())
+ embeddings.append(
+ np.mean(
+ [
+ glove_embeddings[token]
+ for token in tokens
+ if token in glove_embeddings
+ ],
+ axis=0,
+ )
)
if self.output_file is not None:
- save_embeddings(embeddings, self.output_file)
+ save_embeddings(np.array(embeddings), self.output_file)
- embeddings = np.array(list(embeddings.values()))
- return embeddings
+ return np.array(embeddings)
diff --git a/src/main/python/systemds/scuro/representations/lstm.py
b/src/main/python/systemds/scuro/representations/lstm.py
index 3687ff6514..649b81117b 100644
--- a/src/main/python/systemds/scuro/representations/lstm.py
+++ b/src/main/python/systemds/scuro/representations/lstm.py
@@ -40,17 +40,17 @@ class LSTM(Fusion):
self.dropout_rate = dropout_rate
self.unimodal_embeddings = {}
- def fuse(self, modalities: List[Modality], train_indices=None):
+ def transform(self, modalities: List[Modality]):
size = len(modalities[0].data)
result = np.zeros((size, 0))
for modality in modalities:
- if modality.name in self.unimodal_embeddings.keys():
- out = self.unimodal_embeddings.get(modality.name)
+ if modality.type in self.unimodal_embeddings.keys():
+ out = self.unimodal_embeddings.get(modality.type)
else:
out = self.run_lstm(modality.data)
- self.unimodal_embeddings[modality.name] = out
+ self.unimodal_embeddings[modality.type] = out
result = np.concatenate([result, out], axis=-1)
diff --git a/src/main/python/systemds/scuro/representations/max.py
b/src/main/python/systemds/scuro/representations/max.py
index 2e1e864477..194b20801e 100644
--- a/src/main/python/systemds/scuro/representations/max.py
+++ b/src/main/python/systemds/scuro/representations/max.py
@@ -38,7 +38,7 @@ class RowMax(Fusion):
super().__init__("RowMax")
self.split = split
- def fuse(
+ def transform(
self,
modalities: List[Modality],
):
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 27aba8b997..57a7fab83e 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -19,7 +19,6 @@
#
# -------------------------------------------------------------
-import os
import pickle
import librosa
@@ -35,19 +34,15 @@ class MelSpectrogram(UnimodalRepresentation):
self.avg = avg
self.output_file = output_file
- def parse_all(self, file_path, indices, get_sequences=False):
+ def transform(self, data):
result = []
max_length = 0
- if os.path.isdir(file_path):
- for filename in os.listdir(file_path):
- f = os.path.join(file_path, filename)
- if os.path.isfile(f):
- y, sr = librosa.load(f)
- S = librosa.feature.melspectrogram(y=y, sr=sr)
- S_dB = librosa.power_to_db(S, ref=np.max)
- if S_dB.shape[-1] > max_length:
- max_length = S_dB.shape[-1]
- result.append(S_dB)
+ for sample in data:
+ S = librosa.feature.melspectrogram(y=sample)
+ S_dB = librosa.power_to_db(S, ref=np.max)
+ if S_dB.shape[-1] > max_length:
+ max_length = S_dB.shape[-1]
+ result.append(S_dB)
r = []
for elem in result:
@@ -57,9 +52,9 @@ class MelSpectrogram(UnimodalRepresentation):
np_array_r = np.array(r) if not self.avg else np.mean(np.array(r),
axis=1)
if self.output_file is not None:
- data = {}
+ data = []
for i in range(0, np_array_r.shape[0]):
- data[indices[i]] = np_array_r[i]
+ data.append(np_array_r[i])
with open(self.output_file, "wb") as file:
pickle.dump(data, file)
diff --git a/src/main/python/systemds/scuro/representations/multiplication.py
b/src/main/python/systemds/scuro/representations/multiplication.py
index 18f34bae6f..2934fe5b3c 100644
--- a/src/main/python/systemds/scuro/representations/multiplication.py
+++ b/src/main/python/systemds/scuro/representations/multiplication.py
@@ -36,7 +36,7 @@ class Multiplication(Fusion):
"""
super().__init__("Multiplication")
- def fuse(self, modalities: List[Modality], train_indices=None):
+ def transform(self, modalities: List[Modality], train_indices=None):
max_emb_size = self.get_max_embedding_size(modalities)
data = pad_sequences(modalities[0].data, maxlen=max_emb_size,
dtype="float32")
diff --git a/src/main/python/systemds/scuro/representations/resnet.py
b/src/main/python/systemds/scuro/representations/resnet.py
index 75c921184b..1c1bfa1d5e 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -25,8 +25,6 @@ import h5py
from systemds.scuro.representations.unimodal import UnimodalRepresentation
from typing import Callable, Dict, Tuple, Any
import torch.utils.data
-import os
-import cv2
import torch
import torchvision.models as models
import torchvision.transforms as transforms
@@ -36,22 +34,25 @@ DEVICE = "cpu"
class ResNet(UnimodalRepresentation):
- def __init__(self, output_file=None):
+ def __init__(self, layer="avgpool", output_file=None):
super().__init__("ResNet")
self.output_file = output_file
+ self.layer_name = layer
- def parse_all(self, file_path, indices, get_sequences=False):
- resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT)
+ def transform(self, data):
+
+ resnet =
models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE)
resnet.eval()
for param in resnet.parameters():
param.requires_grad = False
- transform = transforms.Compose(
+ t = transforms.Compose(
[
transforms.ToPILImage(),
- transforms.Resize((224, 224)),
+ transforms.Resize(256),
+ transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
@@ -59,7 +60,7 @@ class ResNet(UnimodalRepresentation):
]
)
- dataset = ResNetDataset(transform=transform,
video_folder_path=file_path)
+ dataset = ResNetDataset(data, t)
embeddings = {}
class Identity(torch.nn.Module):
@@ -70,108 +71,72 @@ class ResNet(UnimodalRepresentation):
res5c_output = None
- def avg_pool_hook(
- _module: torch.nn.Module, input_: Tuple[torch.Tensor], _output: Any
- ) -> None:
- nonlocal res5c_output
- res5c_output = input_[0]
+ def get_features(name_):
+ def hook(
+ _module: torch.nn.Module, input_: Tuple[torch.Tensor], output:
Any
+ ):
+ nonlocal res5c_output
+ res5c_output = output
+
+ return hook
- resnet.avgpool.register_forward_hook(avg_pool_hook)
+ if self.layer_name:
+ for name, layer in resnet.named_modules():
+ if name == self.layer_name:
+ layer.register_forward_hook(get_features(name))
+ break
for instance in torch.utils.data.DataLoader(dataset):
video_id = instance["id"][0]
frames = instance["frames"][0].to(DEVICE)
- embeddings[video_id] = torch.empty((len(frames), 2048))
- batch_size = 32
+ embeddings[video_id] = []
+ batch_size = 64
+
for start_index in range(0, len(frames), batch_size):
end_index = min(start_index + batch_size, len(frames))
frame_ids_range = range(start_index, end_index)
frame_batch = frames[frame_ids_range]
- avg_pool_value = resnet(frame_batch)
+ _ = resnet(frame_batch)
+ values = res5c_output
+
+ if self.layer_name == "avgpool" or self.layer_name ==
"maxpool":
+ embeddings[video_id].extend(
+ torch.flatten(values, 1).detach().cpu().numpy()
+ )
- embeddings[video_id][frame_ids_range] =
avg_pool_value.to(DEVICE)
+ else:
+ pooled = torch.nn.functional.adaptive_avg_pool2d(values,
(1, 1))
+
+ embeddings[video_id].extend(
+ torch.flatten(pooled, 1).detach().cpu().numpy()
+ )
if self.output_file is not None:
with h5py.File(self.output_file, "w") as hdf:
for key, value in embeddings.items():
hdf.create_dataset(key, data=value)
- emb = np.zeros((len(indices), 2048), dtype="float32")
- if indices is not None:
- for i in indices:
- emb[i] = embeddings.get(str(i)).mean(dim=0).numpy()
- else:
- for i, key in enumerate(embeddings.keys()):
- emb[i] = embeddings.get(key).mean(dim=0).numpy()
-
- return emb
-
- @staticmethod
- def extract_features_from_video(video_path, model, transform):
- cap = cv2.VideoCapture(video_path)
- features = []
- count = 0
- success, frame = cap.read()
-
- while success:
- success, frame = cap.read()
- transformed_frame = transform(frame).unsqueeze(0)
+ emb = []
- with torch.no_grad():
- feature_vector = model(transformed_frame)
- feature_vector = feature_vector.view(-1).numpy()
+ for video in embeddings.values():
+ emb.append(np.array(video).mean(axis=0).tolist())
- features.append(feature_vector)
-
- count += 1
-
- cap.release()
- return features, count
+ return np.array(emb)
class ResNetDataset(torch.utils.data.Dataset):
- def __init__(self, video_folder_path: str, transform: Callable = None):
- self.video_folder_path = video_folder_path
- self.transform = transform
- self.video_ids = []
- video_files = [
- f
- for f in os.listdir(self.video_folder_path)
- if f.lower().endswith((".mp4", ".avi", ".mov", ".mkv"))
- ]
- self.file_extension = video_files[0].split(".")[-1]
-
- for video in video_files:
- video_id, _ = video.split("/")[-1].split(".")
- self.video_ids.append(video_id)
-
- self.frame_count_by_video_id = {video_id: 0 for video_id in
self.video_ids}
+ def __init__(self, data: str, tf: Callable = None):
+ self.data = data
+ self.tf = tf
def __getitem__(self, index) -> Dict[str, object]:
- video_id = self.video_ids[index]
- video_path = self.video_folder_path + "/" + video_id + "." +
self.file_extension
-
- frames = None
- count = 0
-
- cap = cv2.VideoCapture(video_path)
-
- success, frame = cap.read()
-
- num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
- self.frame_count_by_video_id[video_id] = num_frames
- if frames is None and success:
- frames = torch.empty((num_frames, 3, 224, 224))
-
- while success:
- frame = self.transform(frame)
- frames[count] = frame # noqa
- success, frame = cap.read()
- count += 1
+ video = self.data[index]
+ frames = torch.empty((len(video), 3, 224, 224))
- cap.release()
- return {"id": video_id, "frames": frames}
+ for i, frame in enumerate(video):
+ frames[i] = self.tf(frame)
+ return {"id": index, "frames": frames}
def __len__(self) -> int:
- return len(self.video_ids)
+ return len(self.data)
diff --git a/src/main/python/systemds/scuro/representations/rowmax.py
b/src/main/python/systemds/scuro/representations/rowmax.py
index 0dc201e2ee..3152782026 100644
--- a/src/main/python/systemds/scuro/representations/rowmax.py
+++ b/src/main/python/systemds/scuro/representations/rowmax.py
@@ -23,10 +23,10 @@ from typing import List
import numpy as np
-from modality.modality import Modality
+from systemds.scuro.modality.modality import Modality
from systemds.scuro.representations.utils import pad_sequences
-from representations.fusion import Fusion
+from systemds.scuro.representations.fusion import Fusion
class RowMax(Fusion):
@@ -38,7 +38,7 @@ class RowMax(Fusion):
super().__init__("RowMax")
self.split = split
- def fuse(self, modalities: List[Modality], train_indices):
+ def transform(self, modalities: List[Modality]):
if len(modalities) < 2:
return np.array(modalities)
@@ -46,8 +46,7 @@ class RowMax(Fusion):
padded_modalities = []
for modality in modalities:
- scaled = self.scale_data(modality.data, train_indices)
- d = pad_sequences(scaled, maxlen=max_emb_size, dtype="float32")
+ d = pad_sequences(modality.data, maxlen=max_emb_size,
dtype="float32")
padded_modalities.append(d)
split_rows = int(len(modalities[0].data) / self.split)
diff --git a/src/main/python/systemds/scuro/representations/sum.py
b/src/main/python/systemds/scuro/representations/sum.py
index bfb19d4f7d..0608338a0f 100644
--- a/src/main/python/systemds/scuro/representations/sum.py
+++ b/src/main/python/systemds/scuro/representations/sum.py
@@ -35,7 +35,7 @@ class Sum(Fusion):
"""
super().__init__("Sum")
- def fuse(self, modalities: List[Modality]):
+ def transform(self, modalities: List[Modality]):
max_emb_size = self.get_max_embedding_size(modalities)
data = pad_sequences(modalities[0].data, maxlen=max_emb_size,
dtype="float32")
diff --git a/src/main/python/systemds/scuro/representations/tfidf.py
b/src/main/python/systemds/scuro/representations/tfidf.py
index 15515dd538..4849aba136 100644
--- a/src/main/python/systemds/scuro/representations/tfidf.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -19,7 +19,6 @@
#
# -------------------------------------------------------------
-import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from systemds.scuro.representations.unimodal import UnimodalRepresentation
@@ -32,17 +31,13 @@ class TfIdf(UnimodalRepresentation):
self.min_df = min_df
self.output_file = output_file
- def parse_all(self, filepath, indices):
+ def transform(self, data):
vectorizer = TfidfVectorizer(min_df=self.min_df)
- segments = read_data_from_file(filepath, indices)
- X = vectorizer.fit_transform(segments.values())
+ X = vectorizer.fit_transform(data)
X = X.toarray()
if self.output_file is not None:
- df = pd.DataFrame(X)
- df.index = segments.keys()
-
- save_embeddings(df, self.output_file)
+ save_embeddings(X, self.output_file)
return X
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py
b/src/main/python/systemds/scuro/representations/unimodal.py
index ccd6197765..c56d611a74 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/representations/unimodal.py
@@ -29,7 +29,7 @@ class UnimodalRepresentation(Representation):
"""
super().__init__(name)
- def parse_all(self, file_path, indices):
+ def transform(self, data):
raise f"Not implemented for {self.name}"
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py
b/src/main/python/systemds/scuro/representations/word2vec.py
index cc8a180889..209091648d 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -21,10 +21,9 @@
import numpy as np
from systemds.scuro.representations.unimodal import UnimodalRepresentation
-from systemds.scuro.representations.utils import read_data_from_file,
save_embeddings
+from systemds.scuro.representations.utils import save_embeddings
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
-import nltk
def get_embedding(sentence, model):
@@ -44,22 +43,20 @@ class W2V(UnimodalRepresentation):
self.window = window
self.output_file = output_file
- def parse_all(self, filepath, indices):
- segments = read_data_from_file(filepath, indices)
- embeddings = {}
- t = [word_tokenize(s.lower()) for s in segments.values()]
+ def transform(self, data):
+ t = [word_tokenize(s.lower()) for s in data]
model = Word2Vec(
sentences=t,
vector_size=self.vector_size,
window=self.window,
min_count=self.min_count,
)
-
- for k, v in segments.items():
- tokenized_words = word_tokenize(v.lower())
- embeddings[k] = get_embedding(tokenized_words, model)
+ embeddings = []
+ for sentences in data:
+ tokens = word_tokenize(sentences.lower())
+ embeddings.append(get_embedding(tokens, model))
if self.output_file is not None:
- save_embeddings(embeddings, self.output_file)
+ save_embeddings(np.array(embeddings), self.output_file)
- return np.array(list(embeddings.values()))
+ return np.array(embeddings)
diff --git a/src/main/python/tests/scuro/data_generator.py
b/src/main/python/tests/scuro/data_generator.py
index 9f5b8dd2d7..6856ee7044 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -23,10 +23,7 @@ import numpy as np
from scipy.io.wavfile import write
import random
import os
-
-from systemds.scuro.modality.video_modality import VideoModality
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.text_modality import TextModality
+from systemds.scuro.modality.type import ModalityType
class TestDataGenerator:
@@ -36,7 +33,7 @@ class TestDataGenerator:
self.balanced = balanced
for modality in modalities:
- mod_path = f"{self.path}/{modality.name.lower()}/"
+ mod_path = f"{self.path}/{modality.type.name}/"
os.mkdir(mod_path)
modality.file_path = mod_path
self.labels = []
@@ -72,17 +69,17 @@ class TestDataGenerator:
speed_slow += 1
for modality in self.modalities:
- if isinstance(modality, VideoModality):
+ if modality.type == ModalityType.VIDEO:
self.__create_video_data(idx, duration, 30, speed_factor)
- if isinstance(modality, AudioModality):
+ if modality.type == ModalityType.AUDIO:
self.__create_audio_data(idx, duration, speed_factor)
- if isinstance(modality, TextModality):
+ if modality.type == ModalityType.TEXT:
self.__create_text_data(idx, speed_factor)
np.save(f"{self.path}/labels.npy", np.array(self.labels))
def __create_video_data(self, idx, duration, fps, speed_factor):
- path = f"{self.path}/video/{idx}.mp4"
+ path = f"{self.path}/VIDEO/{idx}.mp4"
width, height = 160, 120
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
@@ -108,13 +105,13 @@ class TestDataGenerator:
out.release()
def __create_text_data(self, idx, speed_factor):
- path = f"{self.path}/text/{idx}.txt"
+ path = f"{self.path}/TEXT/{idx}.txt"
with open(path, "w") as f:
f.write(f"The ball moves at speed factor {speed_factor:.2f}.")
def __create_audio_data(self, idx, duration, speed_factor):
- path = f"{self.path}/audio/{idx}.wav"
+ path = f"{self.path}/AUDIO/{idx}.wav"
sample_rate = 44100
t = np.linspace(0, duration, int(sample_rate * duration),
endpoint=False)
diff --git a/src/main/python/tests/scuro/test_data_loaders.py
b/src/main/python/tests/scuro/test_data_loaders.py
index cbbeafab8a..55704b8d8a 100644
--- a/src/main/python/tests/scuro/test_data_loaders.py
+++ b/src/main/python/tests/scuro/test_data_loaders.py
@@ -22,15 +22,17 @@
import os
import shutil
import unittest
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.text_modality import TextModality
-from systemds.scuro.modality.video_modality import VideoModality
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.representations.bert import Bert
from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
from systemds.scuro.representations.resnet import ResNet
-from systemds.scuro.representations.representation_dataloader import HDF5,
NPY, Pickle
from tests.scuro.data_generator import TestDataGenerator
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.modality.type import ModalityType
+
class TestDataLoaders(unittest.TestCase):
test_file_path = None
@@ -53,28 +55,26 @@ class TestDataLoaders(unittest.TestCase):
cls.num_instances = 2
cls.indizes = [str(i) for i in range(0, cls.num_instances)]
- cls.video = VideoModality(
- "",
ResNet(f"{cls.test_file_path}/embeddings/resnet_embeddings.hdf5")
- )
- cls.audio = AudioModality(
- "",
- MelSpectrogram(
-
output_file=f"{cls.test_file_path}/embeddings/mel_sp_embeddings.npy"
- ),
- )
- cls.text = TextModality(
- "",
- Bert(
- avg_layers=4,
-
output_file=f"{cls.test_file_path}/embeddings/bert_embeddings.pkl",
- ),
- )
- cls.mods = [cls.video, cls.audio, cls.text]
+
+ cls.video_path = cls.test_file_path + "/" + ModalityType.VIDEO.name +
"/"
+ cls.audio_path = cls.test_file_path + "/" + ModalityType.AUDIO.name +
"/"
+ cls.text_path = cls.test_file_path + "/" + ModalityType.TEXT.name + "/"
+
+ video_data_loader = VideoLoader(cls.video_path, cls.indizes)
+ audio_data_loader = AudioLoader(cls.audio_path, cls.indizes)
+ text_data_loader = TextLoader(cls.text_path, cls.indizes)
+
+ # Load modalities (audio, video, text)
+ video = UnimodalModality(video_data_loader, ModalityType.VIDEO)
+ audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO)
+ text = UnimodalModality(text_data_loader, ModalityType.TEXT)
+
+ cls.mods = [video, audio, text]
cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path)
cls.data_generator.create_multimodal_data(cls.num_instances)
- cls.text.read_all(cls.indizes)
- cls.audio.read_all(cls.indizes)
- cls.video.read_all([i for i in range(0, cls.num_instances)])
+ cls.text_ref = text.apply_representation(Bert())
+ cls.audio_ref = audio.apply_representation(MelSpectrogram())
+ cls.video_ref = video.apply_representation(ResNet())
@classmethod
def tearDownClass(cls):
@@ -82,35 +82,31 @@ class TestDataLoaders(unittest.TestCase):
shutil.rmtree(cls.test_file_path)
def test_load_audio_data_from_file(self):
- load_audio = AudioModality(
- f"{self.test_file_path}/embeddings/mel_sp_embeddings.npy", NPY()
- )
- load_audio.read_all(self.indizes)
+ audio_data_loader = AudioLoader(self.audio_path, self.indizes)
+ audio = UnimodalModality(
+ audio_data_loader, ModalityType.AUDIO
+ ).apply_representation(MelSpectrogram())
for i in range(0, self.num_instances):
- assert round(sum(self.audio.data[i]), 4) == round(
- sum(load_audio.data[i]), 4
- )
+ assert round(sum(self.audio_ref.data[i]), 4) ==
round(sum(audio.data[i]), 4)
def test_load_video_data_from_file(self):
- load_video = VideoModality(
- f"{self.test_file_path}/embeddings/resnet_embeddings.hdf5", HDF5()
- )
- load_video.read_all(self.indizes)
+ video_data_loader = VideoLoader(self.video_path, self.indizes)
+ video = UnimodalModality(
+ video_data_loader, ModalityType.VIDEO
+ ).apply_representation(ResNet())
for i in range(0, self.num_instances):
- assert round(sum(self.video.data[i]), 4) == round(
- sum(load_video.data[i]), 4
- )
+ assert round(sum(self.video_ref.data[i]), 4) ==
round(sum(video.data[i]), 4)
def test_load_text_data_from_file(self):
- load_text = TextModality(
- f"{self.test_file_path}/embeddings/bert_embeddings.pkl", Pickle()
- )
- load_text.read_all(self.indizes)
+ text_data_loader = TextLoader(self.text_path, self.indizes)
+ text = UnimodalModality(
+ text_data_loader, ModalityType.TEXT
+ ).apply_representation(Bert())
for i in range(0, self.num_instances):
- assert round(sum(self.text.data[i]), 4) ==
round(sum(load_text.data[i]), 4)
+ assert round(sum(self.text_ref.data[i]), 4) ==
round(sum(text.data[i]), 4)
if __name__ == "__main__":
diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py
index eac4a77641..d0d7ef5077 100644
--- a/src/main/python/tests/scuro/test_dr_search.py
+++ b/src/main/python/tests/scuro/test_dr_search.py
@@ -28,11 +28,13 @@ from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
from systemds.scuro.aligner.dr_search import DRSearch
from systemds.scuro.aligner.task import Task
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.text_modality import TextModality
-from systemds.scuro.modality.video_modality import VideoModality
from systemds.scuro.models.model import Model
from systemds.scuro.representations.average import Average
from systemds.scuro.representations.bert import Bert
@@ -101,28 +103,27 @@ class TestDataLoaders(unittest.TestCase):
cls.num_instances = 8
cls.indizes = [str(i) for i in range(0, cls.num_instances)]
- cls.video = VideoModality(
- "",
ResNet(f"{cls.test_file_path}/embeddings/resnet_embeddings.hdf5")
+
+ video_data_loader = VideoLoader(
+ cls.test_file_path + "/" + ModalityType.VIDEO.name + "/",
cls.indizes
)
- cls.audio = AudioModality(
- "",
- MelSpectrogram(
-
output_file=f"{cls.test_file_path}/embeddings/mel_sp_embeddings.npy"
- ),
+ audio_data_loader = AudioLoader(
+ cls.test_file_path + "/" + ModalityType.AUDIO.name + "/",
cls.indizes
)
- cls.text = TextModality(
- "",
- Bert(
- avg_layers=4,
-
output_file=f"{cls.test_file_path}/embeddings/bert_embeddings.pkl",
- ),
+ text_data_loader = TextLoader(
+ cls.test_file_path + "/" + ModalityType.TEXT.name + "/",
cls.indizes
)
- cls.mods = [cls.video, cls.audio, cls.text]
- cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path)
+ video = UnimodalModality(video_data_loader, ModalityType.VIDEO)
+ audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO)
+ text = UnimodalModality(text_data_loader, ModalityType.TEXT)
+ cls.data_generator = TestDataGenerator([video, audio, text],
cls.test_file_path)
cls.data_generator.create_multimodal_data(cls.num_instances)
- cls.text.read_all(cls.indizes)
- cls.audio.read_all(cls.indizes)
- cls.video.read_all([i for i in range(0, cls.num_instances)])
+
+ cls.bert = text.apply_representation(Bert())
+ cls.mel_spe = audio.apply_representation(MelSpectrogram())
+ cls.resnet = video.apply_representation(ResNet())
+
+ cls.mods = [cls.bert, cls.mel_spe, cls.resnet]
split = train_test_split(
cls.indizes, cls.data_generator.labels, test_size=0.2,
random_state=42