This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 704b6fbbca [SYSTEMDS-3701] Rework Scuro modalities, update python 3.8
704b6fbbca is described below

commit 704b6fbbca709907e13539e6693e009bf86a0d31
Author: Christina Dionysio <[email protected]>
AuthorDate: Fri Jan 10 14:31:36 2025 +0100

    [SYSTEMDS-3701] Rework Scuro modalities, update python 3.8
    
    Closes #2155.
---
 src/main/python/systemds/scuro/__init__.py         |  45 +++----
 .../python/systemds/scuro/aligner/alignment.py     |   3 +-
 .../python/systemds/scuro/aligner/dr_search.py     |  35 +++---
 .../unimodal.py => dataloader/__init__.py}         |  18 ---
 .../unimodal.py => dataloader/audio_loader.py}     |  29 ++---
 .../systemds/scuro/dataloader/base_loader.py       |  92 ++++++++++++++
 .../unimodal.py => dataloader/json_loader.py}      |  33 ++---
 .../sum.py => dataloader/text_loader.py}           |  52 ++++----
 .../sum.py => dataloader/video_loader.py}          |  44 ++++---
 src/main/python/systemds/scuro/main.py             |  43 ++++---
 .../systemds/scuro/modality/aligned_modality.py    |  51 --------
 .../systemds/scuro/modality/audio_modality.py      |  61 ---------
 .../python/systemds/scuro/modality/modality.py     |  40 ++----
 .../systemds/scuro/modality/text_modality.py       |  61 ---------
 .../python/systemds/scuro/modality/transformed.py  |  52 ++++++++
 .../unimodal.py => modality/type.py}               |  23 ++--
 .../systemds/scuro/modality/unimodal_modality.py   |  59 +++++++++
 .../systemds/scuro/modality/video_modality.py      |  61 ---------
 .../systemds/scuro/representations/average.py      |   2 +-
 .../python/systemds/scuro/representations/bert.py  |  15 +--
 .../python/systemds/scuro/representations/bow.py   |  14 +--
 .../scuro/representations/concatenation.py         |   2 +-
 .../systemds/scuro/representations/fusion.py       |   2 +-
 .../python/systemds/scuro/representations/glove.py |  31 +++--
 .../python/systemds/scuro/representations/lstm.py  |   8 +-
 .../python/systemds/scuro/representations/max.py   |   2 +-
 .../scuro/representations/mel_spectrogram.py       |  23 ++--
 .../scuro/representations/multiplication.py        |   2 +-
 .../systemds/scuro/representations/resnet.py       | 137 ++++++++-------------
 .../systemds/scuro/representations/rowmax.py       |   9 +-
 .../python/systemds/scuro/representations/sum.py   |   2 +-
 .../python/systemds/scuro/representations/tfidf.py |  11 +-
 .../systemds/scuro/representations/unimodal.py     |   2 +-
 .../systemds/scuro/representations/word2vec.py     |  21 ++--
 src/main/python/tests/scuro/data_generator.py      |  19 ++-
 src/main/python/tests/scuro/test_data_loaders.py   |  82 ++++++------
 src/main/python/tests/scuro/test_dr_search.py      |  43 +++----
 37 files changed, 553 insertions(+), 676 deletions(-)

diff --git a/src/main/python/systemds/scuro/__init__.py 
b/src/main/python/systemds/scuro/__init__.py
index 84494a158e..53b68d430f 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -18,59 +18,60 @@
 # under the License.
 #
 # -------------------------------------------------------------
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.dataloader.json_loader import JSONLoader
 from systemds.scuro.representations.representation import Representation
 from systemds.scuro.representations.average import Average
 from systemds.scuro.representations.concatenation import Concatenation
-from systemds.scuro.representations.fusion import Fusion
 from systemds.scuro.representations.sum import Sum
 from systemds.scuro.representations.max import RowMax
 from systemds.scuro.representations.multiplication import Multiplication
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.resnet import ResNet
 from systemds.scuro.representations.bert import Bert
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from systemds.scuro.representations.lstm import LSTM
-from systemds.scuro.representations.representation_dataloader import (
-    NPY,
-    Pickle,
-    HDF5,
-    JSON,
-)
+from systemds.scuro.representations.bow import BoW
+from systemds.scuro.representations.glove import GloVe
+from systemds.scuro.representations.tfidf import TfIdf
+from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.models.model import Model
 from systemds.scuro.models.discrete_model import DiscreteModel
-from systemds.scuro.modality.aligned_modality import AlignedModality
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.video_modality import VideoModality
-from systemds.scuro.modality.text_modality import TextModality
 from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.aligner.dr_search import DRSearch
 from systemds.scuro.aligner.task import Task
 
 
 __all__ = [
+    "BaseLoader",
+    "AudioLoader",
+    "VideoLoader",
+    "TextLoader",
     "Representation",
     "Average",
     "Concatenation",
-    "Fusion",
     "Sum",
     "RowMax",
     "Multiplication",
     "MelSpectrogram",
     "ResNet",
     "Bert",
-    "UnimodalRepresentation",
     "LSTM",
-    "NPY",
-    "Pickle",
-    "HDF5",
-    "JSON",
+    "BoW",
+    "GloVe",
+    "TfIdf",
+    "W2V",
     "Model",
     "DiscreteModel",
-    "AlignedModality",
-    "AudioModality",
-    "VideoModality",
-    "TextModality",
     "Modality",
+    "UnimodalModality",
+    "TransformedModality",
+    "ModalityType",
     "DRSearch",
     "Task",
 ]
diff --git a/src/main/python/systemds/scuro/aligner/alignment.py 
b/src/main/python/systemds/scuro/aligner/alignment.py
index e341e1b76b..62f88a272b 100644
--- a/src/main/python/systemds/scuro/aligner/alignment.py
+++ b/src/main/python/systemds/scuro/aligner/alignment.py
@@ -19,7 +19,6 @@
 #
 # -------------------------------------------------------------
 from aligner.alignment_strategy import AlignmentStrategy
-from modality.aligned_modality import AlignedModality
 from modality.modality import Modality
 from modality.representation import Representation
 from aligner.similarity_measures import Measure
@@ -46,4 +45,4 @@ class Alignment:
         self.similarity_measure = similarity_measure
 
     def align_modalities(self) -> Modality:
-        return AlignedModality(Representation())
+        return Modality(Representation())
diff --git a/src/main/python/systemds/scuro/aligner/dr_search.py 
b/src/main/python/systemds/scuro/aligner/dr_search.py
index 24f3c3236f..b46139dff3 100644
--- a/src/main/python/systemds/scuro/aligner/dr_search.py
+++ b/src/main/python/systemds/scuro/aligner/dr_search.py
@@ -23,7 +23,6 @@ import random
 from typing import List
 
 from systemds.scuro.aligner.task import Task
-from systemds.scuro.modality.aligned_modality import AlignedModality
 from systemds.scuro.modality.modality import Modality
 from systemds.scuro.representations.representation import Representation
 
@@ -64,27 +63,25 @@ class DRSearch:
 
     def set_best_params(
         self,
-        modality_name: str,
         representation: Representation,
         scores: List[float],
         modality_names: List[str],
     ):
         """
         Updates the best parameters for given modalities, representation, and 
score
-        :param modality_name: The name of the aligned modality
         :param representation: The representation used to retrieve the current 
score
-        :param score: achieved score for the set of modalities and 
representation
+        :param scores: achieved train/test scores for the set of modalities 
and representation
         :param modality_names: List of modality names used in this setting
         :return:
         """
 
         # check if modality name is already in dictionary
-        if modality_name not in self.scores.keys():
+        if "_".join(modality_names) not in self.scores.keys():
             # if not add it to dictionary
-            self.scores[modality_name] = {}
+            self.scores["_".join(modality_names)] = {}
 
         # set score for representation
-        self.scores[modality_name][representation] = scores
+        self.scores["_".join(modality_names)][representation] = scores
 
         # compare current score with best score
         if scores[1] > self.best_score:
@@ -113,13 +110,12 @@ class DRSearch:
         modality_combination = random.choice(modalities)
         representation = random.choice(self.representations)
 
-        modality = AlignedModality(representation, list(modality_combination)) 
 # noqa
-        modality.combine()
+        modality = modality_combination[0].combine(
+            modality_combination[1:], representation
+        )
 
         scores = self.task.run(modality.data)
-        self.set_best_params(
-            modality.name, representation, scores, 
modality.get_modality_names()
-        )
+        self.set_best_params(representation, scores, 
modality.get_modality_names())
 
         return self.best_representation, self.best_score, self.best_modalities
 
@@ -133,14 +129,14 @@ class DRSearch:
         for M in range(1, len(self.modalities) + 1):
             for combination in itertools.combinations(self.modalities, M):
                 for representation in self.representations:
-                    modality = AlignedModality(
-                        representation, list(combination)
-                    )  # noqa
-                    modality.combine()
+                    modality = combination[0]
+                    if len(combination) > 1:
+                        modality = combination[0].combine(
+                            list(combination[1:]), representation
+                        )
 
                     scores = self.task.run(modality.data)
                     self.set_best_params(
-                        modality.name,
                         representation,
                         scores,
                         modality.get_modality_names(),
@@ -164,7 +160,8 @@ class DRSearch:
         for modality_name in self.best_modalities:
             used_modalities.append(get_modalities_by_name(modalities, 
modality_name))
 
-        modality = AlignedModality(self.best_representation, used_modalities)  
# noqa
-        modality.combine(self.task.train_indices)
+        modality = used_modalities[0].combine(
+            used_modalities[1:], self.best_representation
+        )
 
         return modality.data
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py 
b/src/main/python/systemds/scuro/dataloader/__init__.py
similarity index 63%
copy from src/main/python/systemds/scuro/representations/unimodal.py
copy to src/main/python/systemds/scuro/dataloader/__init__.py
index ccd6197765..e66abb4646 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/dataloader/__init__.py
@@ -18,21 +18,3 @@
 # under the License.
 #
 # -------------------------------------------------------------
-from systemds.scuro.representations.representation import Representation
-
-
-class UnimodalRepresentation(Representation):
-    def __init__(self, name):
-        """
-        Parent class for all unimodal representation types
-        :param name: name of the representation
-        """
-        super().__init__(name)
-
-    def parse_all(self, file_path, indices):
-        raise f"Not implemented for {self.name}"
-
-
-class PixelRepresentation(UnimodalRepresentation):
-    def __init__(self):
-        super().__init__("Pixel")
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py 
b/src/main/python/systemds/scuro/dataloader/audio_loader.py
similarity index 64%
copy from src/main/python/systemds/scuro/representations/unimodal.py
copy to src/main/python/systemds/scuro/dataloader/audio_loader.py
index ccd6197765..f85b1b80fa 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -18,21 +18,22 @@
 # under the License.
 #
 # -------------------------------------------------------------
-from systemds.scuro.representations.representation import Representation
+from typing import List, Optional
 
+import librosa
+from systemds.scuro.dataloader.base_loader import BaseLoader
 
-class UnimodalRepresentation(Representation):
-    def __init__(self, name):
-        """
-        Parent class for all unimodal representation types
-        :param name: name of the representation
-        """
-        super().__init__(name)
 
-    def parse_all(self, file_path, indices):
-        raise f"Not implemented for {self.name}"
+class AudioLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        chunk_size: Optional[int] = None,
+    ):
+        super().__init__(source_path, indices, chunk_size)
 
-
-class PixelRepresentation(UnimodalRepresentation):
-    def __init__(self):
-        super().__init__("Pixel")
+    def extract(self, file: str):
+        self.file_sanity_check(file)
+        audio, sr = librosa.load(file)
+        self.data.append(audio)
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py 
b/src/main/python/systemds/scuro/dataloader/base_loader.py
new file mode 100644
index 0000000000..2ef60677c6
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -0,0 +1,92 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import os
+from abc import ABC, abstractmethod
+from typing import List, Optional, Union
+
+
+class BaseLoader(ABC):
+    def __init__(
+        self, source_path: str, indices: List[str], chunk_size: Optional[int] 
= None
+    ):
+        """
+        Base class to load raw data for a given list of indices and stores 
them in the data object
+        :param source_path: The location where the raw data lies
+        :param indices: A list of indices as strings that are corresponding to 
the file names
+        :param chunk_size: An optional argument to load the data in chunks 
instead of all at once
+        (otherwise please provide your own Dataloader that knows about the 
file name convention)
+        """
+        self.data = []
+        self.source_path = source_path
+        self.indices = indices
+        self.chunk_size = chunk_size
+        self.next_chunk = 0
+
+        if self.chunk_size:
+            self.num_chunks = int(len(self.indices) / self.chunk_size)
+
+    def load(self):
+        """
+        Takes care of loading the raw data either chunk wise (if chunk size is 
defined) or all at once
+        """
+        if self.chunk_size:
+            return self._load_next_chunk()
+
+        return self._load(self.indices)
+
+    def _load_next_chunk(self):
+        """
+        Loads the next chunk of data
+        """
+        self.data = []
+        next_chunk_indices = self.indices[
+            self.next_chunk * self.chunk_size : (self.next_chunk + 1) * 
self.chunk_size
+        ]
+        self.next_chunk += 1
+        return self._load(next_chunk_indices)
+
+    def _load(self, indices: List[str]):
+        is_dir = True if os.path.isdir(self.source_path) else False
+
+        if is_dir:
+            _, ext = os.path.splitext(os.listdir(self.source_path)[0])
+            for index in indices:
+                self.extract(self.source_path + index + ext)
+        else:
+            self.extract(self.source_path, indices)
+
+        return self.data
+
+    @abstractmethod
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = 
None):
+        pass
+
+    def file_sanity_check(self, file):
+        """
+        Checks if the file can be found and is not empty
+        """
+        try:
+            file_size = os.path.getsize(file)
+        except:
+            raise (f"Error: File {0} not found!".format(file))
+
+        if file_size == 0:
+            raise ("File {0} is empty".format(file))
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py 
b/src/main/python/systemds/scuro/dataloader/json_loader.py
similarity index 59%
copy from src/main/python/systemds/scuro/representations/unimodal.py
copy to src/main/python/systemds/scuro/dataloader/json_loader.py
index ccd6197765..c4e3b95611 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -18,21 +18,26 @@
 # under the License.
 #
 # -------------------------------------------------------------
-from systemds.scuro.representations.representation import Representation
+import json
 
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from typing import Optional, List
 
-class UnimodalRepresentation(Representation):
-    def __init__(self, name):
-        """
-        Parent class for all unimodal representation types
-        :param name: name of the representation
-        """
-        super().__init__(name)
 
-    def parse_all(self, file_path, indices):
-        raise f"Not implemented for {self.name}"
+class JSONLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        field: str,
+        chunk_size: Optional[int] = None,
+    ):
+        super().__init__(source_path, indices, chunk_size)
+        self.field = field
 
-
-class PixelRepresentation(UnimodalRepresentation):
-    def __init__(self):
-        super().__init__("Pixel")
+    def extract(self, file: str, indices: List[str]):
+        self.file_sanity_check(file)
+        with open(file) as f:
+            json_file = json.load(f)
+            for idx in indices:
+                self.data.append(json_file[idx][self.field])
diff --git a/src/main/python/systemds/scuro/representations/sum.py 
b/src/main/python/systemds/scuro/dataloader/text_loader.py
similarity index 55%
copy from src/main/python/systemds/scuro/representations/sum.py
copy to src/main/python/systemds/scuro/dataloader/text_loader.py
index bfb19d4f7d..f614472bce 100644
--- a/src/main/python/systemds/scuro/representations/sum.py
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -18,31 +18,27 @@
 # under the License.
 #
 # -------------------------------------------------------------
-
-from typing import List
-
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.utils import pad_sequences
-
-from systemds.scuro.representations.fusion import Fusion
-
-
-class Sum(Fusion):
-    def __init__(self):
-        """
-        Combines modalities using colum-wise sum
-        """
-        super().__init__("Sum")
-
-    def fuse(self, modalities: List[Modality]):
-        max_emb_size = self.get_max_embedding_size(modalities)
-
-        data = pad_sequences(modalities[0].data, maxlen=max_emb_size, 
dtype="float32")
-
-        for m in range(1, len(modalities)):
-            data += pad_sequences(
-                modalities[m].data, maxlen=max_emb_size, dtype="float32"
-            )
-
-        return data
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from typing import Optional, Pattern, List
+import re
+
+
+class TextLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        chunk_size: Optional[int] = None,
+        prefix: Optional[Pattern[str]] = None,
+    ):
+        super().__init__(source_path, indices, chunk_size)
+        self.prefix = prefix
+
+    def extract(self, file: str):
+        self.file_sanity_check(file)
+        with open(file) as text_file:
+            for i, line in enumerate(text_file):
+                if self.prefix:
+                    line = re.sub(self.prefix, "", line)
+                line = line.replace("\n", "")
+                self.data.append(line)
diff --git a/src/main/python/systemds/scuro/representations/sum.py 
b/src/main/python/systemds/scuro/dataloader/video_loader.py
similarity index 54%
copy from src/main/python/systemds/scuro/representations/sum.py
copy to src/main/python/systemds/scuro/dataloader/video_loader.py
index bfb19d4f7d..6da20b3475 100644
--- a/src/main/python/systemds/scuro/representations/sum.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -18,31 +18,35 @@
 # under the License.
 #
 # -------------------------------------------------------------
+from typing import List, Optional
 
-from typing import List
+import numpy as np
 
+from systemds.scuro.dataloader.base_loader import BaseLoader
+import cv2
 
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.utils import pad_sequences
 
-from systemds.scuro.representations.fusion import Fusion
+class VideoLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        chunk_size: Optional[int] = None,
+    ):
+        super().__init__(source_path, indices, chunk_size)
 
+    def extract(self, file: str):
+        self.file_sanity_check(file)
+        cap = cv2.VideoCapture(file)
+        frames = []
+        while cap.isOpened():
+            ret, frame = cap.read()
 
-class Sum(Fusion):
-    def __init__(self):
-        """
-        Combines modalities using colum-wise sum
-        """
-        super().__init__("Sum")
+            if not ret:
+                break
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame = frame.astype(np.float32) / 255.0
 
-    def fuse(self, modalities: List[Modality]):
-        max_emb_size = self.get_max_embedding_size(modalities)
+            frames.append(frame)
 
-        data = pad_sequences(modalities[0].data, maxlen=max_emb_size, 
dtype="float32")
-
-        for m in range(1, len(modalities)):
-            data += pad_sequences(
-                modalities[m].data, maxlen=max_emb_size, dtype="float32"
-            )
-
-        return data
+        self.data.append(frames)
diff --git a/src/main/python/systemds/scuro/main.py 
b/src/main/python/systemds/scuro/main.py
index f28b271b97..8a51e098cc 100644
--- a/src/main/python/systemds/scuro/main.py
+++ b/src/main/python/systemds/scuro/main.py
@@ -18,21 +18,20 @@
 # under the License.
 #
 # -------------------------------------------------------------
-import collections
-import json
-from datetime import datetime
-
+from systemds.scuro.representations.bert import Bert
+from systemds.scuro.representations.resnet import ResNet
+from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.average import Average
 from systemds.scuro.representations.concatenation import Concatenation
-from systemds.scuro.modality.aligned_modality import AlignedModality
-from systemds.scuro.modality.text_modality import TextModality
-from systemds.scuro.modality.video_modality import VideoModality
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.representations.unimodal import Pickle, JSON, HDF5, NPY
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.models.discrete_model import DiscreteModel
 from systemds.scuro.aligner.task import Task
 from systemds.scuro.aligner.dr_search import DRSearch
 
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
+
 
 class CustomTask(Task):
     def __init__(self, model, labels, train_indices, val_indices):
@@ -49,18 +48,32 @@ labels = []
 train_indices = []
 val_indices = []
 
+all_indices = []
+
 video_path = ""
 audio_path = ""
 text_path = ""
 
+
+# Define dataloaders
+video_data_loader = VideoLoader(video_path, all_indices, chunk_size=10)
+text_data_loader = TextLoader(text_path, all_indices)
+audio_data_loader = AudioLoader(audio_path, all_indices)
+
 # Load modalities (audio, video, text)
-video = VideoModality(video_path, HDF5(), train_indices)
-audio = AudioModality(audio_path, Pickle(), train_indices)
-text = TextModality(text_path, NPY(), train_indices)
+video = UnimodalModality(video_data_loader, "VIDEO")
+audio = UnimodalModality(audio_data_loader, "AUDIO")
+text = UnimodalModality(text_data_loader, "TEXT")
+
+# Define unimodal representations
+r_v = ResNet()
+r_a = MelSpectrogram()
+r_t = Bert()
 
-video.read_all()
-audio.read_all()
-text.read_all()
+# Transform raw unimodal data
+video.apply_representation(r_v)
+audio.apply_representation(r_a)
+text.apply_representation(r_t)
 
 modalities = [text, audio, video]
 
diff --git a/src/main/python/systemds/scuro/modality/aligned_modality.py 
b/src/main/python/systemds/scuro/modality/aligned_modality.py
deleted file mode 100644
index 839b9d296f..0000000000
--- a/src/main/python/systemds/scuro/modality/aligned_modality.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-from typing import List
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.fusion import Fusion
-
-
-class AlignedModality(Modality):
-    def __init__(self, representation: Fusion, modalities: List[Modality]):
-        """
-        Defines the modality that is created during the fusion process
-        :param representation: The representation for the aligned modality
-        :param modalities: List of modalities to be combined
-        """
-        name = ""
-        for modality in modalities:
-            name += modality.name
-        super().__init__(representation, modality_name=name)
-        self.modalities = modalities
-
-    def combine(self):
-        """
-        Initiates the call to fuse the given modalities depending on the 
Fusion type
-        """
-        self.data = self.representation.fuse(self.modalities)  # noqa
-
-    def get_modality_names(self):
-        names = []
-        for modality in self.modalities:
-            names.append(modality.name)
-
-        return names
diff --git a/src/main/python/systemds/scuro/modality/audio_modality.py 
b/src/main/python/systemds/scuro/modality/audio_modality.py
deleted file mode 100644
index ba84962226..0000000000
--- a/src/main/python/systemds/scuro/modality/audio_modality.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import os
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
-
-
-class AudioModality(Modality):
-    def __init__(
-        self,
-        file_path: str,
-        representation: UnimodalRepresentation,
-        train_indices=None,
-        start_index: int = 0,
-    ):
-        """
-        Creates an audio modality
-        :param file_path: path to file where the audio embeddings are stored
-        :param representation: Unimodal representation that indicates how to 
extract the data from the file
-        """
-        super().__init__(representation, start_index, "Audio", train_indices)
-        self.file_path = file_path
-
-    def file_sanity_check(self):
-        """
-        Checks if the file can be found is not empty
-        """
-        try:
-            file_size = os.path.getsize(self.file_path)
-        except:
-            raise (f"Error: File {0} not found!".format(self.file_path))
-
-        if file_size == 0:
-            raise ("File {0} is empty".format(self.file_path))
-
-    def read_chunk(self):
-        pass
-
-    def read_all(self, indices=None):
-        self.data = self.representation.parse_all(
-            self.file_path, indices=indices
-        )  # noqa
diff --git a/src/main/python/systemds/scuro/modality/modality.py 
b/src/main/python/systemds/scuro/modality/modality.py
index a899576d5b..9a3d1b148d 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -18,41 +18,27 @@
 # under the License.
 #
 # -------------------------------------------------------------
+from typing import List
 
-from systemds.scuro.representations.representation import Representation
+from systemds.scuro.modality.type import ModalityType
 
 
 class Modality:
 
-    def __init__(
-        self,
-        representation: Representation,
-        start_index: int = 0,
-        modality_name="",
-        train_indices=None,
-    ):
+    def __init__(self, modality_type: ModalityType):
         """
-        Parent class of the different Modalities
-        :param representation: Specifies how the data should be represented 
for a specific modality
-        :param start_index: Defines the first index used for the alignment
-        :param modality_name: Name of the modality
-        :param train_indices: List of indices used for train-test split
+        Parent class of the different Modalities (unimodal & multimodal)
+        :param modality_type: Type of the modality
         """
-        self.representation = representation
-        self.start_index = start_index
-        self.name = modality_name
+        self.type = modality_type
         self.data = None
-        self.train_indices = train_indices
+        self.data_type = None
+        self.cost = None
+        self.shape = None
+        self.schema = {}
 
-    def read_chunk(self):
+    def get_modality_names(self) -> List[str]:
         """
-        Extracts a data chunk of the modality according to the window size 
specified in params
+        Extracts the individual unimodal modalities for a given transformed 
modality.
         """
-        raise NotImplementedError
-
-    def read_all(self, indices):
-        """
-        Implemented for every unique modality to read all samples from a 
specified format
-        :param indices: List of indices to be read
-        """
-        pass
+        return [modality.name for modality in ModalityType if modality in 
self.type]
diff --git a/src/main/python/systemds/scuro/modality/text_modality.py 
b/src/main/python/systemds/scuro/modality/text_modality.py
deleted file mode 100644
index c636de7167..0000000000
--- a/src/main/python/systemds/scuro/modality/text_modality.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import os
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
-
-
-class TextModality(Modality):
-    def __init__(
-        self,
-        file_path: str,
-        representation: UnimodalRepresentation,
-        train_indices=None,
-        start_index: int = 0,
-    ):
-        """
-        Creates a text modality
-        :param file_path: path to file(s) where the text data is stored
-        :param representation: Unimodal representation that indicates how to 
extract the data from the file
-        """
-        super().__init__(representation, start_index, "Text", train_indices)
-        self.file_path = file_path
-
-    def file_sanity_check(self):
-        """
-        Checks if the file can be found is not empty
-        """
-        try:
-            file_size = os.path.getsize(self.file_path)
-        except:
-            raise (f"Error: File {0} not found!".format(self.file_path))
-
-        if file_size == 0:
-            raise ("File {0} is empty".format(self.file_path))
-
-    def read_chunk(self):
-        pass
-
-    def read_all(self, indices=None):
-        self.data = self.representation.parse_all(
-            self.file_path, indices=indices
-        )  # noqa
diff --git a/src/main/python/systemds/scuro/modality/transformed.py 
b/src/main/python/systemds/scuro/modality/transformed.py
new file mode 100644
index 0000000000..61c327e469
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -0,0 +1,52 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from functools import reduce
+from operator import or_
+
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.type import ModalityType
+
+
+class TransformedModality(Modality):
+
+    def __init__(self, modality_type: ModalityType, transformation):
+        """
+        Modality created by applying a transformation (representation) to one or more modalities
+        :param modality_type: Type of the original modality(ies)
+        :param transformation: Representation to be applied on the modality
+        """
+        super().__init__(modality_type)
+        self.transformation = transformation
+
+    def combine(self, other, fusion_method):
+        """
+        Combines two or more modalities with each other using a dedicated 
fusion method
+        :param other: List of modalities to be combined with this one
+        :param fusion_method: The fusion method to be used to combine 
modalities
+        """
+        fused_modality = TransformedModality(
+            reduce(or_, (o.type for o in other), self.type), fusion_method
+        )
+        modalities = [self]
+        modalities.extend(other)
+        fused_modality.data = fusion_method.transform(modalities)
+
+        return fused_modality
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py 
b/src/main/python/systemds/scuro/modality/type.py
similarity index 64%
copy from src/main/python/systemds/scuro/representations/unimodal.py
copy to src/main/python/systemds/scuro/modality/type.py
index ccd6197765..c451eea6f1 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -18,21 +18,14 @@
 # under the License.
 #
 # -------------------------------------------------------------
-from systemds.scuro.representations.representation import Representation
+from enum import Enum, Flag, auto
 
 
-class UnimodalRepresentation(Representation):
-    def __init__(self, name):
-        """
-        Parent class for all unimodal representation types
-        :param name: name of the representation
-        """
-        super().__init__(name)
+class ModalityType(Flag):
+    TEXT = auto()
+    AUDIO = auto()
+    VIDEO = auto()
 
-    def parse_all(self, file_path, indices):
-        raise f"Not implemented for {self.name}"
-
-
-class PixelRepresentation(UnimodalRepresentation):
-    def __init__(self):
-        super().__init__("Pixel")
+    # def __init__(self, value, name):
+    #     self._value_ = value
+    #     self.name = name
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py 
b/src/main/python/systemds/scuro/modality/unimodal_modality.py
new file mode 100644
index 0000000000..976d4194d4
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -0,0 +1,59 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.modality.type import ModalityType
+
+
+class UnimodalModality(Modality):
+
+    def __init__(self, data_loader: BaseLoader, modality_type: ModalityType):
+        """
+        This class represents a unimodal modality.
+        :param data_loader: Defines how the raw data should be loaded
+        :param modality_type: Type of the modality
+        """
+        super().__init__(modality_type)
+        self.data_loader = data_loader
+
+    def extract_raw_data(self):
+        """
+        Uses the data loader to read the raw data from a specified location
+        and stores it in the modality's data attribute.
+        TODO: schema
+        """
+        self.data = self.data_loader.load()
+
+    def apply_representation(self, representation):
+        new_modality = TransformedModality(self.type, representation)
+        new_modality.data = []
+
+        if self.data_loader.chunk_size:
+            while self.data_loader.next_chunk < self.data_loader.num_chunks:
+                self.extract_raw_data()
+                new_modality.data.extend(representation.transform(self.data))
+        else:
+            if not self.data:
+                self.extract_raw_data()
+            new_modality.data = representation.transform(self.data)
+
+        return new_modality
diff --git a/src/main/python/systemds/scuro/modality/video_modality.py 
b/src/main/python/systemds/scuro/modality/video_modality.py
deleted file mode 100644
index a6cedf6c86..0000000000
--- a/src/main/python/systemds/scuro/modality/video_modality.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import os
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
-
-
-class VideoModality(Modality):
-    def __init__(
-        self,
-        file_path: str,
-        representation: UnimodalRepresentation,
-        train_indices=None,
-        start_index: int = 0,
-    ):
-        """
-        Creates a video modality
-        :param file_path: path to file where the video embeddings (for now) 
are stored
-        :param representation: Unimodal representation that indicates how to 
extract the data from the file
-        """
-        super().__init__(representation, start_index, "Video", train_indices)
-        self.file_path = file_path
-
-    def file_sanity_check(self):
-        """
-        Checks if the file can be found is not empty
-        """
-        try:
-            file_size = os.path.getsize(self.file_path)
-        except:
-            raise (f"Error: File {0} not found!".format(self.file_path))
-
-        if file_size == 0:
-            raise ("File {0} is empty".format(self.file_path))
-
-    def read_chunk(self):
-        pass
-
-    def read_all(self, indices=None):
-        self.data = self.representation.parse_all(
-            self.file_path, indices=indices
-        )  # noqa
diff --git a/src/main/python/systemds/scuro/representations/average.py 
b/src/main/python/systemds/scuro/representations/average.py
index d10778f113..db44050e9e 100644
--- a/src/main/python/systemds/scuro/representations/average.py
+++ b/src/main/python/systemds/scuro/representations/average.py
@@ -36,7 +36,7 @@ class Average(Fusion):
         """
         super().__init__("Average")
 
-    def fuse(self, modalities: List[Modality]):
+    def transform(self, modalities: List[Modality]):
         max_emb_size = self.get_max_embedding_size(modalities)
 
         padded_modalities = []
diff --git a/src/main/python/systemds/scuro/representations/bert.py 
b/src/main/python/systemds/scuro/representations/bert.py
index 85d0b1ad65..0fcf1e8d28 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -24,7 +24,7 @@ import numpy as np
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 import torch
 from transformers import BertTokenizer, BertModel
-from systemds.scuro.representations.utils import read_data_from_file, 
save_embeddings
+from systemds.scuro.representations.utils import save_embeddings
 
 
 class Bert(UnimodalRepresentation):
@@ -34,8 +34,7 @@ class Bert(UnimodalRepresentation):
         self.avg_layers = avg_layers
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
-        data = read_data_from_file(filepath, indices)
+    def transform(self, data):
 
         model_name = "bert-base-uncased"
         tokenizer = BertTokenizer.from_pretrained(
@@ -47,13 +46,10 @@ class Bert(UnimodalRepresentation):
         else:
             model = BertModel.from_pretrained(model_name)
 
-        embeddings = self.create_embeddings(list(data.values()), model, 
tokenizer)
+        embeddings = self.create_embeddings(data, model, tokenizer)
 
         if self.output_file is not None:
-            data = {}
-            for i in range(0, embeddings.shape[0]):
-                data[indices[i]] = embeddings[i]
-            save_embeddings(data, self.output_file)
+            save_embeddings(embeddings, self.output_file)
 
         return embeddings
 
@@ -75,8 +71,5 @@ class Bert(UnimodalRepresentation):
                 cls_embedding = outputs.last_hidden_state[:, 0, 
:].squeeze().numpy()
             embeddings.append(cls_embedding)
 
-        if self.output_file is not None:
-            save_embeddings(embeddings, self.output_file)
-
         embeddings = np.array(embeddings)
         return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1]))
diff --git a/src/main/python/systemds/scuro/representations/bow.py 
b/src/main/python/systemds/scuro/representations/bow.py
index dc5013b354..bd54654a5c 100644
--- a/src/main/python/systemds/scuro/representations/bow.py
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -19,11 +19,10 @@
 #
 # -------------------------------------------------------------
 
-import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
-from systemds.scuro.representations.utils import read_data_from_file, 
save_embeddings
+from systemds.scuro.representations.utils import save_embeddings
 
 
 class BoW(UnimodalRepresentation):
@@ -33,19 +32,14 @@ class BoW(UnimodalRepresentation):
         self.min_df = min_df
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
+    def transform(self, data):
         vectorizer = CountVectorizer(
             ngram_range=(1, self.ngram_range), min_df=self.min_df
         )
 
-        segments = read_data_from_file(filepath, indices)
-        X = vectorizer.fit_transform(segments.values())
-        X = X.toarray()
+        X = vectorizer.fit_transform(data).toarray()
 
         if self.output_file is not None:
-            df = pd.DataFrame(X)
-            df.index = segments.keys()
-
-            save_embeddings(df, self.output_file)
+            save_embeddings(X, self.output_file)
 
         return X
diff --git a/src/main/python/systemds/scuro/representations/concatenation.py 
b/src/main/python/systemds/scuro/representations/concatenation.py
index 7694fa6897..fd9293d399 100644
--- a/src/main/python/systemds/scuro/representations/concatenation.py
+++ b/src/main/python/systemds/scuro/representations/concatenation.py
@@ -37,7 +37,7 @@ class Concatenation(Fusion):
         super().__init__("Concatenation")
         self.padding = padding
 
-    def fuse(self, modalities: List[Modality]):
+    def transform(self, modalities: List[Modality]):
         if len(modalities) == 1:
             return np.array(modalities[0].data)
 
diff --git a/src/main/python/systemds/scuro/representations/fusion.py 
b/src/main/python/systemds/scuro/representations/fusion.py
index e84e59f666..623979dd05 100644
--- a/src/main/python/systemds/scuro/representations/fusion.py
+++ b/src/main/python/systemds/scuro/representations/fusion.py
@@ -32,7 +32,7 @@ class Fusion(Representation):
         """
         super().__init__(name)
 
-    def fuse(self, modalities: List[Modality]):
+    def transform(self, modalities: List[Modality]):
         """
         Implemented for every child class and creates a fused representation 
out of
         multiple modalities
diff --git a/src/main/python/systemds/scuro/representations/glove.py 
b/src/main/python/systemds/scuro/representations/glove.py
index 840360540e..cf13c717d2 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -18,7 +18,6 @@
 # under the License.
 #
 # -------------------------------------------------------------
-import nltk
 import numpy as np
 from nltk import word_tokenize
 
@@ -43,24 +42,24 @@ class GloVe(UnimodalRepresentation):
         self.glove_path = glove_path
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
+    def transform(self, data):
         glove_embeddings = load_glove_embeddings(self.glove_path)
-        segments = read_data_from_file(filepath, indices)
 
-        embeddings = {}
-        for k, v in segments.items():
-            tokens = word_tokenize(v.lower())
-            embeddings[k] = np.mean(
-                [
-                    glove_embeddings[token]
-                    for token in tokens
-                    if token in glove_embeddings
-                ],
-                axis=0,
+        embeddings = []
+        for sentences in data:
+            tokens = word_tokenize(sentences.lower())
+            embeddings.append(
+                np.mean(
+                    [
+                        glove_embeddings[token]
+                        for token in tokens
+                        if token in glove_embeddings
+                    ],
+                    axis=0,
+                )
             )
 
         if self.output_file is not None:
-            save_embeddings(embeddings, self.output_file)
+            save_embeddings(np.array(embeddings), self.output_file)
 
-        embeddings = np.array(list(embeddings.values()))
-        return embeddings
+        return np.array(embeddings)
diff --git a/src/main/python/systemds/scuro/representations/lstm.py 
b/src/main/python/systemds/scuro/representations/lstm.py
index 3687ff6514..649b81117b 100644
--- a/src/main/python/systemds/scuro/representations/lstm.py
+++ b/src/main/python/systemds/scuro/representations/lstm.py
@@ -40,17 +40,17 @@ class LSTM(Fusion):
         self.dropout_rate = dropout_rate
         self.unimodal_embeddings = {}
 
-    def fuse(self, modalities: List[Modality], train_indices=None):
+    def transform(self, modalities: List[Modality]):
         size = len(modalities[0].data)
 
         result = np.zeros((size, 0))
 
         for modality in modalities:
-            if modality.name in self.unimodal_embeddings.keys():
-                out = self.unimodal_embeddings.get(modality.name)
+            if modality.type in self.unimodal_embeddings.keys():
+                out = self.unimodal_embeddings.get(modality.type)
             else:
                 out = self.run_lstm(modality.data)
-                self.unimodal_embeddings[modality.name] = out
+                self.unimodal_embeddings[modality.type] = out
 
             result = np.concatenate([result, out], axis=-1)
 
diff --git a/src/main/python/systemds/scuro/representations/max.py 
b/src/main/python/systemds/scuro/representations/max.py
index 2e1e864477..194b20801e 100644
--- a/src/main/python/systemds/scuro/representations/max.py
+++ b/src/main/python/systemds/scuro/representations/max.py
@@ -38,7 +38,7 @@ class RowMax(Fusion):
         super().__init__("RowMax")
         self.split = split
 
-    def fuse(
+    def transform(
         self,
         modalities: List[Modality],
     ):
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py 
b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 27aba8b997..57a7fab83e 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -19,7 +19,6 @@
 #
 # -------------------------------------------------------------
 
-import os
 import pickle
 
 import librosa
@@ -35,19 +34,15 @@ class MelSpectrogram(UnimodalRepresentation):
         self.avg = avg
         self.output_file = output_file
 
-    def parse_all(self, file_path, indices, get_sequences=False):
+    def transform(self, data):
         result = []
         max_length = 0
-        if os.path.isdir(file_path):
-            for filename in os.listdir(file_path):
-                f = os.path.join(file_path, filename)
-                if os.path.isfile(f):
-                    y, sr = librosa.load(f)
-                    S = librosa.feature.melspectrogram(y=y, sr=sr)
-                    S_dB = librosa.power_to_db(S, ref=np.max)
-                    if S_dB.shape[-1] > max_length:
-                        max_length = S_dB.shape[-1]
-                    result.append(S_dB)
+        for sample in data:
+            S = librosa.feature.melspectrogram(y=sample)
+            S_dB = librosa.power_to_db(S, ref=np.max)
+            if S_dB.shape[-1] > max_length:
+                max_length = S_dB.shape[-1]
+            result.append(S_dB)
 
         r = []
         for elem in result:
@@ -57,9 +52,9 @@ class MelSpectrogram(UnimodalRepresentation):
         np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), 
axis=1)
 
         if self.output_file is not None:
-            data = {}
+            data = []
             for i in range(0, np_array_r.shape[0]):
-                data[indices[i]] = np_array_r[i]
+                data.append(np_array_r[i])
             with open(self.output_file, "wb") as file:
                 pickle.dump(data, file)
 
diff --git a/src/main/python/systemds/scuro/representations/multiplication.py 
b/src/main/python/systemds/scuro/representations/multiplication.py
index 18f34bae6f..2934fe5b3c 100644
--- a/src/main/python/systemds/scuro/representations/multiplication.py
+++ b/src/main/python/systemds/scuro/representations/multiplication.py
@@ -36,7 +36,7 @@ class Multiplication(Fusion):
         """
         super().__init__("Multiplication")
 
-    def fuse(self, modalities: List[Modality], train_indices=None):
+    def transform(self, modalities: List[Modality], train_indices=None):
         max_emb_size = self.get_max_embedding_size(modalities)
 
         data = pad_sequences(modalities[0].data, maxlen=max_emb_size, 
dtype="float32")
diff --git a/src/main/python/systemds/scuro/representations/resnet.py 
b/src/main/python/systemds/scuro/representations/resnet.py
index 75c921184b..1c1bfa1d5e 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -25,8 +25,6 @@ import h5py
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from typing import Callable, Dict, Tuple, Any
 import torch.utils.data
-import os
-import cv2
 import torch
 import torchvision.models as models
 import torchvision.transforms as transforms
@@ -36,22 +34,25 @@ DEVICE = "cpu"
 
 
 class ResNet(UnimodalRepresentation):
-    def __init__(self, output_file=None):
+    def __init__(self, layer="avgpool", output_file=None):
         super().__init__("ResNet")
 
         self.output_file = output_file
+        self.layer_name = layer
 
-    def parse_all(self, file_path, indices, get_sequences=False):
-        resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT)
+    def transform(self, data):
+
+        resnet = 
models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE)
         resnet.eval()
 
         for param in resnet.parameters():
             param.requires_grad = False
 
-        transform = transforms.Compose(
+        t = transforms.Compose(
             [
                 transforms.ToPILImage(),
-                transforms.Resize((224, 224)),
+                transforms.Resize(256),
+                transforms.CenterCrop(224),
                 transforms.ToTensor(),
                 transforms.Normalize(
                     mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
@@ -59,7 +60,7 @@ class ResNet(UnimodalRepresentation):
             ]
         )
 
-        dataset = ResNetDataset(transform=transform, 
video_folder_path=file_path)
+        dataset = ResNetDataset(data, t)
         embeddings = {}
 
         class Identity(torch.nn.Module):
@@ -70,108 +71,72 @@ class ResNet(UnimodalRepresentation):
 
         res5c_output = None
 
-        def avg_pool_hook(
-            _module: torch.nn.Module, input_: Tuple[torch.Tensor], _output: Any
-        ) -> None:
-            nonlocal res5c_output
-            res5c_output = input_[0]
+        def get_features(name_):
+            def hook(
+                _module: torch.nn.Module, input_: Tuple[torch.Tensor], output: 
Any
+            ):
+                nonlocal res5c_output
+                res5c_output = output
+
+            return hook
 
-        resnet.avgpool.register_forward_hook(avg_pool_hook)
+        if self.layer_name:
+            for name, layer in resnet.named_modules():
+                if name == self.layer_name:
+                    layer.register_forward_hook(get_features(name))
+                    break
 
         for instance in torch.utils.data.DataLoader(dataset):
             video_id = instance["id"][0]
             frames = instance["frames"][0].to(DEVICE)
-            embeddings[video_id] = torch.empty((len(frames), 2048))
-            batch_size = 32
+            embeddings[video_id] = []
+            batch_size = 64
+
             for start_index in range(0, len(frames), batch_size):
                 end_index = min(start_index + batch_size, len(frames))
                 frame_ids_range = range(start_index, end_index)
                 frame_batch = frames[frame_ids_range]
 
-                avg_pool_value = resnet(frame_batch)
+                _ = resnet(frame_batch)
+                values = res5c_output
+
+                if self.layer_name == "avgpool" or self.layer_name == 
"maxpool":
+                    embeddings[video_id].extend(
+                        torch.flatten(values, 1).detach().cpu().numpy()
+                    )
 
-                embeddings[video_id][frame_ids_range] = 
avg_pool_value.to(DEVICE)
+                else:
+                    pooled = torch.nn.functional.adaptive_avg_pool2d(values, 
(1, 1))
+
+                    embeddings[video_id].extend(
+                        torch.flatten(pooled, 1).detach().cpu().numpy()
+                    )
 
         if self.output_file is not None:
             with h5py.File(self.output_file, "w") as hdf:
                 for key, value in embeddings.items():
                     hdf.create_dataset(key, data=value)
 
-        emb = np.zeros((len(indices), 2048), dtype="float32")
-        if indices is not None:
-            for i in indices:
-                emb[i] = embeddings.get(str(i)).mean(dim=0).numpy()
-        else:
-            for i, key in enumerate(embeddings.keys()):
-                emb[i] = embeddings.get(key).mean(dim=0).numpy()
-
-        return emb
-
-    @staticmethod
-    def extract_features_from_video(video_path, model, transform):
-        cap = cv2.VideoCapture(video_path)
-        features = []
-        count = 0
-        success, frame = cap.read()
-
-        while success:
-            success, frame = cap.read()
-            transformed_frame = transform(frame).unsqueeze(0)
+        emb = []
 
-            with torch.no_grad():
-                feature_vector = model(transformed_frame)
-                feature_vector = feature_vector.view(-1).numpy()
+        for video in embeddings.values():
+            emb.append(np.array(video).mean(axis=0).tolist())
 
-            features.append(feature_vector)
-
-            count += 1
-
-        cap.release()
-        return features, count
+        return np.array(emb)
 
 
 class ResNetDataset(torch.utils.data.Dataset):
-    def __init__(self, video_folder_path: str, transform: Callable = None):
-        self.video_folder_path = video_folder_path
-        self.transform = transform
-        self.video_ids = []
-        video_files = [
-            f
-            for f in os.listdir(self.video_folder_path)
-            if f.lower().endswith((".mp4", ".avi", ".mov", ".mkv"))
-        ]
-        self.file_extension = video_files[0].split(".")[-1]
-
-        for video in video_files:
-            video_id, _ = video.split("/")[-1].split(".")
-            self.video_ids.append(video_id)
-
-        self.frame_count_by_video_id = {video_id: 0 for video_id in 
self.video_ids}
+    def __init__(self, data: str, tf: Callable = None):
+        self.data = data
+        self.tf = tf
 
     def __getitem__(self, index) -> Dict[str, object]:
-        video_id = self.video_ids[index]
-        video_path = self.video_folder_path + "/" + video_id + "." + 
self.file_extension
-
-        frames = None
-        count = 0
-
-        cap = cv2.VideoCapture(video_path)
-
-        success, frame = cap.read()
-
-        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        self.frame_count_by_video_id[video_id] = num_frames
-        if frames is None and success:
-            frames = torch.empty((num_frames, 3, 224, 224))
-
-        while success:
-            frame = self.transform(frame)
-            frames[count] = frame  # noqa
-            success, frame = cap.read()
-            count += 1
+        video = self.data[index]
+        frames = torch.empty((len(video), 3, 224, 224))
 
-        cap.release()
-        return {"id": video_id, "frames": frames}
+        for i, frame in enumerate(video):
+            frames[i] = self.tf(frame)
+        return {"id": index, "frames": frames}
 
     def __len__(self) -> int:
-        return len(self.video_ids)
+        return len(self.data)
diff --git a/src/main/python/systemds/scuro/representations/rowmax.py 
b/src/main/python/systemds/scuro/representations/rowmax.py
index 0dc201e2ee..3152782026 100644
--- a/src/main/python/systemds/scuro/representations/rowmax.py
+++ b/src/main/python/systemds/scuro/representations/rowmax.py
@@ -23,10 +23,10 @@ from typing import List
 
 import numpy as np
 
-from modality.modality import Modality
+from systemds.scuro.modality.modality import Modality
 from systemds.scuro.representations.utils import pad_sequences
 
-from representations.fusion import Fusion
+from systemds.scuro.representations.fusion import Fusion
 
 
 class RowMax(Fusion):
@@ -38,7 +38,7 @@ class RowMax(Fusion):
         super().__init__("RowMax")
         self.split = split
 
-    def fuse(self, modalities: List[Modality], train_indices):
+    def transform(self, modalities: List[Modality]):
         if len(modalities) < 2:
             return np.array(modalities)
 
@@ -46,8 +46,7 @@ class RowMax(Fusion):
 
         padded_modalities = []
         for modality in modalities:
-            scaled = self.scale_data(modality.data, train_indices)
-            d = pad_sequences(scaled, maxlen=max_emb_size, dtype="float32")
+            d = pad_sequences(modality.data, maxlen=max_emb_size, 
dtype="float32")
             padded_modalities.append(d)
 
         split_rows = int(len(modalities[0].data) / self.split)
diff --git a/src/main/python/systemds/scuro/representations/sum.py 
b/src/main/python/systemds/scuro/representations/sum.py
index bfb19d4f7d..0608338a0f 100644
--- a/src/main/python/systemds/scuro/representations/sum.py
+++ b/src/main/python/systemds/scuro/representations/sum.py
@@ -35,7 +35,7 @@ class Sum(Fusion):
         """
         super().__init__("Sum")
 
-    def fuse(self, modalities: List[Modality]):
+    def transform(self, modalities: List[Modality]):
         max_emb_size = self.get_max_embedding_size(modalities)
 
         data = pad_sequences(modalities[0].data, maxlen=max_emb_size, 
dtype="float32")
diff --git a/src/main/python/systemds/scuro/representations/tfidf.py 
b/src/main/python/systemds/scuro/representations/tfidf.py
index 15515dd538..4849aba136 100644
--- a/src/main/python/systemds/scuro/representations/tfidf.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -19,7 +19,6 @@
 #
 # -------------------------------------------------------------
 
-import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
@@ -32,17 +31,13 @@ class TfIdf(UnimodalRepresentation):
         self.min_df = min_df
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
+    def transform(self, data):
         vectorizer = TfidfVectorizer(min_df=self.min_df)
 
-        segments = read_data_from_file(filepath, indices)
-        X = vectorizer.fit_transform(segments.values())
+        X = vectorizer.fit_transform(data)
         X = X.toarray()
 
         if self.output_file is not None:
-            df = pd.DataFrame(X)
-            df.index = segments.keys()
-
-            save_embeddings(df, self.output_file)
+            save_embeddings(X, self.output_file)
 
         return X
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py 
b/src/main/python/systemds/scuro/representations/unimodal.py
index ccd6197765..c56d611a74 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/representations/unimodal.py
@@ -29,7 +29,7 @@ class UnimodalRepresentation(Representation):
         """
         super().__init__(name)
 
-    def parse_all(self, file_path, indices):
+    def transform(self, data):
         raise f"Not implemented for {self.name}"
 
 
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py 
b/src/main/python/systemds/scuro/representations/word2vec.py
index cc8a180889..209091648d 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -21,10 +21,9 @@
 import numpy as np
 
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
-from systemds.scuro.representations.utils import read_data_from_file, 
save_embeddings
+from systemds.scuro.representations.utils import save_embeddings
 from gensim.models import Word2Vec
 from nltk.tokenize import word_tokenize
-import nltk
 
 
 def get_embedding(sentence, model):
@@ -44,22 +43,20 @@ class W2V(UnimodalRepresentation):
         self.window = window
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
-        segments = read_data_from_file(filepath, indices)
-        embeddings = {}
-        t = [word_tokenize(s.lower()) for s in segments.values()]
+    def transform(self, data):
+        t = [word_tokenize(s.lower()) for s in data]
         model = Word2Vec(
             sentences=t,
             vector_size=self.vector_size,
             window=self.window,
             min_count=self.min_count,
         )
-
-        for k, v in segments.items():
-            tokenized_words = word_tokenize(v.lower())
-            embeddings[k] = get_embedding(tokenized_words, model)
+        embeddings = []
+        for sentences in data:
+            tokens = word_tokenize(sentences.lower())
+            embeddings.append(get_embedding(tokens, model))
 
         if self.output_file is not None:
-            save_embeddings(embeddings, self.output_file)
+            save_embeddings(np.array(embeddings), self.output_file)
 
-        return np.array(list(embeddings.values()))
+        return np.array(embeddings)
diff --git a/src/main/python/tests/scuro/data_generator.py 
b/src/main/python/tests/scuro/data_generator.py
index 9f5b8dd2d7..6856ee7044 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -23,10 +23,7 @@ import numpy as np
 from scipy.io.wavfile import write
 import random
 import os
-
-from systemds.scuro.modality.video_modality import VideoModality
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.text_modality import TextModality
+from systemds.scuro.modality.type import ModalityType
 
 
 class TestDataGenerator:
@@ -36,7 +33,7 @@ class TestDataGenerator:
         self.balanced = balanced
 
         for modality in modalities:
-            mod_path = f"{self.path}/{modality.name.lower()}/"
+            mod_path = f"{self.path}/{modality.type.name}/"
             os.mkdir(mod_path)
             modality.file_path = mod_path
         self.labels = []
@@ -72,17 +69,17 @@ class TestDataGenerator:
                 speed_slow += 1
 
             for modality in self.modalities:
-                if isinstance(modality, VideoModality):
+                if modality.type == ModalityType.VIDEO:
                     self.__create_video_data(idx, duration, 30, speed_factor)
-                if isinstance(modality, AudioModality):
+                if modality.type == ModalityType.AUDIO:
                     self.__create_audio_data(idx, duration, speed_factor)
-                if isinstance(modality, TextModality):
+                if modality.type == ModalityType.TEXT:
                     self.__create_text_data(idx, speed_factor)
 
         np.save(f"{self.path}/labels.npy", np.array(self.labels))
 
     def __create_video_data(self, idx, duration, fps, speed_factor):
-        path = f"{self.path}/video/{idx}.mp4"
+        path = f"{self.path}/VIDEO/{idx}.mp4"
 
         width, height = 160, 120
         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
@@ -108,13 +105,13 @@ class TestDataGenerator:
         out.release()
 
     def __create_text_data(self, idx, speed_factor):
-        path = f"{self.path}/text/{idx}.txt"
+        path = f"{self.path}/TEXT/{idx}.txt"
 
         with open(path, "w") as f:
             f.write(f"The ball moves at speed factor {speed_factor:.2f}.")
 
     def __create_audio_data(self, idx, duration, speed_factor):
-        path = f"{self.path}/audio/{idx}.wav"
+        path = f"{self.path}/AUDIO/{idx}.wav"
         sample_rate = 44100
 
         t = np.linspace(0, duration, int(sample_rate * duration), 
endpoint=False)
diff --git a/src/main/python/tests/scuro/test_data_loaders.py 
b/src/main/python/tests/scuro/test_data_loaders.py
index cbbeafab8a..55704b8d8a 100644
--- a/src/main/python/tests/scuro/test_data_loaders.py
+++ b/src/main/python/tests/scuro/test_data_loaders.py
@@ -22,15 +22,17 @@
 import os
 import shutil
 import unittest
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.text_modality import TextModality
-from systemds.scuro.modality.video_modality import VideoModality
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.representations.bert import Bert
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.resnet import ResNet
-from systemds.scuro.representations.representation_dataloader import HDF5, 
NPY, Pickle
 from tests.scuro.data_generator import TestDataGenerator
 
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.modality.type import ModalityType
+
 
 class TestDataLoaders(unittest.TestCase):
     test_file_path = None
@@ -53,28 +55,26 @@ class TestDataLoaders(unittest.TestCase):
 
         cls.num_instances = 2
         cls.indizes = [str(i) for i in range(0, cls.num_instances)]
-        cls.video = VideoModality(
-            "", 
ResNet(f"{cls.test_file_path}/embeddings/resnet_embeddings.hdf5")
-        )
-        cls.audio = AudioModality(
-            "",
-            MelSpectrogram(
-                
output_file=f"{cls.test_file_path}/embeddings/mel_sp_embeddings.npy"
-            ),
-        )
-        cls.text = TextModality(
-            "",
-            Bert(
-                avg_layers=4,
-                
output_file=f"{cls.test_file_path}/embeddings/bert_embeddings.pkl",
-            ),
-        )
-        cls.mods = [cls.video, cls.audio, cls.text]
+
+        cls.video_path = cls.test_file_path + "/" + ModalityType.VIDEO.name + 
"/"
+        cls.audio_path = cls.test_file_path + "/" + ModalityType.AUDIO.name + 
"/"
+        cls.text_path = cls.test_file_path + "/" + ModalityType.TEXT.name + "/"
+
+        video_data_loader = VideoLoader(cls.video_path, cls.indizes)
+        audio_data_loader = AudioLoader(cls.audio_path, cls.indizes)
+        text_data_loader = TextLoader(cls.text_path, cls.indizes)
+
+        # Load modalities (audio, video, text)
+        video = UnimodalModality(video_data_loader, ModalityType.VIDEO)
+        audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO)
+        text = UnimodalModality(text_data_loader, ModalityType.TEXT)
+
+        cls.mods = [video, audio, text]
         cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path)
         cls.data_generator.create_multimodal_data(cls.num_instances)
-        cls.text.read_all(cls.indizes)
-        cls.audio.read_all(cls.indizes)
-        cls.video.read_all([i for i in range(0, cls.num_instances)])
+        cls.text_ref = text.apply_representation(Bert())
+        cls.audio_ref = audio.apply_representation(MelSpectrogram())
+        cls.video_ref = video.apply_representation(ResNet())
 
     @classmethod
     def tearDownClass(cls):
@@ -82,35 +82,31 @@ class TestDataLoaders(unittest.TestCase):
         shutil.rmtree(cls.test_file_path)
 
     def test_load_audio_data_from_file(self):
-        load_audio = AudioModality(
-            f"{self.test_file_path}/embeddings/mel_sp_embeddings.npy", NPY()
-        )
-        load_audio.read_all(self.indizes)
+        audio_data_loader = AudioLoader(self.audio_path, self.indizes)
+        audio = UnimodalModality(
+            audio_data_loader, ModalityType.AUDIO
+        ).apply_representation(MelSpectrogram())
 
         for i in range(0, self.num_instances):
-            assert round(sum(self.audio.data[i]), 4) == round(
-                sum(load_audio.data[i]), 4
-            )
+            assert round(sum(self.audio_ref.data[i]), 4) == 
round(sum(audio.data[i]), 4)
 
     def test_load_video_data_from_file(self):
-        load_video = VideoModality(
-            f"{self.test_file_path}/embeddings/resnet_embeddings.hdf5", HDF5()
-        )
-        load_video.read_all(self.indizes)
+        video_data_loader = VideoLoader(self.video_path, self.indizes)
+        video = UnimodalModality(
+            video_data_loader, ModalityType.VIDEO
+        ).apply_representation(ResNet())
 
         for i in range(0, self.num_instances):
-            assert round(sum(self.video.data[i]), 4) == round(
-                sum(load_video.data[i]), 4
-            )
+            assert round(sum(self.video_ref.data[i]), 4) == 
round(sum(video.data[i]), 4)
 
     def test_load_text_data_from_file(self):
-        load_text = TextModality(
-            f"{self.test_file_path}/embeddings/bert_embeddings.pkl", Pickle()
-        )
-        load_text.read_all(self.indizes)
+        text_data_loader = TextLoader(self.text_path, self.indizes)
+        text = UnimodalModality(
+            text_data_loader, ModalityType.TEXT
+        ).apply_representation(Bert())
 
         for i in range(0, self.num_instances):
-            assert round(sum(self.text.data[i]), 4) == 
round(sum(load_text.data[i]), 4)
+            assert round(sum(self.text_ref.data[i]), 4) == 
round(sum(text.data[i]), 4)
 
 
 if __name__ == "__main__":
diff --git a/src/main/python/tests/scuro/test_dr_search.py 
b/src/main/python/tests/scuro/test_dr_search.py
index eac4a77641..d0d7ef5077 100644
--- a/src/main/python/tests/scuro/test_dr_search.py
+++ b/src/main/python/tests/scuro/test_dr_search.py
@@ -28,11 +28,13 @@ from sklearn.metrics import classification_report
 from sklearn.model_selection import train_test_split, KFold
 from sklearn.preprocessing import MinMaxScaler
 
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.aligner.dr_search import DRSearch
 from systemds.scuro.aligner.task import Task
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.text_modality import TextModality
-from systemds.scuro.modality.video_modality import VideoModality
 from systemds.scuro.models.model import Model
 from systemds.scuro.representations.average import Average
 from systemds.scuro.representations.bert import Bert
@@ -101,28 +103,27 @@ class TestDataLoaders(unittest.TestCase):
 
         cls.num_instances = 8
         cls.indizes = [str(i) for i in range(0, cls.num_instances)]
-        cls.video = VideoModality(
-            "", 
ResNet(f"{cls.test_file_path}/embeddings/resnet_embeddings.hdf5")
+
+        video_data_loader = VideoLoader(
+            cls.test_file_path + "/" + ModalityType.VIDEO.name + "/", 
cls.indizes
         )
-        cls.audio = AudioModality(
-            "",
-            MelSpectrogram(
-                
output_file=f"{cls.test_file_path}/embeddings/mel_sp_embeddings.npy"
-            ),
+        audio_data_loader = AudioLoader(
+            cls.test_file_path + "/" + ModalityType.AUDIO.name + "/", 
cls.indizes
         )
-        cls.text = TextModality(
-            "",
-            Bert(
-                avg_layers=4,
-                
output_file=f"{cls.test_file_path}/embeddings/bert_embeddings.pkl",
-            ),
+        text_data_loader = TextLoader(
+            cls.test_file_path + "/" + ModalityType.TEXT.name + "/", 
cls.indizes
         )
-        cls.mods = [cls.video, cls.audio, cls.text]
-        cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path)
+        video = UnimodalModality(video_data_loader, ModalityType.VIDEO)
+        audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO)
+        text = UnimodalModality(text_data_loader, ModalityType.TEXT)
+        cls.data_generator = TestDataGenerator([video, audio, text], 
cls.test_file_path)
         cls.data_generator.create_multimodal_data(cls.num_instances)
-        cls.text.read_all(cls.indizes)
-        cls.audio.read_all(cls.indizes)
-        cls.video.read_all([i for i in range(0, cls.num_instances)])
+
+        cls.bert = text.apply_representation(Bert())
+        cls.mel_spe = audio.apply_representation(MelSpectrogram())
+        cls.resnet = video.apply_representation(ResNet())
+
+        cls.mods = [cls.bert, cls.mel_spe, cls.resnet]
 
         split = train_test_split(
             cls.indizes, cls.data_generator.labels, test_size=0.2, 
random_state=42

Reply via email to