This is an automated email from the ASF dual-hosted git repository.

cdionysio pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new cd4c828f2a [SYSTEMDS-3835] Add additional visual representations
cd4c828f2a is described below

commit cd4c828f2af696fab3659f96377f168e97a381f2
Author: Christina Dionysio <[email protected]>
AuthorDate: Thu Nov 13 18:03:25 2025 +0100

    [SYSTEMDS-3835] Add additional visual representations
    
    This patch adds new visual (image, video) representations, and a test utility for the image modality.
---
 .github/workflows/python.yml                       |   5 +-
 src/main/python/systemds/scuro/__init__.py         |   8 +-
 src/main/python/systemds/scuro/modality/type.py    |   9 ++
 .../systemds/scuro/modality/unimodal_modality.py   |   5 +-
 .../python/systemds/scuro/representations/clip.py  | 133 +++++++++++++++++++++
 .../scuro/representations/color_histogram.py       | 111 +++++++++++++++++
 .../systemds/scuro/representations/resnet.py       |  10 +-
 .../representations/swin_video_transformer.py      |   5 +-
 .../scuro/representations/{resnet.py => vgg.py}    | 107 +++++------------
 .../python/systemds/scuro/representations/x3d.py   | 117 ++++++++++--------
 .../python/systemds/scuro/utils/torch_dataset.py   |  28 +++--
 src/main/python/tests/scuro/data_generator.py      |  38 ++++--
 .../python/tests/scuro/test_operator_registry.py   |  49 +++++++-
 .../tests/scuro/test_unimodal_representations.py   |  33 ++++-
 14 files changed, 494 insertions(+), 164 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index bf20101e6c..fcd8bf8c84 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -171,7 +171,8 @@ jobs:
           h5py \
           gensim \
           opt-einsum \
-          nltk
+          nltk \
+          fvcore
         kill $KA 
         cd src/main/python
-        python -m unittest discover -s tests/scuro -p 'test_*.py' -v
\ No newline at end of file
+        python -m unittest discover -s tests/scuro -p 'test_*.py' -v
diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
index c1db4c3d49..e74ae53f36 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -77,6 +77,7 @@ from systemds.scuro.representations.window_aggregation import (
 )
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.x3d import X3D
+from systemds.scuro.representations.color_histogram import ColorHistogram
 from systemds.scuro.models.model import Model
 from systemds.scuro.models.discrete_model import DiscreteModel
 from systemds.scuro.modality.joined import JoinedModality
@@ -97,7 +98,8 @@ from systemds.scuro.representations.covarep_audio_features import (
 )
 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
-
+from systemds.scuro.representations.vgg import VGG19
+from systemds.scuro.representations.clip import CLIPText, CLIPVisual
 
 __all__ = [
     "BaseLoader",
@@ -120,6 +122,7 @@ __all__ = [
     "MFCC",
     "Hadamard",
     "OpticalFlow",
+    "ColorHistogram",
     "Representation",
     "NPY",
     "JSON",
@@ -169,4 +172,7 @@ __all__ = [
     "Quantile",
     "BandpowerFFT",
     "ZeroCrossingRate",
+    "VGG19",
+    "CLIPVisual",
+    "CLIPText",
 ]
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index ef1e0eeab2..382e2631ad 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -254,7 +254,16 @@ class ModalityType(Flag):
         md["data_layout"]["representation"] = DataLayout.NESTED_LEVEL
         md["data_layout"]["type"] = float
         md["data_layout"]["shape"] = (width, height, num_channels)
+        return md
 
+    def create_image_metadata(self, width, height, num_channels):
+        md = deepcopy(self.get_schema())
+        md["width"] = width
+        md["height"] = height
+        md["num_channels"] = num_channels
+        md["data_layout"]["representation"] = DataLayout.SINGLE_LEVEL
+        md["data_layout"]["type"] = float
+        md["data_layout"]["shape"] = (width, height, num_channels)
         return md
 
 
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index 373921e95c..4ae1067c62 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -165,8 +165,9 @@ class UnimodalModality(Modality):
                         padded = np.pad(
                             embeddings,
                             pad_width=(
-                                (0, padding_needed),
-                                (0, 0),
+                                (0, padding_needed)
+                                if len(embeddings.shape) == 1
+                                else ((0, padding_needed), (0, 0))
                             ),
                             mode="constant",
                             constant_values=0,
diff --git a/src/main/python/systemds/scuro/representations/clip.py b/src/main/python/systemds/scuro/representations/clip.py
new file mode 100644
index 0000000000..044d0f795a
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/clip.py
@@ -0,0 +1,133 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import numpy as np
+from torchvision import transforms
+
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+import torch
+from systemds.scuro.representations.utils import save_embeddings
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.drsearch.operator_registry import register_representation
+from transformers import CLIPProcessor, CLIPModel
+
+from systemds.scuro.utils.converter import numpy_dtype_to_torch_dtype
+from systemds.scuro.utils.static_variables import get_device
+from systemds.scuro.utils.torch_dataset import CustomDataset
+
+
+@register_representation(ModalityType.VIDEO)
+class CLIPVisual(UnimodalRepresentation):
+    def __init__(self, output_file=None):
+        parameters = {}
+        super().__init__("CLIPVisual", ModalityType.EMBEDDING, parameters)
+        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(
+            get_device()
+        )
+        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        self.output_file = output_file
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(modality, self)
+        self.data_type = numpy_dtype_to_torch_dtype(modality.data_type)
+        if next(self.model.parameters()).dtype != self.data_type:
+            self.model = self.model.to(self.data_type)
+
+        embeddings = self.create_visual_embeddings(modality)
+
+        if self.output_file is not None:
+            save_embeddings(embeddings, self.output_file)
+
+        transformed_modality.data = list(embeddings.values())
+        return transformed_modality
+
+    def create_visual_embeddings(self, modality):
+        tf = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
+        dataset = CustomDataset(
+            modality.data,
+            self.data_type,
+            get_device(),
+            (modality.metadata[0]["width"], modality.metadata[0]["height"]),
+            tf=tf,
+        )
+        embeddings = {}
+        for instance in torch.utils.data.DataLoader(dataset):
+            id = int(instance["id"][0])
+            frames = instance["data"][0]
+            embeddings[id] = []
+            batch_size = 64
+
+            for start_index in range(0, len(frames), batch_size):
+                end_index = min(start_index + batch_size, len(frames))
+                frame_ids_range = range(start_index, end_index)
+                frame_batch = frames[frame_ids_range]
+
+                inputs = self.processor(images=frame_batch, return_tensors="pt")
+                with torch.no_grad():
+                    output = self.model.get_image_features(**inputs)
+
+                if len(output.shape) > 2:
+                    output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))
+
+                embeddings[id].extend(
+                    torch.flatten(output, 1)
+                    .detach()
+                    .cpu()
+                    .float()
+                    .numpy()
+                    .astype(modality.data_type)
+                )
+
+            embeddings[id] = np.array(embeddings[id])
+        return embeddings
+
+
+@register_representation(ModalityType.TEXT)
+class CLIPText(UnimodalRepresentation):
+    def __init__(self, output_file=None):
+        parameters = {}
+        super().__init__("CLIPText", ModalityType.EMBEDDING, parameters)
+        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(
+            get_device()
+        )
+        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        self.output_file = output_file
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(modality, self)
+
+        embeddings = self.create_text_embeddings(modality.data, self.model)
+
+        if self.output_file is not None:
+            save_embeddings(embeddings, self.output_file)
+
+        transformed_modality.data = embeddings
+        return transformed_modality
+
+    def create_text_embeddings(self, data, model):
+        embeddings = []
+        for d in data:
+            inputs = self.processor(text=d, return_tensors="pt", padding=True)
+            with torch.no_grad():
+                text_embedding = model.get_text_features(**inputs)
+                embeddings.append(text_embedding.squeeze().numpy().reshape(1, -1))
+
+        return embeddings
diff --git a/src/main/python/systemds/scuro/representations/color_histogram.py b/src/main/python/systemds/scuro/representations/color_histogram.py
new file mode 100644
index 0000000000..6412b1979d
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/color_histogram.py
@@ -0,0 +1,111 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import numpy as np
+import cv2
+
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.modality.transformed import TransformedModality
+
+
+class ColorHistogram(UnimodalRepresentation):
+    def __init__(
+        self,
+        color_space="RGB",
+        bins=32,
+        normalize=True,
+        aggregation="mean",
+        output_file=None,
+    ):
+        super().__init__(
+            "ColorHistogram", ModalityType.EMBEDDING, self._get_parameters()
+        )
+        self.color_space = color_space
+        self.bins = bins
+        self.normalize = normalize
+        self.aggregation = aggregation
+        self.output_file = output_file
+
+    def _get_parameters(self):
+        return {
+            "color_space": ["RGB", "HSV", "GRAY"],
+            "bins": [8, 16, 32, 64, 128, 256, (8, 8, 8), (16, 16, 16)],
+            "normalize": [True, False],
+            "aggregation": ["mean", "max", "concat"],
+        }
+
+    def compute_histogram(self, image):
+        if self.color_space == "HSV":
+            img = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
+            channels = [0, 1, 2]
+        elif self.color_space == "GRAY":
+            img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+            channels = [0]
+        else:
+            img = image
+            channels = [0, 1, 2]
+
+        hist = self._region_histogram(img, channels)
+        return hist
+
+    def _region_histogram(self, img, channels):
+        if isinstance(self.bins, tuple):
+            bins = self.bins
+        elif len(channels) > 1:
+            bins = [self.bins] * len(channels)
+        else:
+            bins = [self.bins]
+        hist = cv2.calcHist([img], channels, None, bins, [0, 256] * len(channels))
+        hist = hist.flatten()
+        if self.normalize:
+            hist_sum = np.sum(hist)
+            if hist_sum > 0:
+                hist /= hist_sum
+        return hist.astype(np.float32)
+
+    def transform(self, modality):
+        if modality.modality_type == ModalityType.IMAGE:
+            images = modality.data
+            hist_list = [self.compute_histogram(img) for img in images]
+            transformed_modality = TransformedModality(
+                modality, self, ModalityType.EMBEDDING
+            )
+            transformed_modality.data = hist_list
+            return transformed_modality
+        elif modality.modality_type == ModalityType.VIDEO:
+            embeddings = []
+            for vid in modality.data:
+                frame_hists = [self.compute_histogram(frame) for frame in vid]
+                if self.aggregation == "mean":
+                    hist = np.mean(frame_hists, axis=0)
+                elif self.aggregation == "max":
+                    hist = np.max(frame_hists, axis=0)
+                elif self.aggregation == "concat":
+                    hist = np.concatenate(frame_hists)
+                embeddings.append(hist)
+            transformed_modality = TransformedModality(
+                modality, self, ModalityType.EMBEDDING
+            )
+            transformed_modality.data = embeddings
+            return transformed_modality
+        else:
+            raise ValueError("Unsupported data format for HistogramRepresentation")
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index 7bb94d8bfd..f544e6a46f 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -144,17 +144,21 @@ class ResNet(UnimodalRepresentation):
             embeddings[video_id] = []
             batch_size = 64
 
+            if modality.modality_type == ModalityType.IMAGE:
+                frames = frames.unsqueeze(0)
+
             for start_index in range(0, len(frames), batch_size):
                 end_index = min(start_index + batch_size, len(frames))
                 frame_ids_range = range(start_index, end_index)
                 frame_batch = frames[frame_ids_range]
 
                 _ = self.model(frame_batch)
-                values = res5c_output
-                pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
+                output = res5c_output
+                if len(output.shape) > 2:
+                    output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))
 
                 embeddings[video_id].extend(
-                    torch.flatten(pooled, 1)
+                    torch.flatten(output, 1)
                     .detach()
                     .cpu()
                     .float()
diff --git a/src/main/python/systemds/scuro/representations/swin_video_transformer.py b/src/main/python/systemds/scuro/representations/swin_video_transformer.py
index c0b7ab38ab..e8511dd0cf 100644
--- a/src/main/python/systemds/scuro/representations/swin_video_transformer.py
+++ b/src/main/python/systemds/scuro/representations/swin_video_transformer.py
@@ -34,7 +34,7 @@ from systemds.scuro.utils.torch_dataset import CustomDataset
 from systemds.scuro.utils.static_variables import get_device
 
 
-# @register_representation([ModalityType.VIDEO])
+@register_representation([ModalityType.VIDEO])
 class SwinVideoTransformer(UnimodalRepresentation):
     def __init__(self, layer_name="avgpool"):
         parameters = {
@@ -50,7 +50,7 @@ class SwinVideoTransformer(UnimodalRepresentation):
             ],
         }
         self.data_type = torch.float
-        super().__init__("SwinVideoTransformer", ModalityType.TIMESERIES, parameters)
+        super().__init__("SwinVideoTransformer", ModalityType.EMBEDDING, parameters)
         self.layer_name = layer_name
         self.model = swin3d_t(weights=models.video.Swin3D_T_Weights.KINETICS400_V1).to(
             get_device()
@@ -95,6 +95,7 @@ class SwinVideoTransformer(UnimodalRepresentation):
                 .detach()
                 .cpu()
                 .numpy()
+                .flatten()
                 .astype(modality.data_type)
             )
 
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/vgg.py
similarity index 57%
copy from src/main/python/systemds/scuro/representations/resnet.py
copy to src/main/python/systemds/scuro/representations/vgg.py
index 7bb94d8bfd..374586f2b9 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/vgg.py
@@ -26,6 +26,7 @@ from typing import Tuple, Any
 from systemds.scuro.drsearch.operator_registry import register_representation
 import torch.utils.data
 import torch
+import re
 import torchvision.models as models
 import numpy as np
 from systemds.scuro.modality.type import ModalityType
@@ -33,13 +34,12 @@ from systemds.scuro.utils.static_variables import get_device
 
 
 @register_representation([ModalityType.IMAGE, ModalityType.VIDEO])
-class ResNet(UnimodalRepresentation):
-    def __init__(self, model_name="ResNet18", layer="avgpool", output_file=None):
+class VGG19(UnimodalRepresentation):
+    def __init__(self, layer="classifier.0", output_file=None):
         self.data_type = torch.bfloat16
-        self.model_name = model_name
+        self.model = models.vgg19(weights=models.VGG19_Weights.DEFAULT).to(get_device())
         parameters = self._get_parameters()
-        super().__init__("ResNet", ModalityType.EMBEDDING, parameters)
-
+        super().__init__("VGG19", ModalityType.EMBEDDING, parameters)
         self.output_file = output_file
         self.layer_name = layer
         self.model.eval()
@@ -52,65 +52,16 @@ class ResNet(UnimodalRepresentation):
 
         self.model.fc = Identity()
 
-    @property
-    def model_name(self):
-        return self._model_name
-
-    @model_name.setter
-    def model_name(self, model_name):
-        self._model_name = model_name
-        if model_name == "ResNet18":
-            self.model = (
-                models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
-                .to(get_device())
-                .to(self.data_type)
-            )
-
-        elif model_name == "ResNet34":
-            self.model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to(
-                get_device()
-            )
-            self.model = self.model.to(self.data_type)
-        elif model_name == "ResNet50":
-            self.model = (
-                models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
-                .to(get_device())
-                .to(self.data_type)
-            )
+    def _get_parameters(self):
+        parameters = {"layer_name": []}
 
-        elif model_name == "ResNet101":
-            self.model = (
-                models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
-                .to(get_device())
-                .to(self.data_type)
-            )
+        parameters["layer_name"] = [
+            "features.35",
+            "classifier.0",
+            "classifier.3",
+            "classifier.6",
+        ]
 
-        elif model_name == "ResNet152":
-            self.model = (
-                models.resnet152(weights=models.ResNet152_Weights.DEFAULT)
-                .to(get_device())
-                .to(self.data_type)
-            )
-        else:
-            raise NotImplementedError
-
-    def _get_parameters(self, high_level=True):
-        parameters = {"model_name": [], "layer_name": []}
-        for m in ["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"]:
-            parameters["model_name"].append(m)
-
-        if high_level:
-            parameters["layer_name"] = [
-                "conv1",
-                "layer1",
-                "layer2",
-                "layer3",
-                "layer4",
-                "avgpool",
-            ]
-        else:
-            for name, layer in self.model.named_modules():
-                parameters["layer_name"].append(name)
         return parameters
 
     def transform(self, modality):
@@ -121,28 +72,32 @@ class ResNet(UnimodalRepresentation):
         dataset = CustomDataset(modality.data, self.data_type, get_device())
         embeddings = {}
 
-        res5c_output = None
+        activations = {}
 
-        def get_features(name_):
+        def get_activation(name_):
             def hook(
                 _module: torch.nn.Module, input_: Tuple[torch.Tensor], output: Any
             ):
-                nonlocal res5c_output
-                res5c_output = output
+                activations[name_] = output
 
             return hook
 
-        if self.layer_name:
-            for name, layer in self.model.named_modules():
-                if name == self.layer_name:
-                    layer.register_forward_hook(get_features(name))
-                    break
+        digit = re.findall(r"\d+", self.layer_name)[0]
+        if "feature" in self.layer_name:
+            self.model.features[int(digit)].register_forward_hook(
+                get_activation(self.layer_name)
+            )
+        else:
+
+            self.model.classifier[int(digit)].register_forward_hook(
+                get_activation(self.layer_name)
+            )
 
         for instance in torch.utils.data.DataLoader(dataset):
             video_id = instance["id"][0]
             frames = instance["data"][0]
             embeddings[video_id] = []
-            batch_size = 64
+            batch_size = 32
 
             for start_index in range(0, len(frames), batch_size):
                 end_index = min(start_index + batch_size, len(frames))
@@ -150,11 +105,11 @@ class ResNet(UnimodalRepresentation):
                 frame_batch = frames[frame_ids_range]
 
                 _ = self.model(frame_batch)
-                values = res5c_output
-                pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
-
+                output = activations[self.layer_name]
+                if len(output.shape) == 4:
+                    output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))
                 embeddings[video_id].extend(
-                    torch.flatten(pooled, 1)
+                    torch.flatten(output, 1)
                     .detach()
                     .cpu()
                     .float()
diff --git a/src/main/python/systemds/scuro/representations/x3d.py b/src/main/python/systemds/scuro/representations/x3d.py
index 1629ac6f30..7701865f82 100644
--- a/src/main/python/systemds/scuro/representations/x3d.py
+++ b/src/main/python/systemds/scuro/representations/x3d.py
@@ -18,34 +18,27 @@
 # under the License.
 #
 # -------------------------------------------------------------
+from systemds.scuro.utils.static_variables import get_device
 from systemds.scuro.utils.torch_dataset import CustomDataset
 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
-from typing import Callable, Dict, Tuple, Any
+from typing import Tuple, Any
 import torch.utils.data
 import torch
 from torchvision.models.video import r3d_18, s3d
 import torchvision.models as models
-import torchvision.transforms as transforms
 import numpy as np
 from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.drsearch.operator_registry import register_representation
-import math
 
-if torch.backends.mps.is_available():
-    DEVICE = torch.device("mps")
-elif torch.cuda.is_available():
-    DEVICE = torch.device("cuda")
-else:
-    DEVICE = torch.device("cpu")
 
-
-# @register_representation([ModalityType.VIDEO])
+@register_representation([ModalityType.VIDEO])
 class X3D(UnimodalRepresentation):
-    def __init__(self, layer="avgpool", model_name="r3d", output_file=None):
+    def __init__(self, layer="classifier.1", model_name="s3d", output_file=None):
+        self.data_type = torch.float32
         self.model_name = model_name
         parameters = self._get_parameters()
-        super().__init__("X3D", ModalityType.TIMESERIES, parameters)
+        super().__init__("X3D", ModalityType.EMBEDDING, parameters)
 
         self.output_file = output_file
         self.layer_name = layer
@@ -67,25 +60,37 @@ class X3D(UnimodalRepresentation):
     def model_name(self, model_name):
         self._model_name = model_name
         if model_name == "r3d":
-            self.model = r3d_18(pretrained=True).to(DEVICE)
+            self.model = r3d_18(pretrained=True).to(get_device())
         elif model_name == "s3d":
-            self.model = s3d(weights=models.video.S3D_Weights.DEFAULT).to(DEVICE)
+            self.model = s3d(weights=models.video.S3D_Weights.DEFAULT).to(get_device())
         else:
             raise NotImplementedError
 
     def _get_parameters(self, high_level=True):
         parameters = {"model_name": [], "layer_name": []}
-        for m in ["r3d", "s3d"]:
+        for m in ["c3d", "s3d"]:
             parameters["model_name"].append(m)
 
         if high_level:
             parameters["layer_name"] = [
-                "conv1",
-                "layer1",
-                "layer2",
-                "layer3",
-                "layer4",
+                "features.1",
+                "features.2",
+                "features.3",
+                "features.4",
+                "features.5",
+                "features.6",
+                "features.7",
+                "features.8",
+                "features.9",
+                "features.10",
+                "features.11",
+                "features.12",
+                "features.13",
+                "features.14",
+                "features.15",
                 "avgpool",
+                "classifier.0",
+                "classifier.1",
             ]
         else:
             for name, layer in self.model.named_modules():
@@ -93,17 +98,18 @@ class X3D(UnimodalRepresentation):
         return parameters
 
     def transform(self, modality):
-        dataset = CustomDataset(modality.data)
+        dataset = CustomDataset(modality.data, self.data_type, get_device())
+
         embeddings = {}
 
-        res5c_output = None
+        activation = None
 
         def get_features(name_):
             def hook(
                 _module: torch.nn.Module, input_: Tuple[torch.Tensor], output: Any
             ):
-                nonlocal res5c_output
-                res5c_output = output
+                nonlocal activation
+                activation = output
 
             return hook
 
@@ -115,15 +121,20 @@ class X3D(UnimodalRepresentation):
 
         for instance in dataset:
             video_id = instance["id"]
-            frames = instance["data"].to(DEVICE)
+            frames = instance["data"].to(get_device())
             embeddings[video_id] = []
 
             frames = frames.unsqueeze(0).permute(0, 2, 1, 3, 4)
+            if frames.shape[2] < 14:
+                pad_width = (0, 0, 0, 0, 0, 14 - frames.shape[2], 0, 0, 0, 0)
+                frames = torch.nn.functional.pad(frames, pad_width, mode="constant")
             _ = self.model(frames)
-            values = res5c_output
+            values = activation
             pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
 
-            embeddings[video_id].extend(torch.flatten(pooled, 1).detach().cpu().numpy())
+            embeddings[video_id].extend(
+                torch.flatten(pooled, 1).detach().cpu().numpy().flatten()
+            )
 
             embeddings[video_id] = np.array(embeddings[video_id])
 
@@ -137,13 +148,13 @@ class X3D(UnimodalRepresentation):
 
 
 class I3D(UnimodalRepresentation):
-    def __init__(self, layer="avgpool", model_name="i3d", output_file=None):
+    def __init__(self, layer="blocks.6", model_name="i3d", output_file=None):
         self.model_name = model_name
-        parameters = self._get_parameters()
         self.model = torch.hub.load(
             "facebookresearch/pytorchvideo", "i3d_r50", pretrained=True
-        ).to(DEVICE)
-        super().__init__("I3D", ModalityType.TIMESERIES, parameters)
+        ).to(get_device())
+        parameters = self._get_parameters()
+        super().__init__("I3D", ModalityType.EMBEDDING, parameters)
 
         self.output_file = output_file
         self.layer_name = layer
@@ -152,18 +163,17 @@ class I3D(UnimodalRepresentation):
             param.requires_grad = False
 
     def _get_parameters(self, high_level=True):
-        parameters = {"model_name": [], "layer_name": []}
-        for m in ["r3d", "s3d"]:
-            parameters["model_name"].append(m)
+        parameters = {"layer_name": []}
 
         if high_level:
             parameters["layer_name"] = [
-                "conv1",
-                "layer1",
-                "layer2",
-                "layer3",
-                "layer4",
-                "avgpool",
+                "blocks.0",
+                "blocks.1",
+                "blocks.2",
+                "blocks.3",
+                "blocks.4",
+                "blocks.5",
+                "blocks.6",
             ]
         else:
             for name, layer in self.model.named_modules():
@@ -171,28 +181,37 @@ class I3D(UnimodalRepresentation):
         return parameters
 
     def transform(self, modality):
-        dataset = CustomDataset(modality.data, torch.float32, DEVICE)
+        dataset = CustomDataset(modality.data, torch.float32, get_device())
         embeddings = {}
 
         features = None
 
-        def hook(module, input, output):
-            pooled = torch.nn.functional.adaptive_avg_pool3d(output, 1).squeeze()
-            nonlocal features
-            features = pooled.detach().cpu().numpy()
+        def get_features(name_):
+            def hook(
+                _module: torch.nn.Module, input_: Tuple[torch.Tensor], output: Any
+            ):
+                # pooled = torch.nn.functional.adaptive_avg_pool3d(output, 1).squeeze()
+                nonlocal features
+                features = output.detach().cpu().numpy()
+
+            return hook
 
-        handle = self.model.blocks[6].dropout.register_forward_hook(hook)
+        if self.layer_name:
+            for name, layer in self.model.named_modules():
+                if name == self.layer_name:
+                    layer.register_forward_hook(get_features(name))
+                    break
 
         for instance in dataset:
             video_id = instance["id"]
-            frames = instance["data"].to(DEVICE)
+            frames = instance["data"].to(get_device())
             embeddings[video_id] = []
 
             batch = torch.transpose(frames, 1, 0)
             batch = batch.unsqueeze(0)
             _ = self.model(batch)
 
-            embeddings[video_id] = features
+            embeddings[video_id] = features.flatten()
 
         transformed_modality = TransformedModality(
             modality, self, self.output_modality_type
diff --git a/src/main/python/systemds/scuro/utils/torch_dataset.py b/src/main/python/systemds/scuro/utils/torch_dataset.py
index c04be0ec7b..2a7ec1f963 100644
--- a/src/main/python/systemds/scuro/utils/torch_dataset.py
+++ b/src/main/python/systemds/scuro/utils/torch_dataset.py
@@ -20,12 +20,13 @@
 # -------------------------------------------------------------
 from typing import Dict
 
+import numpy as np
 import torch
 import torchvision.transforms as transforms
 
 
 class CustomDataset(torch.utils.data.Dataset):
-    def __init__(self, data, data_type, device, size=None):
+    def __init__(self, data, data_type, device, size=None, tf=None):
         self.data = data
         self.data_type = data_type
         self.device = device
@@ -33,7 +34,7 @@ class CustomDataset(torch.utils.data.Dataset):
         if size is None:
             self.size = (256, 224)
 
-        self.tf = transforms.Compose(
+        tf_default = transforms.Compose(
             [
                 transforms.ToPILImage(),
                 transforms.Resize(self.size[0]),
@@ -46,6 +47,11 @@ class CustomDataset(torch.utils.data.Dataset):
             ]
         )
 
+        if tf is None:
+            self.tf = tf_default
+        else:
+            self.tf = tf
+
     def __getitem__(self, index) -> Dict[str, object]:
         data = self.data[index]
         output = torch.empty(
@@ -54,12 +60,20 @@ class CustomDataset(torch.utils.data.Dataset):
             device=self.device,
         )
 
-        for i, d in enumerate(data):
-            if data[0].ndim < 3:
-                d = torch.tensor(d)
-                d = d.repeat(3, 1, 1)
+        if isinstance(data, np.ndarray) and data.ndim == 3:
+            # image
+            data = torch.tensor(data).permute(2, 0, 1)
+            output = self.tf(data).to(self.device)
+        else:
+            for i, d in enumerate(data):
+                if data[0].ndim < 3:
+                    d = torch.tensor(d)
+                    d = d.repeat(3, 1, 1)
 
-            output[i] = self.tf(d)
+                tf = self.tf(d)
+                if tf.shape[0] != 3:
+                    tf = tf[:3, :, :]
+                output[i] = tf
 
         return {"id": index, "data": output}
 
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 11f034d9ce..7676906505 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -195,26 +195,38 @@ class ModalityRandomDataGenerator:
     def create_visual_modality(
         self, num_instances, max_num_frames=1, height=28, width=28
     ):
-        data = [
-            np.random.randint(
-                0,
-                256,
-                (np.random.randint(5, max_num_frames + 1), height, width, 3),
-                dtype=np.uint8,
-            )
-            for _ in range(num_instances)
-        ]
-        if max_num_frames == 1:
-            print(f"TODO: create image metadata")
-        else:
+        if max_num_frames > 1:
+            data = [
+                np.random.randint(
+                    0,
+                    256,
+                    (np.random.randint(1, max_num_frames + 1), height, width, 3),
+                    dtype=np.uint8,
+                )
+                for _ in range(num_instances)
+            ]
             metadata = {
                 i: ModalityType.VIDEO.create_video_metadata(
                     30, data[i].shape[0], width, height, 3
                 )
                 for i in range(num_instances)
             }
+        else:
+            data = [
+                np.random.randint(
+                    0,
+                    256,
+                    (height, width, 3),
+                    dtype=np.uint8,
+                )
+                for _ in range(num_instances)
+            ]
+            metadata = {
+                i: ModalityType.IMAGE.create_image_metadata(width, height, 3)
+                for i in range(num_instances)
+            }
 
-        return (data, metadata)
+        return data, metadata
 
     def create_balanced_labels(self, num_instances, num_classes=2):
         if num_instances % num_classes != 0:
diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py
index b5fa4b01b4..c33eb5fcc2 100644
--- a/src/main/python/tests/scuro/test_operator_registry.py
+++ b/src/main/python/tests/scuro/test_operator_registry.py
@@ -21,6 +21,7 @@
 
 import unittest
 
+from systemds.scuro import FrequencyMagnitude
 from systemds.scuro.representations.covarep_audio_features import (
     ZeroCrossing,
     Spectral,
@@ -29,6 +30,9 @@ from systemds.scuro.representations.covarep_audio_features import (
 )
 from systemds.scuro.representations.mfcc import MFCC
 from systemds.scuro.representations.swin_video_transformer import SwinVideoTransformer
+from systemds.scuro.representations.clip import CLIPText, CLIPVisual
+from systemds.scuro.representations.vgg import VGG19
+from systemds.scuro.representations.x3d import X3D, I3D
 from systemds.scuro.representations.wav2vec import Wav2Vec
 from systemds.scuro.representations.window_aggregation import (
     WindowAggregation,
@@ -39,6 +43,22 @@ from systemds.scuro.representations.bow import BoW
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.tfidf import TfIdf
 from systemds.scuro.drsearch.operator_registry import Registry
+from systemds.scuro.representations.timeseries_representations import (
+    Max,
+    Mean,
+    Min,
+    RMS,
+    Sum,
+    Std,
+    Skew,
+    Kurtosis,
+    SpectralCentroid,
+    BandpowerFFT,
+    ACF,
+    Quantile,
+    ZeroCrossingRate,
+    FrequencyMagnitude,
+)
 from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.representations.average import Average
 from systemds.scuro.representations.bert import Bert
@@ -49,7 +69,6 @@ from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.spectrogram import Spectrogram
 from systemds.scuro.representations.hadamard import Hadamard
 from systemds.scuro.representations.resnet import ResNet
-from systemds.scuro.representations.sum import Sum
 from systemds.scuro.representations.multimodal_attention_fusion import AttentionFusion
 
 
@@ -71,16 +90,34 @@ class TestOperatorRegistry(unittest.TestCase):
         registry = Registry()
         assert registry.get_representations(ModalityType.VIDEO) == [
             ResNet,
-            # SwinVideoTransformer,
+            SwinVideoTransformer,
+            X3D,
+            VGG19,
+            CLIPVisual,
         ]
 
-    # def test_timeseries_representations_in_registry(self):
-    #     registry = Registry()
-    #     assert registry.get_representations(ModalityType.TIMESERIES) == [ResNet]
+    def test_timeseries_representations_in_registry(self):
+        registry = Registry()
+        assert registry.get_representations(ModalityType.TIMESERIES) == [
+            Mean,
+            Min,
+            Max,
+            Sum,
+            Std,
+            Skew,
+            Quantile,
+            Kurtosis,
+            RMS,
+            ZeroCrossingRate,
+            ACF,
+            FrequencyMagnitude,
+            SpectralCentroid,
+            BandpowerFFT,
+        ]
 
     def test_text_representations_in_registry(self):
         registry = Registry()
-        for representation in [BoW, TfIdf, W2V, Bert]:
+        for representation in [CLIPText, BoW, TfIdf, W2V, Bert]:
             assert representation in registry.get_representations(
                 ModalityType.TEXT
             ), f"{representation} not in registry"
diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py
index 6789786cfd..3bc28ee23c 100644
--- a/src/main/python/tests/scuro/test_unimodal_representations.py
+++ b/src/main/python/tests/scuro/test_unimodal_representations.py
@@ -23,6 +23,7 @@ import unittest
 import copy
 import numpy as np
 
+from systemds.scuro.representations.clip import CLIPVisual, CLIPText
 from systemds.scuro.representations.bow import BoW
 from systemds.scuro.representations.covarep_audio_features import (
     Spectral,
@@ -34,6 +35,9 @@ from systemds.scuro.representations.wav2vec import Wav2Vec
 from systemds.scuro.representations.spectrogram import Spectrogram
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.tfidf import TfIdf
+from systemds.scuro.representations.x3d import X3D
+from systemds.scuro.representations.x3d import I3D
+from systemds.scuro.representations.color_histogram import ColorHistogram
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.mfcc import MFCC
@@ -59,6 +63,7 @@ from systemds.scuro.representations.timeseries_representations import (
     ZeroCrossingRate,
     BandpowerFFT,
 )
+from systemds.scuro.representations.vgg import VGG19
 
 
 class TestUnimodalRepresentations(unittest.TestCase):
@@ -143,11 +148,34 @@ class TestUnimodalRepresentations(unittest.TestCase):
             for i in range(self.num_instances):
                 assert (ts.data[i] == original_data[i]).all()
 
+    def test_image_representations(self):
+        image_representations = [ColorHistogram(), CLIPVisual(), ResNet()]
+
+        image_data, image_md = ModalityRandomDataGenerator().create_visual_modality(
+            self.num_instances, 1
+        )
+
+        image = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.IMAGE, image_data, np.float32, image_md
+            )
+        )
+
+        for representation in image_representations:
+            r = image.apply_representation(representation)
+            assert r.data is not None
+            assert len(r.data) == self.num_instances
+
     def test_video_representations(self):
         video_representations = [
+            CLIPVisual(),
+            ColorHistogram(),
+            I3D(),
+            X3D(),
+            VGG19(),
             ResNet(),
             SwinVideoTransformer(),
-        ]  # Todo: add other video representations
+        ]
         video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
             self.num_instances, 60
         )
@@ -160,10 +188,9 @@ class TestUnimodalRepresentations(unittest.TestCase):
             r = video.apply_representation(representation)
             assert r.data is not None
             assert len(r.data) == self.num_instances
-            assert r.data[0].ndim == 2
 
     def test_text_representations(self):
-        test_representations = [BoW(2, 2), TfIdf(), W2V()]
+        test_representations = [CLIPText(), BoW(2, 2), TfIdf(), W2V()]
         text_data, text_md = ModalityRandomDataGenerator().create_text_data(
             self.num_instances
         )


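For readers who want to try the new operators, below is a minimal usage sketch based on the tests added in this patch. It assumes the test helpers ModalityRandomDataGenerator and TestDataLoader are importable from src/main/python/tests/scuro/data_generator.py and that the index format matches what the tests use; it is illustrative only and not part of the commit.

    import numpy as np

    from systemds.scuro.modality.type import ModalityType
    from systemds.scuro.modality.unimodal_modality import UnimodalModality
    from systemds.scuro.representations.color_histogram import ColorHistogram

    # Test helpers from this patch (assumed import path, see data_generator.py above)
    from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader

    num_instances = 4
    indices = list(range(num_instances))  # assumed index format expected by TestDataLoader

    # max_num_frames=1 now yields single images plus IMAGE metadata (create_visual_modality)
    image_data, image_md = ModalityRandomDataGenerator().create_visual_modality(
        num_instances, 1
    )

    image = UnimodalModality(
        TestDataLoader(indices, None, ModalityType.IMAGE, image_data, np.float32, image_md)
    )

    # Apply one of the new representations; CLIPVisual() or VGG19() are used the same way
    histograms = image.apply_representation(ColorHistogram())
    print(len(histograms.data), histograms.data[0].shape)  # one histogram vector per image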