[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-22 Thread Aiden Grossman via llvm-branch-commits


@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)} triplets for {input_file} 
with {opt_level}"
+)
+except Exception as e:
+logger.warning(f"Error processing {input_file} with 
{opt_level}: {e}")
+
+return TripletResult(all_triplets, max_relation)
+
+def _run_pipeline(self, input_file: Path, opt_level: str) -> 
Tuple[Set[str], int]:
+"""Run opt | llvm-ir2vec pipeline elegantly."""
+pipeline_cmd = (
+f'"{self.opt_binary}" -{opt_level} "{input_file}" -o - | '
+f'"{self.ir2vec_binary}" --mode=triplets - -o -'
+)
+
+try:
+result = subprocess.run(
+pipeline_cmd, shell=True, capture_output=True, text=True, 
check=True
+)
+return self._parse_triplet_output(result.stdout)
+except subprocess.CalledProcessError:
+return set(), 1
+
+def _parse_triplet_output(self, output: str) -> Tuple[Set[str], int]:
+"""Parse triplet output and extract max relation"""
+if not output.strip():
+return set(), 1
+
+lines = output.strip().split("\n")
+max_relation = 1
+
+# Extract max relation from metadata line
+if lines and lines[0].startswith("MAX_RELATION="):
+max_relation = int(lines[0]

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-22 Thread Aiden Grossman via llvm-branch-commits


@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)} triplets for {input_file} 
with {opt_level}"
+)
+except Exception as e:
+logger.warning(f"Error processing {input_file} with 
{opt_level}: {e}")
+
+return TripletResult(all_triplets, max_relation)
+
+def _run_pipeline(self, input_file: Path, opt_level: str) -> 
Tuple[Set[str], int]:
+"""Run opt | llvm-ir2vec pipeline elegantly."""
+pipeline_cmd = (
+f'"{self.opt_binary}" -{opt_level} "{input_file}" -o - | '
+f'"{self.ir2vec_binary}" --mode=triplets - -o -'
+)
+
+try:
+result = subprocess.run(
+pipeline_cmd, shell=True, capture_output=True, text=True, 
check=True

boomanaiden154 wrote:

Try to avoid `shell=True` if possible. There are security concerns (although 
they probably do not matter here), and it adds an implicit dependency on the 
shell, which might differ between systems.

https://github.com/llvm/llvm-project/pull/149215
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-22 Thread Aiden Grossman via llvm-branch-commits


@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)

boomanaiden154 wrote:

This `mkdir` call should probably live somewhere outside of `_validate_setup`. 
It is slightly odd to have it in here, although I can see the motivation (in a 
way, it validates that the output path exists).

https://github.com/llvm/llvm-project/pull/149215
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-22 Thread Aiden Grossman via llvm-branch-commits


@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)} triplets for {input_file} 
with {opt_level}"
+)
+except Exception as e:
+logger.warning(f"Error processing {input_file} with 
{opt_level}: {e}")
+
+return TripletResult(all_triplets, max_relation)
+
+def _run_pipeline(self, input_file: Path, opt_level: str) -> 
Tuple[Set[str], int]:
+"""Run opt | llvm-ir2vec pipeline elegantly."""
+pipeline_cmd = (
+f'"{self.opt_binary}" -{opt_level} "{input_file}" -o - | '
+f'"{self.ir2vec_binary}" --mode=triplets - -o -'
+)
+
+try:
+result = subprocess.run(
+pipeline_cmd, shell=True, capture_output=True, text=True, 
check=True
+)
+return self._parse_triplet_output(result.stdout)
+except subprocess.CalledProcessError:
+return set(), 1
+
+def _parse_triplet_output(self, output: str) -> Tuple[Set[str], int]:
+"""Parse triplet output and extract max relation"""
+if not output.strip():
+return set(), 1
+
+lines = output.strip().split("\n")
+max_relation = 1
+
+# Extract max relation from metadata line
+if lines and lines[0].startswith("MAX_RELATION="):
+max_relation = int(lines[0]

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-22 Thread Aiden Grossman via llvm-branch-commits


@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)} triplets for {input_file} 
with {opt_level}"
+)
+except Exception as e:

boomanaiden154 wrote:

This code probably shouldn't be in a try/except block at all, given that you're 
already catching `CalledProcessError` inside `_run_pipeline`.

https://github.com/llvm/llvm-project/pull/149215
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From c4122999df1f8a2a4b4203fdad206a17d787c3d0 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +0000
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0000000000000..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From d19f53d35c186d98c11cf093445254a41853bcae Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +0000
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0000000000000..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From d19f53d35c186d98c11cf093445254a41853bcae Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +0000
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0000000000000..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From e088eb8b169eb292c17ebe33b0d2106f628dce6d Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +0000
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0000000000000..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From e088eb8b169eb292c17ebe33b0d2106f628dce6d Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0000000000000..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From b2e9297fbc0bc9452efbbd66e04ecb12a3c578c1 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0000000000000..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From 0007c062b403d12347b54e28494c5037a0d21cfd Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0000000000000..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From 0007c062b403d12347b54e28494c5037a0d21cfd Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0000000000000..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)