================ @@ -0,0 +1,291 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""IR2Vec Triplet Generator + +Generates IR2Vec triplets by applying random optimization levels to LLVM IR files +and extracting triplets using llvm-ir2vec. Automatically generates preprocessed +files: entity2id.txt, relation2id.txt, and train2id.txt. + +Usage: + python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir> +""" + +import argparse +import logging +import os +import random +import subprocess +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import List, Set, Tuple + +# Configuration +OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"] +DEFAULT_MAX_WORKERS = 100 + +logger = logging.getLogger(__name__) + + +class TripletResult: + """Result from processing a single LLVM IR file""" + + __slots__ = ["triplets", "max_relation"] + + def __init__(self, triplets: Set[str], max_relation: int): + self.triplets = triplets + self.max_relation = max_relation + + +class IR2VecTripletGenerator: + """Main class for generating IR2Vec triplets""" + + def __init__( + self, + llvm_build_dir: Path, + num_optimizations: int, + output_dir: Path, + max_workers: int = DEFAULT_MAX_WORKERS, + ): + self.llvm_build_dir = llvm_build_dir + self.num_optimizations = num_optimizations + self.output_dir = output_dir + self.max_workers = max_workers + + # Tool paths + self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt") + self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec") + + self._validate_setup() + + def _validate_setup(self): + """Validate that all required tools and paths exist""" + if not self.llvm_build_dir.exists(): + raise FileNotFoundError( + f"LLVM build directory not found: {self.llvm_build_dir}" + ) + + if not 
os.path.isfile(self.opt_binary) or not os.access( + self.opt_binary, os.X_OK + ): + raise FileNotFoundError( + f"opt binary not found or not executable: {self.opt_binary}" + ) + + if not os.path.isfile(self.ir2vec_binary) or not os.access( + self.ir2vec_binary, os.X_OK + ): + raise FileNotFoundError( + f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}" + ) + + if not (1 <= self.num_optimizations <= len(OPT_LEVELS)): + raise ValueError( + f"Number of optimizations must be between 1-{len(OPT_LEVELS)}" + ) + + self.output_dir.mkdir(parents=True, exist_ok=True) + + def _select_optimization_levels(self) -> List[str]: + """Select unique random optimization levels""" + return random.sample(OPT_LEVELS, self.num_optimizations) + + def _process_single_file(self, input_file: Path) -> TripletResult: + """Process a single LLVM IR file with multiple optimization levels""" + all_triplets = set() + max_relation = 1 + opt_levels = self._select_optimization_levels() + + for opt_level in opt_levels: + try: + triplets, file_max_relation = self._run_pipeline(input_file, opt_level) + if triplets: + all_triplets.update(triplets) + max_relation = max(max_relation, file_max_relation) + logger.debug( + f"Generated {len(triplets)} triplets for {input_file} with {opt_level}" + ) + except Exception as e: ---------------- boomanaiden154 wrote:
This code probably shouldn't be in a `try`/`except` block at all, given that you're already catching `CalledProcessError` inside `_run_pipeline`. https://github.com/llvm/llvm-project/pull/149215 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits