[llvm-branch-commits] [llvm] [NFC][IR2Vec] Minor refactoring of opcode access in vocabulary (PR #147585)

2025-07-09 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy ready_for_review 
https://github.com/llvm/llvm-project/pull/147585
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC][IR2Vec] Minor refactoring of opcode access in vocabulary (PR #147585)

2025-07-09 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy edited 
https://github.com/llvm/llvm-project/pull/147585
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC][IR2Vec] Exposing helpers in IR2Vec Vocabulary (PR #147841)

2025-07-09 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy edited 
https://github.com/llvm/llvm-project/pull/147841
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC][IR2Vec] Exposing helpers in IR2Vec Vocabulary (PR #147841)

2025-07-09 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy ready_for_review 
https://github.com/llvm/llvm-project/pull/147841
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add llvm-ir2vec tool for generating triplet embeddings (PR #147842)

2025-07-09 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy edited 
https://github.com/llvm/llvm-project/pull/147842
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC][IR2Vec] Minor refactoring of opcode access in vocabulary (PR #147585)

2025-07-10 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -447,21 +453,18 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
   // Handle Opcodes
   std::vector<Embedding> NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes,
  Embedding(Dim, 0));
-#define HANDLE_INST(NUM, OPCODE, CLASS)                                        \
-  {                                                                            \
-    auto It = OpcVocab.find(#OPCODE);                                          \
-    if (It != OpcVocab.end())                                                  \
-      NumericOpcodeEmbeddings[NUM - 1] = It->second;                           \
-    else                                                                       \
-      handleMissingEntity(#OPCODE);                                            \
+  for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) {

svkeerthy wrote:

Without the `u` suffix, the literal would be treated as an `int`, which results in a compilation error.

https://github.com/llvm/llvm-project/pull/147585
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-11 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From 6fd2dcac888426bc68570a6352e3f9c7b5fb358c Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 +
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 182 --
 4 files changed, 252 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 

[llvm-branch-commits] [llvm] [IR2Vec] Add llvm-ir2vec tool for generating triplet embeddings (PR #147842)

2025-07-11 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147842

>From 5f1f3fe9f7e07bb44802d3026fdf8ac3abf89ba2 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:39:39 +
Subject: [PATCH] IR2Vec Tool

---
 llvm/test/CMakeLists.txt|   1 +
 llvm/test/lit.cfg.py|   1 +
 llvm/test/tools/llvm-ir2vec/triplets.ll |  38 ++
 llvm/tools/llvm-ir2vec/CMakeLists.txt   |  10 ++
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp  | 150 
 5 files changed, 200 insertions(+)
 create mode 100644 llvm/test/tools/llvm-ir2vec/triplets.ll
 create mode 100644 llvm/tools/llvm-ir2vec/CMakeLists.txt
 create mode 100644 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp

diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 2a6135da9a61e..3426b6ff8d24d 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -97,6 +97,7 @@ set(LLVM_TEST_DEPENDS
   llvm-exegesis
   llvm-extract
   llvm-gsymutil
+  llvm-ir2vec
   llvm-isel-fuzzer
   llvm-ifs
   llvm-install-name-tool
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index bd6e37c848d8c..771d9245368b1 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -197,6 +197,7 @@ def get_asan_rtlib():
 "llvm-dlltool",
 "llvm-exegesis",
 "llvm-extract",
+"llvm-ir2vec",
 "llvm-isel-fuzzer",
 "llvm-ifs",
 "llvm-install-name-tool",
diff --git a/llvm/test/tools/llvm-ir2vec/triplets.ll 
b/llvm/test/tools/llvm-ir2vec/triplets.ll
new file mode 100644
index 0..fa5aaa895406f
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/triplets.ll
@@ -0,0 +1,38 @@
+; RUN: llvm-ir2vec %s | FileCheck %s -check-prefix=TRIPLETS
+
+define i32 @simple_add(i32 %a, i32 %b) {
+entry:
+  %add = add i32 %a, %b
+  ret i32 %add
+}
+
+define i32 @simple_mul(i32 %x, i32 %y) {
+entry:
+  %mul = mul i32 %x, %y
+  ret i32 %mul
+}
+
+define i32 @test_function(i32 %arg1, i32 %arg2) {
+entry:
+  %local1 = alloca i32, align 4
+  %local2 = alloca i32, align 4
+  store i32 %arg1, ptr %local1, align 4
+  store i32 %arg2, ptr %local2, align 4
+  %load1 = load i32, ptr %local1, align 4
+  %load2 = load i32, ptr %local2, align 4
+  %result = add i32 %load1, %load2
+  ret i32 %result
+}
+
+; TRIPLETS: Add IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
+; TRIPLETS-NEXT: Mul IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
+; TRIPLETS-NEXT: Alloca PointerTy Constant
+; TRIPLETS-NEXT: Alloca PointerTy Constant
+; TRIPLETS-NEXT: Store VoidTy Variable Pointer
+; TRIPLETS-NEXT: Store VoidTy Variable Pointer
+; TRIPLETS-NEXT: Load IntegerTy Pointer
+; TRIPLETS-NEXT: Load IntegerTy Pointer
+; TRIPLETS-NEXT: Add IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt 
b/llvm/tools/llvm-ir2vec/CMakeLists.txt
new file mode 100644
index 0..a4cf9690e86b5
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS
+  Analysis
+  Core
+  IRReader
+  Support
+  )
+
+add_llvm_tool(llvm-ir2vec
+  llvm-ir2vec.cpp
+  )
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
new file mode 100644
index 0..35e1c995fa4cc
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -0,0 +1,150 @@
+//===- llvm-ir2vec.cpp - IR2Vec Embedding Generation Tool 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+///
+/// \file
+/// This file implements the IR2Vec embedding generation tool.
+///
+/// Currently supports triplet generation for vocabulary training.
+/// Future updates will support embedding generation using trained vocabulary.
+///
+/// Usage: llvm-ir2vec input.bc -o triplets.txt
+///
+/// TODO: Add embedding generation mode with vocabulary support
+///
+//===--===//
+
+#include "llvm/Analysis/IR2Vec.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace ir2vec;
+
+#define DEBUG_TYPE "ir2vec"
+
+static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
+
+static cl::opt InputFilename(cl::Positional,
+  cl::desc("

[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-11 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From bf757c03868bf5e85966440408e41f5343727384 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 +
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 185 --
 3 files changed, 248 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..d5eed749193ac
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4 [ 
97.00  98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %0 = load i32, ptr %a.addr, align 4 [ 
94.00  95.00  96.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %1 = load i32, ptr %a.addr, align 4 [ 
94.00  95.00  96.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %mul = mul nsw i32 %0, %1 [ 49.00  50.00  
51.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %conv = sitofp i32 %mul to float [ 130.00  
131.00  132.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %2 = load float, ptr %b.addr, align 4 [ 
94.00  95.00  96.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %add = fadd float %conv, %2 [ 40.00  41.00 
 42.00 ]
+; CHECK-INST-LEVEL-ABC-R

[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-11 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From bf757c03868bf5e85966440408e41f5343727384 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 +
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 185 --
 3 files changed, 248 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..d5eed749193ac
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%S/../../Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
 %s | FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4 [ 
97.00  98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %0 = load i32, ptr %a.addr, align 4 [ 
94.00  95.00  96.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %1 = load i32, ptr %a.addr, align 4 [ 
94.00  95.00  96.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %mul = mul nsw i32 %0, %1 [ 49.00  50.00  
51.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %conv = sitofp i32 %mul to float [ 130.00  
131.00  132.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %2 = load float, ptr %b.addr, align 4 [ 
94.00  95.00  96.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %add = fadd float %conv, %2 [ 40.00  41.00 
 42.00 ]
+; CHECK-INST-LEVEL-ABC-R

[llvm-branch-commits] [llvm] [IR2Vec] Overloading `operator+` for `Embeddings` (PR #145118)

2025-06-25 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -106,6 +106,7 @@ struct Embedding {
   const std::vector<double> &getData() const { return Data; }
 
   /// Arithmetic operators
+  Embedding operator+(const Embedding &RHS) const;

svkeerthy wrote:

Sure. Will add them too! 

https://github.com/llvm/llvm-project/pull/145118
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add out-of-place arithmetic operators to Embedding class (PR #145118)

2025-07-01 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/145118

>From 10019cae162bb53e147797b655da75aac33b0a20 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 20 Jun 2025 23:00:40 +
Subject: [PATCH] Overloading operator+ for Embeddings

---
 llvm/include/llvm/Analysis/IR2Vec.h|  9 --
 llvm/lib/Analysis/IR2Vec.cpp   | 19 -
 llvm/unittests/Analysis/IR2VecTest.cpp | 39 ++
 3 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 040cb84ff27a1..ef8f630d7feb1 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -107,9 +107,12 @@ struct Embedding {
   const std::vector<double> &getData() const { return Data; }
 
   /// Arithmetic operators
-  Embedding &operator+=(const Embedding &RHS);
-  Embedding &operator-=(const Embedding &RHS);
-  Embedding &operator*=(double Factor);
+  LLVM_ABI Embedding &operator+=(const Embedding &RHS);
+  LLVM_ABI Embedding operator+(const Embedding &RHS) const;
+  LLVM_ABI Embedding &operator-=(const Embedding &RHS);
+  LLVM_ABI Embedding operator-(const Embedding &RHS) const;
+  LLVM_ABI Embedding &operator*=(double Factor);
+  LLVM_ABI Embedding operator*(double Factor) const;
 
   /// Adds Src Embedding scaled by Factor with the called Embedding.
   /// Called_Embedding += Src * Factor
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 895b3de58a54e..bf456102bb618 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -70,7 +70,6 @@ inline bool fromJSON(const llvm::json::Value &E, Embedding 
&Out,
 // 
==--===//
 // Embedding
 
//===--===//
-
 Embedding &Embedding::operator+=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
@@ -78,6 +77,12 @@ Embedding &Embedding::operator+=(const Embedding &RHS) {
   return *this;
 }
 
+Embedding Embedding::operator+(const Embedding &RHS) const {
+  Embedding Result(*this);
+  Result += RHS;
+  return Result;
+}
+
 Embedding &Embedding::operator-=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
@@ -85,12 +90,24 @@ Embedding &Embedding::operator-=(const Embedding &RHS) {
   return *this;
 }
 
+Embedding Embedding::operator-(const Embedding &RHS) const {
+  Embedding Result(*this);
+  Result -= RHS;
+  return Result;
+}
+
 Embedding &Embedding::operator*=(double Factor) {
   std::transform(this->begin(), this->end(), this->begin(),
  [Factor](double Elem) { return Elem * Factor; });
   return *this;
 }
 
+Embedding Embedding::operator*(double Factor) const {
+  Embedding Result(*this);
+  Result *= Factor;
+  return Result;
+}
+
 Embedding &Embedding::scaleAndAdd(const Embedding &Src, float Factor) {
   assert(this->size() == Src.size() && "Vectors must have the same dimension");
   for (size_t Itr = 0; Itr < this->size(); ++Itr)
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index 3c97c20ae72d5..70d4808dc6d54 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -109,6 +109,18 @@ TEST(EmbeddingTest, ConstructorsAndAccessors) {
   }
 }
 
+TEST(EmbeddingTest, AddVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
+
+  Embedding E3 = E1 + E2;
+  EXPECT_THAT(E3, ElementsAre(1.5, 3.5, 2.0));
+
+  // Check that E1 and E2 are unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
+}
+
 TEST(EmbeddingTest, AddVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
@@ -120,6 +132,18 @@ TEST(EmbeddingTest, AddVectors) {
   EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
 }
 
+TEST(EmbeddingTest, SubtractVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
+
+  Embedding E3 = E1 - E2;
+  EXPECT_THAT(E3, ElementsAre(0.5, 0.5, 4.0));
+
+  // Check that E1 and E2 are unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
+}
+
 TEST(EmbeddingTest, SubtractVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
@@ -137,6 +161,15 @@ TEST(EmbeddingTest, ScaleVector) {
   EXPECT_THAT(E1, ElementsAre(0.5, 1.0, 1.5));
 }
 
+TEST(EmbeddingTest, ScaleVectorOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = E1 * 0.5f;
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.0, 1.5));
+
+  // Check that E1 is unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+}
+
 TEST(EmbeddingTest, AddS

[llvm-branch-commits] [llvm] [IR2Vec] Overloading `operator+` for `Embeddings` (PR #145118)

2025-07-01 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/145118

>From f1976fa2454846d80822761f7a095b29c2062652 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 20 Jun 2025 23:00:40 +
Subject: [PATCH] Overloading operator+ for Embeddings

---
 llvm/include/llvm/Analysis/IR2Vec.h|  9 --
 llvm/lib/Analysis/IR2Vec.cpp   | 23 +++
 llvm/unittests/Analysis/IR2VecTest.cpp | 39 ++
 3 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 040cb84ff27a1..d63be227b1849 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -107,9 +107,12 @@ struct Embedding {
   const std::vector<double> &getData() const { return Data; }
 
   /// Arithmetic operators
-  Embedding &operator+=(const Embedding &RHS);
-  Embedding &operator-=(const Embedding &RHS);
-  Embedding &operator*=(double Factor);
+  LLVM_ABI Embedding operator+(const Embedding &RHS) const;
+  LLVM_ABI Embedding &operator+=(const Embedding &RHS);
+  LLVM_ABI Embedding operator-(const Embedding &RHS) const;
+  LLVM_ABI Embedding &operator-=(const Embedding &RHS);
+  LLVM_ABI Embedding operator*(double Factor) const;
+  LLVM_ABI Embedding &operator*=(double Factor);
 
   /// Adds Src Embedding scaled by Factor with the called Embedding.
   /// Called_Embedding += Src * Factor
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 895b3de58a54e..e499ebdd5ed3c 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -71,6 +71,14 @@ inline bool fromJSON(const llvm::json::Value &E, Embedding 
&Out,
 // Embedding
 
//===--===//
 
+Embedding Embedding::operator+(const Embedding &RHS) const {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  Embedding Result(*this);
+  std::transform(this->begin(), this->end(), RHS.begin(), Result.begin(),
+                 std::plus<double>());
+  return Result;
+}
+
 Embedding &Embedding::operator+=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
@@ -78,6 +86,14 @@ Embedding &Embedding::operator+=(const Embedding &RHS) {
   return *this;
 }
 
+Embedding Embedding::operator-(const Embedding &RHS) const {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  Embedding Result(*this);
+  std::transform(this->begin(), this->end(), RHS.begin(), Result.begin(),
+                 std::minus<double>());
+  return Result;
+}
+
 Embedding &Embedding::operator-=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
@@ -85,6 +101,13 @@ Embedding &Embedding::operator-=(const Embedding &RHS) {
   return *this;
 }
 
+Embedding Embedding::operator*(double Factor) const {
+  Embedding Result(*this);
+  std::transform(this->begin(), this->end(), Result.begin(),
+ [Factor](double Elem) { return Elem * Factor; });
+  return Result;
+}
+
 Embedding &Embedding::operator*=(double Factor) {
   std::transform(this->begin(), this->end(), this->begin(),
  [Factor](double Elem) { return Elem * Factor; });
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index 3c97c20ae72d5..70d4808dc6d54 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -109,6 +109,18 @@ TEST(EmbeddingTest, ConstructorsAndAccessors) {
   }
 }
 
+TEST(EmbeddingTest, AddVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
+
+  Embedding E3 = E1 + E2;
+  EXPECT_THAT(E3, ElementsAre(1.5, 3.5, 2.0));
+
+  // Check that E1 and E2 are unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
+}
+
 TEST(EmbeddingTest, AddVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
@@ -120,6 +132,18 @@ TEST(EmbeddingTest, AddVectors) {
   EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
 }
 
+TEST(EmbeddingTest, SubtractVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
+
+  Embedding E3 = E1 - E2;
+  EXPECT_THAT(E3, ElementsAre(0.5, 0.5, 4.0));
+
+  // Check that E1 and E2 are unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
+}
+
 TEST(EmbeddingTest, SubtractVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
@@ -137,6 +161,15 @@ TEST(EmbeddingTest, ScaleVector) {
   EXPECT_THAT(E1, ElementsAre(0.5, 1.0, 1.5));
 }
 
+TEST(EmbeddingTest, ScaleVectorOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = E1 * 0.5f;
+  EXPECT_THAT(E2,

[llvm-branch-commits] [llvm] [IR2Vec] Add out-of-place arithmetic operators to Embedding class (PR #145118)

2025-07-01 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy edited 
https://github.com/llvm/llvm-project/pull/145118
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Overloading `operator+` for `Embeddings` (PR #145118)

2025-07-01 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/145118

>From f1976fa2454846d80822761f7a095b29c2062652 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 20 Jun 2025 23:00:40 +0000
Subject: [PATCH] Overloading operator+ for Embeddngs

---
 llvm/include/llvm/Analysis/IR2Vec.h|  9 --
 llvm/lib/Analysis/IR2Vec.cpp   | 23 +++
 llvm/unittests/Analysis/IR2VecTest.cpp | 39 ++
 3 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 040cb84ff27a1..d63be227b1849 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -107,9 +107,12 @@ struct Embedding {
   const std::vector &getData() const { return Data; }
 
   /// Arithmetic operators
-  Embedding &operator+=(const Embedding &RHS);
-  Embedding &operator-=(const Embedding &RHS);
-  Embedding &operator*=(double Factor);
+  LLVM_ABI Embedding operator+(const Embedding &RHS) const;
+  LLVM_ABI Embedding &operator+=(const Embedding &RHS);
+  LLVM_ABI Embedding operator-(const Embedding &RHS) const;
+  LLVM_ABI Embedding &operator-=(const Embedding &RHS);
+  LLVM_ABI Embedding operator*(double Factor) const;
+  LLVM_ABI Embedding &operator*=(double Factor);
 
   /// Adds Src Embedding scaled by Factor with the called Embedding.
   /// Called_Embedding += Src * Factor
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 895b3de58a54e..e499ebdd5ed3c 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -71,6 +71,14 @@ inline bool fromJSON(const llvm::json::Value &E, Embedding 
&Out,
 // Embedding
 
//===--===//
 
+Embedding Embedding::operator+(const Embedding &RHS) const {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  Embedding Result(*this);
+  std::transform(this->begin(), this->end(), RHS.begin(), Result.begin(),
+ std::plus());
+  return Result;
+}
+
 Embedding &Embedding::operator+=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
@@ -78,6 +86,14 @@ Embedding &Embedding::operator+=(const Embedding &RHS) {
   return *this;
 }
 
+Embedding Embedding::operator-(const Embedding &RHS) const {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  Embedding Result(*this);
+  std::transform(this->begin(), this->end(), RHS.begin(), Result.begin(),
+ std::minus());
+  return Result;
+}
+
 Embedding &Embedding::operator-=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
@@ -85,6 +101,13 @@ Embedding &Embedding::operator-=(const Embedding &RHS) {
   return *this;
 }
 
+Embedding Embedding::operator*(double Factor) const {
+  Embedding Result(*this);
+  std::transform(this->begin(), this->end(), Result.begin(),
+ [Factor](double Elem) { return Elem * Factor; });
+  return Result;
+}
+
 Embedding &Embedding::operator*=(double Factor) {
   std::transform(this->begin(), this->end(), this->begin(),
  [Factor](double Elem) { return Elem * Factor; });
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index 3c97c20ae72d5..70d4808dc6d54 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -109,6 +109,18 @@ TEST(EmbeddingTest, ConstructorsAndAccessors) {
   }
 }
 
+TEST(EmbeddingTest, AddVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
+
+  Embedding E3 = E1 + E2;
+  EXPECT_THAT(E3, ElementsAre(1.5, 3.5, 2.0));
+
+  // Check that E1 and E2 are unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
+}
+
 TEST(EmbeddingTest, AddVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
@@ -120,6 +132,18 @@ TEST(EmbeddingTest, AddVectors) {
   EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
 }
 
+TEST(EmbeddingTest, SubtractVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
+
+  Embedding E3 = E1 - E2;
+  EXPECT_THAT(E3, ElementsAre(0.5, 0.5, 4.0));
+
+  // Check that E1 and E2 are unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
+}
+
 TEST(EmbeddingTest, SubtractVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
@@ -137,6 +161,15 @@ TEST(EmbeddingTest, ScaleVector) {
   EXPECT_THAT(E1, ElementsAre(0.5, 1.0, 1.5));
 }
 
+TEST(EmbeddingTest, ScaleVectorOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = E1 * 0.5f;
+  EXPECT_THAT(E2,

[llvm-branch-commits] [llvm] [IR2Vec] Overloading `operator+` for `Embeddings` (PR #145118)

2025-07-01 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy edited 
https://github.com/llvm/llvm-project/pull/145118
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add out-of-place arithmetic operators to Embedding class (PR #145118)

2025-07-01 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -71,20 +71,43 @@ inline bool fromJSON(const llvm::json::Value &E, Embedding 
&Out,
 // Embedding
 
//===--===//
 
+Embedding Embedding::operator+(const Embedding &RHS) const {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  Embedding Result(*this);
+  std::transform(this->begin(), this->end(), RHS.begin(), Result.begin(),
+ std::plus());
+  return Result;
+}
+
 Embedding &Embedding::operator+=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");

svkeerthy wrote:

Implemented + in terms of += as it would avoid copies in +=.

https://github.com/llvm/llvm-project/pull/145118
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add out-of-place arithmetic operators to Embedding class (PR #145118)

2025-07-01 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/145118

>From 10019cae162bb53e147797b655da75aac33b0a20 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 20 Jun 2025 23:00:40 +0000
Subject: [PATCH] Overloading operator+ for Embeddngs

---
 llvm/include/llvm/Analysis/IR2Vec.h|  9 --
 llvm/lib/Analysis/IR2Vec.cpp   | 19 -
 llvm/unittests/Analysis/IR2VecTest.cpp | 39 ++
 3 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 040cb84ff27a1..ef8f630d7feb1 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -107,9 +107,12 @@ struct Embedding {
   const std::vector &getData() const { return Data; }
 
   /// Arithmetic operators
-  Embedding &operator+=(const Embedding &RHS);
-  Embedding &operator-=(const Embedding &RHS);
-  Embedding &operator*=(double Factor);
+  LLVM_ABI Embedding &operator+=(const Embedding &RHS);
+  LLVM_ABI Embedding operator+(const Embedding &RHS) const;
+  LLVM_ABI Embedding &operator-=(const Embedding &RHS);
+  LLVM_ABI Embedding operator-(const Embedding &RHS) const;
+  LLVM_ABI Embedding &operator*=(double Factor);
+  LLVM_ABI Embedding operator*(double Factor) const;
 
   /// Adds Src Embedding scaled by Factor with the called Embedding.
   /// Called_Embedding += Src * Factor
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 895b3de58a54e..bf456102bb618 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -70,7 +70,6 @@ inline bool fromJSON(const llvm::json::Value &E, Embedding 
&Out,
 // 
==--===//
 // Embedding
 
//===--===//
-
 Embedding &Embedding::operator+=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
@@ -78,6 +77,12 @@ Embedding &Embedding::operator+=(const Embedding &RHS) {
   return *this;
 }
 
+Embedding Embedding::operator+(const Embedding &RHS) const {
+  Embedding Result(*this);
+  Result += RHS;
+  return Result;
+}
+
 Embedding &Embedding::operator-=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
@@ -85,12 +90,24 @@ Embedding &Embedding::operator-=(const Embedding &RHS) {
   return *this;
 }
 
+Embedding Embedding::operator-(const Embedding &RHS) const {
+  Embedding Result(*this);
+  Result -= RHS;
+  return Result;
+}
+
 Embedding &Embedding::operator*=(double Factor) {
   std::transform(this->begin(), this->end(), this->begin(),
  [Factor](double Elem) { return Elem * Factor; });
   return *this;
 }
 
+Embedding Embedding::operator*(double Factor) const {
+  Embedding Result(*this);
+  Result *= Factor;
+  return Result;
+}
+
 Embedding &Embedding::scaleAndAdd(const Embedding &Src, float Factor) {
   assert(this->size() == Src.size() && "Vectors must have the same dimension");
   for (size_t Itr = 0; Itr < this->size(); ++Itr)
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index 3c97c20ae72d5..70d4808dc6d54 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -109,6 +109,18 @@ TEST(EmbeddingTest, ConstructorsAndAccessors) {
   }
 }
 
+TEST(EmbeddingTest, AddVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
+
+  Embedding E3 = E1 + E2;
+  EXPECT_THAT(E3, ElementsAre(1.5, 3.5, 2.0));
+
+  // Check that E1 and E2 are unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
+}
+
 TEST(EmbeddingTest, AddVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
@@ -120,6 +132,18 @@ TEST(EmbeddingTest, AddVectors) {
   EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
 }
 
+TEST(EmbeddingTest, SubtractVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
+
+  Embedding E3 = E1 - E2;
+  EXPECT_THAT(E3, ElementsAre(0.5, 0.5, 4.0));
+
+  // Check that E1 and E2 are unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
+}
+
 TEST(EmbeddingTest, SubtractVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
@@ -137,6 +161,15 @@ TEST(EmbeddingTest, ScaleVector) {
   EXPECT_THAT(E1, ElementsAre(0.5, 1.0, 1.5));
 }
 
+TEST(EmbeddingTest, ScaleVectorOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = E1 * 0.5f;
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.0, 1.5));
+
+  // Check that E1 is unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+}
+
 TEST(EmbeddingTest, AddS

[llvm-branch-commits] [llvm] [IR2Vec] Add out-of-place arithmetic operators to Embedding class (PR #145118)

2025-07-01 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy edited 
https://github.com/llvm/llvm-project/pull/145118
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy created 
https://github.com/llvm/llvm-project/pull/148719

None

>From 5490291f7f7670b1d326603cb0bc1047337a2fcf Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Mon, 14 Jul 2025 20:27:42 +0000
Subject: [PATCH] [IR2Vec] Adding documentation for llvm-ir2vec tool

---
 llvm/docs/CommandGuide/index.rst   |   1 +
 llvm/docs/CommandGuide/llvm-ir2vec.rst | 170 +
 llvm/docs/MLGO.rst |  12 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp |   8 +-
 4 files changed, 184 insertions(+), 7 deletions(-)
 create mode 100644 llvm/docs/CommandGuide/llvm-ir2vec.rst

diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst
index 88fc1fd326b76..f85f32a1fdd51 100644
--- a/llvm/docs/CommandGuide/index.rst
+++ b/llvm/docs/CommandGuide/index.rst
@@ -27,6 +27,7 @@ Basic Commands
llvm-dis
llvm-dwarfdump
llvm-dwarfutil
+   llvm-ir2vec
llvm-lib
llvm-libtool-darwin
llvm-link
diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
new file mode 100644
index 0..13fe4996b968f
--- /dev/null
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -0,0 +1,170 @@
+llvm-ir2vec - IR2Vec Embedding Generation Tool
+==============================================
+
+.. program:: llvm-ir2vec
+
+SYNOPSIS
+--------
+
+:program:`llvm-ir2vec` [*options*] *input-file*
+
+DESCRIPTION
+-----------
+
+:program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
+generates IR2Vec embeddings for LLVM IR and supports triplet generation 
+for vocabulary training. It provides two main operation modes:
+
+1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+   training from LLVM IR.
+
+2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+   at different granularity levels (instruction, basic block, or function).
+
+The tool is designed to facilitate machine learning applications that work with
+LLVM IR by converting the IR into numerical representations that can be used by
+ML models.
+
+.. note::
+
+   For information about using IR2Vec programmatically within LLVM passes and 
+   the C++ API, see the `IR2Vec Embeddings <https://llvm.org/docs/MLGO.html#ir2vec-embeddings>`_ 
+   section in the MLGO documentation.
+
+OPERATION MODES
+---------------
+
+Triplet Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
+consisting of opcodes, types, and operands. These triplets can be used to train
+vocabularies for embedding generation.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+
+Embedding Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In embedding mode, :program:`llvm-ir2vec` uses a pre-trained vocabulary to
+generate numerical embeddings for LLVM IR at different levels of granularity.
+
+Example Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json --level=func input.bc -o embeddings.txt
+
+OPTIONS
+-------
+
+.. option:: --mode=<mode>
+
+ Specify the operation mode. Valid values are:
+
+ * ``triplets`` - Generate triplets for vocabulary training
+ * ``embeddings`` - Generate embeddings using trained vocabulary (default)
+
+.. option:: --level=<level>
+
+ Specify the embedding generation level. Valid values are:
+
+ * ``inst`` - Generate instruction-level embeddings
+ * ``bb`` - Generate basic block-level embeddings  
+ * ``func`` - Generate function-level embeddings (default)
+
+.. option:: --function=<name>
+
+ Process only the specified function instead of all functions in the module.
+
+.. option:: --ir2vec-vocab-path=<path>
+
+ Specify the path to the vocabulary file (required for embedding mode).
+ The vocabulary file should be in JSON format and contain the trained
+ vocabulary for embedding generation. See `llvm/lib/Analysis/models`
+ for pre-trained vocabulary files.
+
+.. option:: --ir2vec-opc-weight=<weight>
+
+ Specify the weight for opcode embeddings (default: 1.0). This controls
+ the relative importance of instruction opcodes in the final embedding.
+
+.. option:: --ir2vec-type-weight=<weight>
+
+ Specify the weight for type embeddings (default: 0.5). This controls
+ the relative importance of type information in the final embedding.
+
+.. option:: --ir2vec-arg-weight=<weight>
+
+ Specify the weight for argument embeddings (default: 0.2). This controls
+ the relative importance of operand information in the final embedding.
+
+.. option:: -o <filename>
+
+ Specify the output filename. Use ``-`` to write to standard output (default).
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. note::
+
+   ``--level``, ``--function``, ``--ir2vec-vocab-path``, ``--ir2vec-opc-weight``, 
+   ``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in embedding 
+   mode. These options are ignored in triplet mode.
+
+INPUT FILE FORMAT
+-----------------
+
+:program:`llvm-ir2vec` accepts LLVM bitcode files

[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

svkeerthy wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/148719?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#148719** https://app.graphite.dev/github/pr/llvm/llvm-project/148719?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/148719?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#147844** https://app.graphite.dev/github/pr/llvm/llvm-project/147844?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#147842** https://app.graphite.dev/github/pr/llvm/llvm-project/147842?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#147841** https://app.graphite.dev/github/pr/llvm/llvm-project/147841?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#147585** https://app.graphite.dev/github/pr/llvm/llvm-project/147585?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#145119** https://app.graphite.dev/github/pr/llvm/llvm-project/145119?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#145118** https://app.graphite.dev/github/pr/llvm/llvm-project/145118?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#145117** https://app.graphite.dev/github/pr/llvm/llvm-project/145117?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#143999** https://app.graphite.dev/github/pr/llvm/llvm-project/143999?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#143986** https://app.graphite.dev/github/pr/llvm/llvm-project/143986?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#143479** https://app.graphite.dev/github/pr/llvm/llvm-project/143479?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>: 1 other dependent PR 
([#144139](https://github.com/llvm/llvm-project/pull/144139) https://app.graphite.dev/github/pr/llvm/llvm-project/144139?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>)
* **#143476** https://app.graphite.dev/github/pr/llvm/llvm-project/143476?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#143200** https://app.graphite.dev/github/pr/llvm/llvm-project/143200?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#143197** https://app.graphite.dev/github/pr/llvm/llvm-project/143197?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/148719
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy ready_for_review 
https://github.com/llvm/llvm-project/pull/148719
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy edited 
https://github.com/llvm/llvm-project/pull/148719
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy edited 
https://github.com/llvm/llvm-project/pull/148719
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC][IR2Vec] Minor refactoring of opcode access in vocabulary (PR #147585)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147585

>From 5eaecce25822a1e4d1aa7e1bb200f6eff7f29234 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Mon, 7 Jul 2025 21:30:29 +0000
Subject: [PATCH] [NFC][IR2Vec] Minor refactoring of opcode access in
 vocabulary

---
 llvm/include/llvm/Analysis/IR2Vec.h |  9 ---
 llvm/lib/Analysis/IR2Vec.cpp| 41 -
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index e35793617f7da..2498a211e80e5 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -162,15 +162,18 @@ class Vocabulary {
   static constexpr unsigned MaxOperandKinds =
   static_cast(OperandKind::MaxOperandKind);
 
+  /// Helper function to get vocabulary key for a given Opcode
+  static StringRef getVocabKeyForOpcode(unsigned Opcode);
+
+  /// Helper function to get vocabulary key for a given TypeID
+  static StringRef getVocabKeyForTypeID(Type::TypeID TypeID);
+
   /// Helper function to get vocabulary key for a given OperandKind
   static StringRef getVocabKeyForOperandKind(OperandKind Kind);
 
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
-  /// Helper function to get vocabulary key for a given TypeID
-  static StringRef getVocabKeyForTypeID(Type::TypeID TypeID);
-
 public:
   Vocabulary() = default;
   Vocabulary(VocabVector &&Vocab);
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index b1255c76367b2..c6e1fa32c9ffd 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -243,6 +243,17 @@ const ir2vec::Embedding &Vocabulary::operator[](const 
Value *Arg) const {
   return Vocab[MaxOpcodes + MaxTypeIDs + static_cast(ArgKind)];
 }
 
+StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) {
+  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+#define HANDLE_INST(NUM, OPCODE, CLASS)
\
+  if (Opcode == NUM) { 
\
+return #OPCODE;
\
+  }
+#include "llvm/IR/Instruction.def"
+#undef HANDLE_INST
+  return "UnknownOpcode";
+}
+
 StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) {
   switch (TypeID) {
   case Type::VoidTyID:
@@ -280,6 +291,7 @@ StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID 
TypeID) {
   default:
 return "UnknownTy";
   }
+  return "UnknownTy";
 }
 
 StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) {
@@ -316,14 +328,8 @@ StringRef Vocabulary::getStringKey(unsigned Pos) {
   assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds &&
  "Position out of bounds in vocabulary");
   // Opcode
-  if (Pos < MaxOpcodes) {
-#define HANDLE_INST(NUM, OPCODE, CLASS)
\
-  if (Pos == NUM - 1) {
\
-return #OPCODE;
\
-  }
-#include "llvm/IR/Instruction.def"
-#undef HANDLE_INST
-  }
+  if (Pos < MaxOpcodes)
+return getVocabKeyForOpcode(Pos + 1);
   // Type
   if (Pos < MaxOpcodes + MaxTypeIDs)
 return getVocabKeyForTypeID(static_cast(Pos - MaxOpcodes));
@@ -431,21 +437,18 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
   // Handle Opcodes
   std::vector NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes,
  Embedding(Dim, 0));
-#define HANDLE_INST(NUM, OPCODE, CLASS)
\
-  {
\
-auto It = OpcVocab.find(#OPCODE);  
\
-if (It != OpcVocab.end())  
\
-  NumericOpcodeEmbeddings[NUM - 1] = It->second;   
\
-else   
\
-  handleMissingEntity(#OPCODE);
\
+  for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) {
+StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode + 1);
+auto It = OpcVocab.find(VocabKey.str());
+if (It != OpcVocab.end())
+  NumericOpcodeEmbeddings[Opcode] = It->second;
+else
+  handleMissingEntity(VocabKey.str());
   }
-#include "llvm/IR/Instruction.def"
-#undef HANDLE_INST
   Vocab.insert(Vocab.end(), NumericOpcodeEmbeddings.begin(),
NumericOpcodeEmbeddings.end());
 
-  // Handle Types using direct iteration through TypeID enum
-  // We iterate through all possible TypeID values and map them to embeddings
+  // Handle Types
   std::vector NumericTypeEmbeddings(Vocabulary::MaxTypeIDs,
   

[llvm-branch-commits] [llvm] [IR2Vec] Add llvm-ir2vec tool for generating triplet embeddings (PR #147842)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147842

>From 744b38b147f101cc5ea84e71f724c21713150a3d Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:39:39 +0000
Subject: [PATCH] IR2Vec Tool

---
 llvm/test/CMakeLists.txt|   1 +
 llvm/test/lit.cfg.py|   1 +
 llvm/test/tools/llvm-ir2vec/triplets.ll |  38 ++
 llvm/tools/llvm-ir2vec/CMakeLists.txt   |  10 ++
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp  | 150 
 5 files changed, 200 insertions(+)
 create mode 100644 llvm/test/tools/llvm-ir2vec/triplets.ll
 create mode 100644 llvm/tools/llvm-ir2vec/CMakeLists.txt
 create mode 100644 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp

diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 2a6135da9a61e..3426b6ff8d24d 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -97,6 +97,7 @@ set(LLVM_TEST_DEPENDS
   llvm-exegesis
   llvm-extract
   llvm-gsymutil
+  llvm-ir2vec
   llvm-isel-fuzzer
   llvm-ifs
   llvm-install-name-tool
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index bd6e37c848d8c..771d9245368b1 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -197,6 +197,7 @@ def get_asan_rtlib():
 "llvm-dlltool",
 "llvm-exegesis",
 "llvm-extract",
+"llvm-ir2vec",
 "llvm-isel-fuzzer",
 "llvm-ifs",
 "llvm-install-name-tool",
diff --git a/llvm/test/tools/llvm-ir2vec/triplets.ll 
b/llvm/test/tools/llvm-ir2vec/triplets.ll
new file mode 100644
index 0..fa5aaa895406f
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/triplets.ll
@@ -0,0 +1,38 @@
+; RUN: llvm-ir2vec %s | FileCheck %s -check-prefix=TRIPLETS
+
+define i32 @simple_add(i32 %a, i32 %b) {
+entry:
+  %add = add i32 %a, %b
+  ret i32 %add
+}
+
+define i32 @simple_mul(i32 %x, i32 %y) {
+entry:
+  %mul = mul i32 %x, %y
+  ret i32 %mul
+}
+
+define i32 @test_function(i32 %arg1, i32 %arg2) {
+entry:
+  %local1 = alloca i32, align 4
+  %local2 = alloca i32, align 4
+  store i32 %arg1, ptr %local1, align 4
+  store i32 %arg2, ptr %local2, align 4
+  %load1 = load i32, ptr %local1, align 4
+  %load2 = load i32, ptr %local2, align 4
+  %result = add i32 %load1, %load2
+  ret i32 %result
+}
+
+; TRIPLETS: Add IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
+; TRIPLETS-NEXT: Mul IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
+; TRIPLETS-NEXT: Alloca PointerTy Constant
+; TRIPLETS-NEXT: Alloca PointerTy Constant
+; TRIPLETS-NEXT: Store VoidTy Variable Pointer
+; TRIPLETS-NEXT: Store VoidTy Variable Pointer
+; TRIPLETS-NEXT: Load IntegerTy Pointer
+; TRIPLETS-NEXT: Load IntegerTy Pointer
+; TRIPLETS-NEXT: Add IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt 
b/llvm/tools/llvm-ir2vec/CMakeLists.txt
new file mode 100644
index 0..a4cf9690e86b5
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS
+  Analysis
+  Core
+  IRReader
+  Support
+  )
+
+add_llvm_tool(llvm-ir2vec
+  llvm-ir2vec.cpp
+  )
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
new file mode 100644
index 0..35e1c995fa4cc
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -0,0 +1,150 @@
+//===- llvm-ir2vec.cpp - IR2Vec Embedding Generation Tool 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+///
+/// \file
+/// This file implements the IR2Vec embedding generation tool.
+///
+/// Currently supports triplet generation for vocabulary training.
+/// Future updates will support embedding generation using trained vocabulary.
+///
+/// Usage: llvm-ir2vec input.bc -o triplets.txt
+///
+/// TODO: Add embedding generation mode with vocabulary support
+///
+//===--===//
+
+#include "llvm/Analysis/IR2Vec.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace ir2vec;
+
+#define DEBUG_TYPE "ir2vec"
+
+static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
+
+static cl::opt<std::string> InputFilename(cl::Positional,
+  cl::desc("

[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From 4e92c2bc3b155eb9d9cf3634c1f59a1152d0ebae Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 +
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 182 --
 4 files changed, 252 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-NEXT-ABC:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 

[llvm-branch-commits] [llvm] [IR2Vec] Add llvm-ir2vec tool for generating triplet embeddings (PR #147842)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147842

>From 744b38b147f101cc5ea84e71f724c21713150a3d Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:39:39 +
Subject: [PATCH] IR2Vec Tool

---
 llvm/test/CMakeLists.txt|   1 +
 llvm/test/lit.cfg.py|   1 +
 llvm/test/tools/llvm-ir2vec/triplets.ll |  38 ++
 llvm/tools/llvm-ir2vec/CMakeLists.txt   |  10 ++
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp  | 150 
 5 files changed, 200 insertions(+)
 create mode 100644 llvm/test/tools/llvm-ir2vec/triplets.ll
 create mode 100644 llvm/tools/llvm-ir2vec/CMakeLists.txt
 create mode 100644 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp

diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 2a6135da9a61e..3426b6ff8d24d 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -97,6 +97,7 @@ set(LLVM_TEST_DEPENDS
   llvm-exegesis
   llvm-extract
   llvm-gsymutil
+  llvm-ir2vec
   llvm-isel-fuzzer
   llvm-ifs
   llvm-install-name-tool
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index bd6e37c848d8c..771d9245368b1 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -197,6 +197,7 @@ def get_asan_rtlib():
 "llvm-dlltool",
 "llvm-exegesis",
 "llvm-extract",
+"llvm-ir2vec",
 "llvm-isel-fuzzer",
 "llvm-ifs",
 "llvm-install-name-tool",
diff --git a/llvm/test/tools/llvm-ir2vec/triplets.ll 
b/llvm/test/tools/llvm-ir2vec/triplets.ll
new file mode 100644
index 0..fa5aaa895406f
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/triplets.ll
@@ -0,0 +1,38 @@
+; RUN: llvm-ir2vec %s | FileCheck %s -check-prefix=TRIPLETS
+
+define i32 @simple_add(i32 %a, i32 %b) {
+entry:
+  %add = add i32 %a, %b
+  ret i32 %add
+}
+
+define i32 @simple_mul(i32 %x, i32 %y) {
+entry:
+  %mul = mul i32 %x, %y
+  ret i32 %mul
+}
+
+define i32 @test_function(i32 %arg1, i32 %arg2) {
+entry:
+  %local1 = alloca i32, align 4
+  %local2 = alloca i32, align 4
+  store i32 %arg1, ptr %local1, align 4
+  store i32 %arg2, ptr %local2, align 4
+  %load1 = load i32, ptr %local1, align 4
+  %load2 = load i32, ptr %local2, align 4
+  %result = add i32 %load1, %load2
+  ret i32 %result
+}
+
+; TRIPLETS: Add IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
+; TRIPLETS-NEXT: Mul IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
+; TRIPLETS-NEXT: Alloca PointerTy Constant
+; TRIPLETS-NEXT: Alloca PointerTy Constant
+; TRIPLETS-NEXT: Store VoidTy Variable Pointer
+; TRIPLETS-NEXT: Store VoidTy Variable Pointer
+; TRIPLETS-NEXT: Load IntegerTy Pointer
+; TRIPLETS-NEXT: Load IntegerTy Pointer
+; TRIPLETS-NEXT: Add IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt 
b/llvm/tools/llvm-ir2vec/CMakeLists.txt
new file mode 100644
index 0..a4cf9690e86b5
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS
+  Analysis
+  Core
+  IRReader
+  Support
+  )
+
+add_llvm_tool(llvm-ir2vec
+  llvm-ir2vec.cpp
+  )
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
new file mode 100644
index 0..35e1c995fa4cc
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -0,0 +1,150 @@
+//===- llvm-ir2vec.cpp - IR2Vec Embedding Generation Tool 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+///
+/// \file
+/// This file implements the IR2Vec embedding generation tool.
+///
+/// Currently supports triplet generation for vocabulary training.
+/// Future updates will support embedding generation using trained vocabulary.
+///
+/// Usage: llvm-ir2vec input.bc -o triplets.txt
+///
+/// TODO: Add embedding generation mode with vocabulary support
+///
+//===--===//
+
+#include "llvm/Analysis/IR2Vec.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace ir2vec;
+
+#define DEBUG_TYPE "ir2vec"
+
+static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
+
+static cl::opt<std::string> InputFilename(cl::Positional,
+  cl::desc("

[llvm-branch-commits] [llvm] [NFC][IR2Vec] Minor refactoring of opcode access in vocabulary (PR #147585)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147585

>From 5eaecce25822a1e4d1aa7e1bb200f6eff7f29234 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Mon, 7 Jul 2025 21:30:29 +
Subject: [PATCH] [NFC][IR2Vec] Minor refactoring of opcode access in
 vocabulary

---
 llvm/include/llvm/Analysis/IR2Vec.h |  9 ---
 llvm/lib/Analysis/IR2Vec.cpp| 41 -
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index e35793617f7da..2498a211e80e5 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -162,15 +162,18 @@ class Vocabulary {
   static constexpr unsigned MaxOperandKinds =
   static_cast<unsigned>(OperandKind::MaxOperandKind);
 
+  /// Helper function to get vocabulary key for a given Opcode
+  static StringRef getVocabKeyForOpcode(unsigned Opcode);
+
+  /// Helper function to get vocabulary key for a given TypeID
+  static StringRef getVocabKeyForTypeID(Type::TypeID TypeID);
+
   /// Helper function to get vocabulary key for a given OperandKind
   static StringRef getVocabKeyForOperandKind(OperandKind Kind);
 
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
-  /// Helper function to get vocabulary key for a given TypeID
-  static StringRef getVocabKeyForTypeID(Type::TypeID TypeID);
-
 public:
   Vocabulary() = default;
   Vocabulary(VocabVector &&Vocab);
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index b1255c76367b2..c6e1fa32c9ffd 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -243,6 +243,17 @@ const ir2vec::Embedding &Vocabulary::operator[](const 
Value *Arg) const {
   return Vocab[MaxOpcodes + MaxTypeIDs + static_cast<unsigned>(ArgKind)];
 }
 
+StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) {
+  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+#define HANDLE_INST(NUM, OPCODE, CLASS)
\
+  if (Opcode == NUM) { 
\
+return #OPCODE;
\
+  }
+#include "llvm/IR/Instruction.def"
+#undef HANDLE_INST
+  return "UnknownOpcode";
+}
+
 StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) {
   switch (TypeID) {
   case Type::VoidTyID:
@@ -280,6 +291,7 @@ StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID 
TypeID) {
   default:
 return "UnknownTy";
   }
+  return "UnknownTy";
 }
 
 StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) {
@@ -316,14 +328,8 @@ StringRef Vocabulary::getStringKey(unsigned Pos) {
   assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds &&
  "Position out of bounds in vocabulary");
   // Opcode
-  if (Pos < MaxOpcodes) {
-#define HANDLE_INST(NUM, OPCODE, CLASS)
\
-  if (Pos == NUM - 1) {
\
-return #OPCODE;
\
-  }
-#include "llvm/IR/Instruction.def"
-#undef HANDLE_INST
-  }
+  if (Pos < MaxOpcodes)
+return getVocabKeyForOpcode(Pos + 1);
   // Type
   if (Pos < MaxOpcodes + MaxTypeIDs)
 return getVocabKeyForTypeID(static_cast<Type::TypeID>(Pos - MaxOpcodes));
@@ -431,21 +437,18 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
   // Handle Opcodes
   std::vector<Embedding> NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes,
  Embedding(Dim, 0));
-#define HANDLE_INST(NUM, OPCODE, CLASS)
\
-  {
\
-auto It = OpcVocab.find(#OPCODE);  
\
-if (It != OpcVocab.end())  
\
-  NumericOpcodeEmbeddings[NUM - 1] = It->second;   
\
-else   
\
-  handleMissingEntity(#OPCODE);
\
+  for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) {
+StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode + 1);
+auto It = OpcVocab.find(VocabKey.str());
+if (It != OpcVocab.end())
+  NumericOpcodeEmbeddings[Opcode] = It->second;
+else
+  handleMissingEntity(VocabKey.str());
   }
-#include "llvm/IR/Instruction.def"
-#undef HANDLE_INST
   Vocab.insert(Vocab.end(), NumericOpcodeEmbeddings.begin(),
NumericOpcodeEmbeddings.end());
 
-  // Handle Types using direct iteration through TypeID enum
-  // We iterate through all possible TypeID values and map them to embeddings
+  // Handle Types
   std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxTypeIDs,
   

[llvm-branch-commits] [llvm] [NFC][IR2Vec] Exposing helpers in IR2Vec Vocabulary (PR #147841)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147841

>From 1803736667b49978152af659a2d88a09824adf84 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:38:22 +
Subject: [PATCH] [NFC][IR2Vec] Exposing helpers in IR2Vec Vocabulary

---
 llvm/include/llvm/Analysis/IR2Vec.h | 16 
 llvm/lib/Analysis/IR2Vec.cpp|  3 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 2498a211e80e5..2ad751b2a5aaa 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -162,6 +162,14 @@ class Vocabulary {
   static constexpr unsigned MaxOperandKinds =
   static_cast<unsigned>(OperandKind::MaxOperandKind);
 
+public:
+  Vocabulary() = default;
+  Vocabulary(VocabVector &&Vocab);
+
+  bool isValid() const;
+  unsigned getDimension() const;
+  size_t size() const;
+
   /// Helper function to get vocabulary key for a given Opcode
   static StringRef getVocabKeyForOpcode(unsigned Opcode);
 
@@ -174,14 +182,6 @@ class Vocabulary {
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
-public:
-  Vocabulary() = default;
-  Vocabulary(VocabVector &&Vocab);
-
-  bool isValid() const;
-  unsigned getDimension() const;
-  size_t size() const;
-
   /// Accessors to get the embedding for a given entity.
   const ir2vec::Embedding &operator[](unsigned Opcode) const;
   const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index c6e1fa32c9ffd..d3d0c30bd05a7 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -502,7 +502,8 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager 
&AM) {
   // Otherwise, try to read from the vocabulary file.
   if (VocabFile.empty()) {
 // FIXME: Use default vocabulary
-Ctx->emitError("IR2Vec vocabulary file path not specified");
+Ctx->emitError("IR2Vec vocabulary file path not specified; You may need to 
"
+   "set it using --ir2vec-vocab-path");
 return Vocabulary(); // Return invalid result
   }
   if (auto Err = readVocabulary()) {

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From 4e92c2bc3b155eb9d9cf3634c1f59a1152d0ebae Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 +
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 182 --
 4 files changed, 252 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-NEXT-ABC:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 

[llvm-branch-commits] [llvm] [NFC][IR2Vec] Exposing helpers in IR2Vec Vocabulary (PR #147841)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147841

>From 1803736667b49978152af659a2d88a09824adf84 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:38:22 +
Subject: [PATCH] [NFC][IR2Vec] Exposing helpers in IR2Vec Vocabulary

---
 llvm/include/llvm/Analysis/IR2Vec.h | 16 
 llvm/lib/Analysis/IR2Vec.cpp|  3 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 2498a211e80e5..2ad751b2a5aaa 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -162,6 +162,14 @@ class Vocabulary {
   static constexpr unsigned MaxOperandKinds =
   static_cast<unsigned>(OperandKind::MaxOperandKind);
 
+public:
+  Vocabulary() = default;
+  Vocabulary(VocabVector &&Vocab);
+
+  bool isValid() const;
+  unsigned getDimension() const;
+  size_t size() const;
+
   /// Helper function to get vocabulary key for a given Opcode
   static StringRef getVocabKeyForOpcode(unsigned Opcode);
 
@@ -174,14 +182,6 @@ class Vocabulary {
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
-public:
-  Vocabulary() = default;
-  Vocabulary(VocabVector &&Vocab);
-
-  bool isValid() const;
-  unsigned getDimension() const;
-  size_t size() const;
-
   /// Accessors to get the embedding for a given entity.
   const ir2vec::Embedding &operator[](unsigned Opcode) const;
   const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index c6e1fa32c9ffd..d3d0c30bd05a7 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -502,7 +502,8 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager 
&AM) {
   // Otherwise, try to read from the vocabulary file.
   if (VocabFile.empty()) {
 // FIXME: Use default vocabulary
-Ctx->emitError("IR2Vec vocabulary file path not specified");
+Ctx->emitError("IR2Vec vocabulary file path not specified; You may need to 
"
+   "set it using --ir2vec-vocab-path");
 return Vocabulary(); // Return invalid result
   }
   if (auto Err = readVocabulary()) {

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC][IR2Vec] Exposing helpers in IR2Vec Vocabulary (PR #147841)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -163,6 +163,14 @@ class Vocabulary {
   static constexpr unsigned MaxOperandKinds =
   static_cast<unsigned>(OperandKind::MaxOperandKind);
 
+public:

svkeerthy wrote:

I just prefer to have constructor in the top :) 

https://github.com/llvm/llvm-project/pull/147841
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -81,6 +136,68 @@ class IR2VecTool {
 OS << LocalOutput;
   }
 
+  /// Generate embeddings for the entire module
+  void generateEmbeddings(raw_ostream &OS) const {
+if (!Vocab->isValid()) {
+  OS << "Error: Vocabulary is not valid. IR2VecTool not initialized.\n";
+  return;
+}
+
+for (const Function &F : M)
+  generateEmbeddings(F, OS);
+  }
+
+  /// Generate embeddings for a single function
+  void generateEmbeddings(const Function &F, raw_ostream &OS) const {
+if (F.isDeclaration()) {
+  OS << "Function " << F.getName() << " is a declaration, skipping.\n";
+  return;
+}
+
+// Create embedder for this function
+assert(Vocab->isValid() && "Vocabulary is not valid");
+auto Emb = Embedder::create(IR2VecKind::Symbolic, F, *Vocab);
+if (!Emb) {
+  OS << "Error: Failed to create embedder for function " << F.getName()
+ << "\n";
+  return;
+}
+
+OS << "Function: " << F.getName() << "\n";
+
+// Generate embeddings based on the specified level
+switch (Level) {
+case FunctionLevel: {

svkeerthy wrote:

Weirdly yes!

https://github.com/llvm/llvm-project/pull/147844
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From ab123750a6d52faecff9276b2d06a95964857ef4 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 194 --
 4 files changed, 260 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-NEXT-ABC:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4

[llvm-branch-commits] [llvm] [IR2Vec] Add llvm-ir2vec tool for generating triplet embeddings (PR #147842)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -0,0 +1,150 @@
+//===- llvm-ir2vec.cpp - IR2Vec Embedding Generation Tool 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+///
+/// \file
+/// This file implements the IR2Vec embedding generation tool.
+///
+/// Currently supports triplet generation for vocabulary training.
+/// Future updates will support embedding generation using trained vocabulary.
+///
+/// Usage: llvm-ir2vec input.bc -o triplets.txt
+///
+/// TODO: Add embedding generation mode with vocabulary support
+///
+//===--===//
+
+#include "llvm/Analysis/IR2Vec.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace ir2vec;

svkeerthy wrote:

Addressed this in the next PR.

https://github.com/llvm/llvm-project/pull/147842
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add llvm-ir2vec tool for generating triplet embeddings (PR #147842)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147842

>From 51b0120e4dd4c9052141f5b334bf9e4716721b56 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:39:39 +
Subject: [PATCH] IR2Vec Tool

---
 llvm/test/CMakeLists.txt|   1 +
 llvm/test/lit.cfg.py|   1 +
 llvm/test/tools/llvm-ir2vec/triplets.ll |  38 ++
 llvm/tools/llvm-ir2vec/CMakeLists.txt   |  10 ++
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp  | 150 
 5 files changed, 200 insertions(+)
 create mode 100644 llvm/test/tools/llvm-ir2vec/triplets.ll
 create mode 100644 llvm/tools/llvm-ir2vec/CMakeLists.txt
 create mode 100644 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp

diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 2a6135da9a61e..3426b6ff8d24d 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -97,6 +97,7 @@ set(LLVM_TEST_DEPENDS
   llvm-exegesis
   llvm-extract
   llvm-gsymutil
+  llvm-ir2vec
   llvm-isel-fuzzer
   llvm-ifs
   llvm-install-name-tool
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index bd6e37c848d8c..771d9245368b1 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -197,6 +197,7 @@ def get_asan_rtlib():
 "llvm-dlltool",
 "llvm-exegesis",
 "llvm-extract",
+"llvm-ir2vec",
 "llvm-isel-fuzzer",
 "llvm-ifs",
 "llvm-install-name-tool",
diff --git a/llvm/test/tools/llvm-ir2vec/triplets.ll 
b/llvm/test/tools/llvm-ir2vec/triplets.ll
new file mode 100644
index 0..fa5aaa895406f
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/triplets.ll
@@ -0,0 +1,38 @@
+; RUN: llvm-ir2vec %s | FileCheck %s -check-prefix=TRIPLETS
+
+define i32 @simple_add(i32 %a, i32 %b) {
+entry:
+  %add = add i32 %a, %b
+  ret i32 %add
+}
+
+define i32 @simple_mul(i32 %x, i32 %y) {
+entry:
+  %mul = mul i32 %x, %y
+  ret i32 %mul
+}
+
+define i32 @test_function(i32 %arg1, i32 %arg2) {
+entry:
+  %local1 = alloca i32, align 4
+  %local2 = alloca i32, align 4
+  store i32 %arg1, ptr %local1, align 4
+  store i32 %arg2, ptr %local2, align 4
+  %load1 = load i32, ptr %local1, align 4
+  %load2 = load i32, ptr %local2, align 4
+  %result = add i32 %load1, %load2
+  ret i32 %result
+}
+
+; TRIPLETS: Add IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
+; TRIPLETS-NEXT: Mul IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
+; TRIPLETS-NEXT: Alloca PointerTy Constant
+; TRIPLETS-NEXT: Alloca PointerTy Constant
+; TRIPLETS-NEXT: Store VoidTy Variable Pointer
+; TRIPLETS-NEXT: Store VoidTy Variable Pointer
+; TRIPLETS-NEXT: Load IntegerTy Pointer
+; TRIPLETS-NEXT: Load IntegerTy Pointer
+; TRIPLETS-NEXT: Add IntegerTy Variable Variable
+; TRIPLETS-NEXT: Ret VoidTy Variable
diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt 
b/llvm/tools/llvm-ir2vec/CMakeLists.txt
new file mode 100644
index 0..a4cf9690e86b5
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS
+  Analysis
+  Core
+  IRReader
+  Support
+  )
+
+add_llvm_tool(llvm-ir2vec
+  llvm-ir2vec.cpp
+  )
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
new file mode 100644
index 0..35e1c995fa4cc
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -0,0 +1,150 @@
+//===- llvm-ir2vec.cpp - IR2Vec Embedding Generation Tool 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+///
+/// \file
+/// This file implements the IR2Vec embedding generation tool.
+///
+/// Currently supports triplet generation for vocabulary training.
+/// Future updates will support embedding generation using trained vocabulary.
+///
+/// Usage: llvm-ir2vec input.bc -o triplets.txt
+///
+/// TODO: Add embedding generation mode with vocabulary support
+///
+//===--===//
+
+#include "llvm/Analysis/IR2Vec.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace ir2vec;
+
+#define DEBUG_TYPE "ir2vec"
+
+static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
+
+static cl::opt<std::string> InputFilename(cl::Positional,
+  cl::desc("

[llvm-branch-commits] [llvm] [NFC][IR2Vec] Exposing helpers in IR2Vec Vocabulary (PR #147841)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147841

>From 3f1914ea0f920f93c75557d348c53a9245a00a5b Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:38:22 +
Subject: [PATCH] [NFC][IR2Vec] Exposing helpers in IR2Vec Vocabulary

---
 llvm/include/llvm/Analysis/IR2Vec.h | 16 
 llvm/lib/Analysis/IR2Vec.cpp|  3 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 0127df7970010..3d7edf08c8807 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -162,6 +162,14 @@ class Vocabulary {
   static constexpr unsigned MaxOperandKinds =
   static_cast<unsigned>(OperandKind::MaxOperandKind);
 
+public:
+  Vocabulary() = default;
+  Vocabulary(VocabVector &&Vocab);
+
+  bool isValid() const;
+  unsigned getDimension() const;
+  size_t size() const;
+
   /// Helper function to get vocabulary key for a given Opcode
   static StringRef getVocabKeyForOpcode(unsigned Opcode);
 
@@ -174,14 +182,6 @@ class Vocabulary {
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
-public:
-  Vocabulary() = default;
-  Vocabulary(VocabVector &&Vocab);
-
-  bool isValid() const;
-  unsigned getDimension() const;
-  size_t size() const;
-
   /// Accessors to get the embedding for a given entity.
   const ir2vec::Embedding &operator[](unsigned Opcode) const;
   const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index c6e1fa32c9ffd..d3d0c30bd05a7 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -502,7 +502,8 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager 
&AM) {
   // Otherwise, try to read from the vocabulary file.
   if (VocabFile.empty()) {
 // FIXME: Use default vocabulary
-Ctx->emitError("IR2Vec vocabulary file path not specified");
+Ctx->emitError("IR2Vec vocabulary file path not specified; You may need to 
"
+   "set it using --ir2vec-vocab-path");
 return Vocabulary(); // Return invalid result
   }
   if (auto Err = readVocabulary()) {

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From f975249e07c16cf621dcea0189d400ebbc8da7bc Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 194 --
 4 files changed, 260 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4

[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-14 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -34,7 +42,7 @@
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
-using namespace ir2vec;
+using namespace llvm::ir2vec;

svkeerthy wrote:

Done

https://github.com/llvm/llvm-project/pull/147844
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/148719

>From 6ae5021a2a5e3d46a7275ae9aa50fa54956b2e82 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Mon, 14 Jul 2025 20:27:42 +
Subject: [PATCH] [IR2Vec] Adding documentation for llvm-ir2vec tool

---
 llvm/docs/CommandGuide/index.rst   |   1 +
 llvm/docs/CommandGuide/llvm-ir2vec.rst | 170 +
 llvm/docs/MLGO.rst |  12 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp |   8 +-
 4 files changed, 184 insertions(+), 7 deletions(-)
 create mode 100644 llvm/docs/CommandGuide/llvm-ir2vec.rst

diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst
index 88fc1fd326b76..f85f32a1fdd51 100644
--- a/llvm/docs/CommandGuide/index.rst
+++ b/llvm/docs/CommandGuide/index.rst
@@ -27,6 +27,7 @@ Basic Commands
llvm-dis
llvm-dwarfdump
llvm-dwarfutil
+   llvm-ir2vec
llvm-lib
llvm-libtool-darwin
llvm-link
diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
new file mode 100644
index 0..13fe4996b968f
--- /dev/null
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -0,0 +1,170 @@
+llvm-ir2vec - IR2Vec Embedding Generation Tool
+==============================================
+
+.. program:: llvm-ir2vec
+
+SYNOPSIS
+--------
+
+:program:`llvm-ir2vec` [*options*] *input-file*
+
+DESCRIPTION
+-----------
+
+:program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
+generates IR2Vec embeddings for LLVM IR and supports triplet generation 
+for vocabulary training. It provides two main operation modes:
+
+1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+   training from LLVM IR.
+
+2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+   at different granularity levels (instruction, basic block, or function).
+
+The tool is designed to facilitate machine learning applications that work with
+LLVM IR by converting the IR into numerical representations that can be used by
+ML models.
+
+.. note::
+
+   For information about using IR2Vec programmatically within LLVM passes and 
+   the C++ API, see the `IR2Vec Embeddings 
<https://llvm.org/docs/MLGO.html#ir2vec-embeddings>`_ 
+   section in the MLGO documentation.
+
+OPERATION MODES
+---------------
+
+Triplet Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
+consisting of opcodes, types, and operands. These triplets can be used to train
+vocabularies for embedding generation.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+
+Embedding Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In embedding mode, :program:`llvm-ir2vec` uses a pre-trained vocabulary to
+generate numerical embeddings for LLVM IR at different levels of granularity.
+
+Example Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json --level=func 
input.bc -o embeddings.txt
+
+OPTIONS
+-------
+
+.. option:: --mode=<mode>
+
+ Specify the operation mode. Valid values are:
+
+ * ``triplets`` - Generate triplets for vocabulary training
+ * ``embeddings`` - Generate embeddings using trained vocabulary (default)
+
+.. option:: --level=<level>
+
+ Specify the embedding generation level. Valid values are:
+
+ * ``inst`` - Generate instruction-level embeddings
+ * ``bb`` - Generate basic block-level embeddings  
+ * ``func`` - Generate function-level embeddings (default)
+
+.. option:: --function=<name>
+
+ Process only the specified function instead of all functions in the module.
+
+.. option:: --ir2vec-vocab-path=<path>
+
+ Specify the path to the vocabulary file (required for embedding mode).
+ The vocabulary file should be in JSON format and contain the trained
+ vocabulary for embedding generation. See `llvm/lib/Analysis/models`
+ for pre-trained vocabulary files.
+
+.. option:: --ir2vec-opc-weight=<weight>
+
+ Specify the weight for opcode embeddings (default: 1.0). This controls
+ the relative importance of instruction opcodes in the final embedding.
+
+.. option:: --ir2vec-type-weight=<weight>
+
+ Specify the weight for type embeddings (default: 0.5). This controls
+ the relative importance of type information in the final embedding.
+
+.. option:: --ir2vec-arg-weight=<weight>
+
+ Specify the weight for argument embeddings (default: 0.2). This controls
+ the relative importance of operand information in the final embedding.
+
+.. option:: -o <filename>
+
+ Specify the output filename. Use ``-`` to write to standard output (default).
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. note::
+
+   ``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
+   ``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
+   mode. These options are ignored in triplet mode.
+
+INPUT FILE FORMAT
+-----------------
+
+:program:`llvm-ir2vec` accepts LLVM bitcode files (``.b

[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/148719

>From 6ae5021a2a5e3d46a7275ae9aa50fa54956b2e82 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Mon, 14 Jul 2025 20:27:42 +
Subject: [PATCH] [IR2Vec] Adding documentation for llvm-ir2vec tool

---
 llvm/docs/CommandGuide/index.rst   |   1 +
 llvm/docs/CommandGuide/llvm-ir2vec.rst | 170 +
 llvm/docs/MLGO.rst |  12 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp |   8 +-
 4 files changed, 184 insertions(+), 7 deletions(-)
 create mode 100644 llvm/docs/CommandGuide/llvm-ir2vec.rst

diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst
index 88fc1fd326b76..f85f32a1fdd51 100644
--- a/llvm/docs/CommandGuide/index.rst
+++ b/llvm/docs/CommandGuide/index.rst
@@ -27,6 +27,7 @@ Basic Commands
llvm-dis
llvm-dwarfdump
llvm-dwarfutil
+   llvm-ir2vec
llvm-lib
llvm-libtool-darwin
llvm-link
diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
new file mode 100644
index 0..13fe4996b968f
--- /dev/null
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -0,0 +1,170 @@
+llvm-ir2vec - IR2Vec Embedding Generation Tool
+==============================================
+
+.. program:: llvm-ir2vec
+
+SYNOPSIS
+--------
+
+:program:`llvm-ir2vec` [*options*] *input-file*
+
+DESCRIPTION
+-----------
+
+:program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
+generates IR2Vec embeddings for LLVM IR and supports triplet generation 
+for vocabulary training. It provides two main operation modes:
+
+1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+   training from LLVM IR.
+
+2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+   at different granularity levels (instruction, basic block, or function).
+
+The tool is designed to facilitate machine learning applications that work with
+LLVM IR by converting the IR into numerical representations that can be used by
+ML models.
+
+.. note::
+
+   For information about using IR2Vec programmatically within LLVM passes and 
+   the C++ API, see the `IR2Vec Embeddings 
<https://llvm.org/docs/MLGO.html#ir2vec-embeddings>`_ 
+   section in the MLGO documentation.
+
+OPERATION MODES
+---------------
+
+Triplet Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
+consisting of opcodes, types, and operands. These triplets can be used to train
+vocabularies for embedding generation.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+
+Embedding Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In embedding mode, :program:`llvm-ir2vec` uses a pre-trained vocabulary to
+generate numerical embeddings for LLVM IR at different levels of granularity.
+
+Example Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json --level=func 
input.bc -o embeddings.txt
+
+OPTIONS
+-------
+
+.. option:: --mode=<mode>
+
+ Specify the operation mode. Valid values are:
+
+ * ``triplets`` - Generate triplets for vocabulary training
+ * ``embeddings`` - Generate embeddings using trained vocabulary (default)
+
+.. option:: --level=<level>
+
+ Specify the embedding generation level. Valid values are:
+
+ * ``inst`` - Generate instruction-level embeddings
+ * ``bb`` - Generate basic block-level embeddings  
+ * ``func`` - Generate function-level embeddings (default)
+
+.. option:: --function=<name>
+
+ Process only the specified function instead of all functions in the module.
+
+.. option:: --ir2vec-vocab-path=<path>
+
+ Specify the path to the vocabulary file (required for embedding mode).
+ The vocabulary file should be in JSON format and contain the trained
+ vocabulary for embedding generation. See `llvm/lib/Analysis/models`
+ for pre-trained vocabulary files.
+
+.. option:: --ir2vec-opc-weight=<weight>
+
+ Specify the weight for opcode embeddings (default: 1.0). This controls
+ the relative importance of instruction opcodes in the final embedding.
+
+.. option:: --ir2vec-type-weight=<weight>
+
+ Specify the weight for type embeddings (default: 0.5). This controls
+ the relative importance of type information in the final embedding.
+
+.. option:: --ir2vec-arg-weight=<weight>
+
+ Specify the weight for argument embeddings (default: 0.2). This controls
+ the relative importance of operand information in the final embedding.
+
+.. option:: -o <filename>
+
+ Specify the output filename. Use ``-`` to write to standard output (default).
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. note::
+
+   ``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
+   ``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
+   mode. These options are ignored in triplet mode.
+
+INPUT FILE FORMAT
+-----------------
+
+:program:`llvm-ir2vec` accepts LLVM bitcode files (``.b

[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From 7b801dfc5c070be416696c41a96294c60600071e Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 194 --
 4 files changed, 260 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4

[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From 7b801dfc5c070be416696c41a96294c60600071e Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 194 --
 4 files changed, 260 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4

[llvm-branch-commits] [llvm] exposing-new-methods (PR #149212)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy created 
https://github.com/llvm/llvm-project/pull/149212

None

>From d1f4e861a6b66441c726bfa57eb71b432a730624 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 21:49:05 +
Subject: [PATCH] exposing-new-methods

---
 llvm/include/llvm/Analysis/IR2Vec.h|  9 
 llvm/lib/Analysis/IR2Vec.cpp   | 20 +++-
 llvm/unittests/Analysis/IR2VecTest.cpp | 63 ++
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 3d7edf08c8807..d87457cac7642 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,6 +170,10 @@ class Vocabulary {
   unsigned getDimension() const;
   size_t size() const;
 
+  static size_t expectedSize() {
+return MaxOpcodes + MaxTypeIDs + MaxOperandKinds;
+  }
+
   /// Helper function to get vocabulary key for a given Opcode
   static StringRef getVocabKeyForOpcode(unsigned Opcode);
 
@@ -182,6 +186,11 @@ class Vocabulary {
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
+  /// Helpers to return the IDs of a given Opcode, TypeID, or OperandKind
+  static unsigned getNumericID(unsigned Opcode);
+  static unsigned getNumericID(Type::TypeID TypeID);
+  static unsigned getNumericID(const Value *Op);
+
   /// Accessors to get the embedding for a given entity.
   const ir2vec::Embedding &operator[](unsigned Opcode) const;
   const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 898bf5b202feb..95f30fd3f4275 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -215,7 +215,7 @@ Vocabulary::Vocabulary(VocabVector &&Vocab)
 : Vocab(std::move(Vocab)), Valid(true) {}
 
 bool Vocabulary::isValid() const {
-  return Vocab.size() == (MaxOpcodes + MaxTypeIDs + MaxOperandKinds) && Valid;
+  return Vocab.size() == Vocabulary::expectedSize() && Valid;
 }
 
 size_t Vocabulary::size() const {
@@ -324,8 +324,24 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const 
Value *Op) {
   return OperandKind::VariableID;
 }
 
+unsigned Vocabulary::getNumericID(unsigned Opcode) {
+  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+  return Opcode - 1; // Convert to zero-based index
+}
+
+unsigned Vocabulary::getNumericID(Type::TypeID TypeID) {
+  assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
+  return MaxOpcodes + static_cast<unsigned>(TypeID);
+}
+
+unsigned Vocabulary::getNumericID(const Value *Op) {
+  unsigned Index = static_cast<unsigned>(getOperandKind(Op));
+  assert(Index < MaxOperandKinds && "Invalid OperandKind");
+  return MaxOpcodes + MaxTypeIDs + Index;
+}
+
 StringRef Vocabulary::getStringKey(unsigned Pos) {
-  assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds &&
+  assert(Pos < Vocabulary::expectedSize() &&
  "Position out of bounds in vocabulary");
   // Opcode
   if (Pos < MaxOpcodes)
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index cb6d633306a81..7c9a5464bfe1d 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -396,6 +396,69 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) {
   }
 }
 
+TEST(IR2VecVocabularyTest, NumericIDMap) {
+  // Test getNumericID for opcodes
+  EXPECT_EQ(Vocabulary::getNumericID(1u), 0u);
+  EXPECT_EQ(Vocabulary::getNumericID(13u), 12u);
+  EXPECT_EQ(Vocabulary::getNumericID(MaxOpcodes), MaxOpcodes - 1);
+
+  // Test getNumericID for Type IDs
+  EXPECT_EQ(Vocabulary::getNumericID(Type::VoidTyID),
+MaxOpcodes + static_cast<unsigned>(Type::VoidTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::HalfTyID),
+MaxOpcodes + static_cast<unsigned>(Type::HalfTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::FloatTyID),
+MaxOpcodes + static_cast<unsigned>(Type::FloatTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::IntegerTyID),
+MaxOpcodes + static_cast<unsigned>(Type::IntegerTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::PointerTyID),
+MaxOpcodes + static_cast<unsigned>(Type::PointerTyID));
+
+  // Test getNumericID for Value operands
+  LLVMContext Ctx;
+  Module M("TestM", Ctx);
+  FunctionType *FTy =
+  FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "testFunc", 
M);
+
+  // Test Function operand
+  EXPECT_EQ(Vocabulary::getNumericID(F),
+MaxOpcodes + MaxTypeIDs + 0u); // Function = 0
+
+  // Test Constant operand
+  Constant *C = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
+  EXPECT_EQ(Vocabulary::getNumericID(C),
+MaxOpcodes + MaxTypeIDs + 2u); // Constant = 2
+
+  // Test Pointer operand
+  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+  AllocaInst *PtrVal = new AllocaInst(Type::getInt32Ty(Ctx), 0, "ptr", BB);
+  

[llvm-branch-commits] [llvm] support-stdin-input-llvm-ir2vec (PR #149213)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy created 
https://github.com/llvm/llvm-project/pull/149213

None

>From 6efc8a8ef6fcb794af9a2d4657c943d4479bca78 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:01:47 +
Subject: [PATCH] support-stdin-input-llvm-ir2vec

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 27 --
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index c9e2c7c713e18..3e6cb4b64fde5 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -48,10 +49,10 @@ namespace ir2vec {
 
 static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
 
-static cl::opt<std::string> InputFilename(cl::Positional,
-  cl::desc(""),
-  cl::Required,
-  cl::cat(IR2VecToolCategory));
+static cl::opt<std::string>
+InputFilename(cl::Positional,
+  cl::desc(""),
+  cl::init("-"), cl::cat(IR2VecToolCategory));
 
 static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"),
@@ -283,10 +284,24 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
-  std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+  std::unique_ptr<Module> M;
+
+  if (InputFilename == "-") {
+// Read from stdin
+auto StdinBuffer = MemoryBuffer::getSTDIN();
+if (std::error_code EC = StdinBuffer.getError()) {
+  errs() << "Error reading from stdin: " << EC.message() << "\n";
+  return 1;
+}
+M = parseIR(StdinBuffer.get()->getMemBufferRef(), Err, Context);
+  } else {
+// Read from file
+M = parseIRFile(InputFilename, Err, Context);
+  }
+
   if (!M) {
 Err.print(argv[0], errs());
 return 1;

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (PR #149213)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149213

>From 1e2226100f1068b27e96766bd69e0876a2a98663 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:01:47 +
Subject: [PATCH] support-stdin-input-llvm-ir2vec

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 27 --
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index c9e2c7c713e18..3e6cb4b64fde5 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -48,10 +49,10 @@ namespace ir2vec {
 
 static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
 
-static cl::opt<std::string> InputFilename(cl::Positional,
-  cl::desc(""),
-  cl::Required,
-  cl::cat(IR2VecToolCategory));
+static cl::opt<std::string>
+InputFilename(cl::Positional,
+  cl::desc(""),
+  cl::init("-"), cl::cat(IR2VecToolCategory));
 
 static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"),
@@ -283,10 +284,24 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
-  std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+  std::unique_ptr<Module> M;
+
+  if (InputFilename == "-") {
+// Read from stdin
+auto StdinBuffer = MemoryBuffer::getSTDIN();
+if (std::error_code EC = StdinBuffer.getError()) {
+  errs() << "Error reading from stdin: " << EC.message() << "\n";
+  return 1;
+}
+M = parseIR(StdinBuffer.get()->getMemBufferRef(), Err, Context);
+  } else {
+// Read from file
+M = parseIRFile(InputFilename, Err, Context);
+  }
+
   if (!M) {
 Err.print(argv[0], errs());
 return 1;

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From 0ee74a899d0cc2d5c6e91de898343e26b4ec1109 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 194 --
 4 files changed, 260 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4

[llvm-branch-commits] [llvm] [NFC][IR2Vec] Add reference to generateTriplets.py in documentation (PR #149215)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From 5a8f74a2c56d6052bf1db29fe3c16950c50c3987 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py   
 
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec][NFC] Add helper methods for numeric ID mapping in Vocabulary (PR #149212)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149212

>From 68ae9f559439dd1b486713536c925f900afdfbad Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 21:49:05 +
Subject: [PATCH] exposing-new-methods

---
 llvm/include/llvm/Analysis/IR2Vec.h|  9 
 llvm/lib/Analysis/IR2Vec.cpp   | 20 +++-
 llvm/unittests/Analysis/IR2VecTest.cpp | 63 ++
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 3d7edf08c8807..d87457cac7642 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,6 +170,10 @@ class Vocabulary {
   unsigned getDimension() const;
   size_t size() const;
 
+  static size_t expectedSize() {
+return MaxOpcodes + MaxTypeIDs + MaxOperandKinds;
+  }
+
   /// Helper function to get vocabulary key for a given Opcode
   static StringRef getVocabKeyForOpcode(unsigned Opcode);
 
@@ -182,6 +186,11 @@ class Vocabulary {
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
+  /// Helpers to return the IDs of a given Opcode, TypeID, or OperandKind
+  static unsigned getNumericID(unsigned Opcode);
+  static unsigned getNumericID(Type::TypeID TypeID);
+  static unsigned getNumericID(const Value *Op);
+
   /// Accessors to get the embedding for a given entity.
   const ir2vec::Embedding &operator[](unsigned Opcode) const;
   const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 898bf5b202feb..95f30fd3f4275 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -215,7 +215,7 @@ Vocabulary::Vocabulary(VocabVector &&Vocab)
 : Vocab(std::move(Vocab)), Valid(true) {}
 
 bool Vocabulary::isValid() const {
-  return Vocab.size() == (MaxOpcodes + MaxTypeIDs + MaxOperandKinds) && Valid;
+  return Vocab.size() == Vocabulary::expectedSize() && Valid;
 }
 
 size_t Vocabulary::size() const {
@@ -324,8 +324,24 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const 
Value *Op) {
   return OperandKind::VariableID;
 }
 
+unsigned Vocabulary::getNumericID(unsigned Opcode) {
+  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+  return Opcode - 1; // Convert to zero-based index
+}
+
+unsigned Vocabulary::getNumericID(Type::TypeID TypeID) {
+  assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
+  return MaxOpcodes + static_cast<unsigned>(TypeID);
+}
+
+unsigned Vocabulary::getNumericID(const Value *Op) {
+  unsigned Index = static_cast<unsigned>(getOperandKind(Op));
+  assert(Index < MaxOperandKinds && "Invalid OperandKind");
+  return MaxOpcodes + MaxTypeIDs + Index;
+}
+
 StringRef Vocabulary::getStringKey(unsigned Pos) {
-  assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds &&
+  assert(Pos < Vocabulary::expectedSize() &&
  "Position out of bounds in vocabulary");
   // Opcode
   if (Pos < MaxOpcodes)
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index cb6d633306a81..7c9a5464bfe1d 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -396,6 +396,69 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) {
   }
 }
 
+TEST(IR2VecVocabularyTest, NumericIDMap) {
+  // Test getNumericID for opcodes
+  EXPECT_EQ(Vocabulary::getNumericID(1u), 0u);
+  EXPECT_EQ(Vocabulary::getNumericID(13u), 12u);
+  EXPECT_EQ(Vocabulary::getNumericID(MaxOpcodes), MaxOpcodes - 1);
+
+  // Test getNumericID for Type IDs
+  EXPECT_EQ(Vocabulary::getNumericID(Type::VoidTyID),
+MaxOpcodes + static_cast<unsigned>(Type::VoidTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::HalfTyID),
+MaxOpcodes + static_cast<unsigned>(Type::HalfTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::FloatTyID),
+MaxOpcodes + static_cast<unsigned>(Type::FloatTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::IntegerTyID),
+MaxOpcodes + static_cast<unsigned>(Type::IntegerTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::PointerTyID),
+MaxOpcodes + static_cast<unsigned>(Type::PointerTyID));
+
+  // Test getNumericID for Value operands
+  LLVMContext Ctx;
+  Module M("TestM", Ctx);
+  FunctionType *FTy =
+  FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "testFunc", 
M);
+
+  // Test Function operand
+  EXPECT_EQ(Vocabulary::getNumericID(F),
+MaxOpcodes + MaxTypeIDs + 0u); // Function = 0
+
+  // Test Constant operand
+  Constant *C = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
+  EXPECT_EQ(Vocabulary::getNumericID(C),
+MaxOpcodes + MaxTypeIDs + 2u); // Constant = 2
+
+  // Test Pointer operand
+  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+  AllocaInst *PtrVal = new AllocaInst(Type::getInt32Ty(Ctx), 0, "ptr", BB);
+  EXPECT

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (PR #149213)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149213

>From 1e2226100f1068b27e96766bd69e0876a2a98663 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:01:47 +
Subject: [PATCH] support-stdin-input-llvm-ir2vec

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 27 --
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index c9e2c7c713e18..3e6cb4b64fde5 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -48,10 +49,10 @@ namespace ir2vec {
 
 static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
 
-static cl::opt<std::string> InputFilename(cl::Positional,
-  cl::desc(""),
-  cl::Required,
-  cl::cat(IR2VecToolCategory));
+static cl::opt<std::string>
+InputFilename(cl::Positional,
+  cl::desc(""),
+  cl::init("-"), cl::cat(IR2VecToolCategory));
 
 static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"),
@@ -283,10 +284,24 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
-  std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+  std::unique_ptr<Module> M;
+
+  if (InputFilename == "-") {
+// Read from stdin
+auto StdinBuffer = MemoryBuffer::getSTDIN();
+if (std::error_code EC = StdinBuffer.getError()) {
+  errs() << "Error reading from stdin: " << EC.message() << "\n";
+  return 1;
+}
+M = parseIR(StdinBuffer.get()->getMemBufferRef(), Err, Context);
+  } else {
+// Read from file
+M = parseIRFile(InputFilename, Err, Context);
+  }
+
   if (!M) {
 Err.print(argv[0], errs());
 return 1;

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NFC][IR2Vec] Add reference to generateTriplets.py in documentation (PR #149215)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From 5a8f74a2c56d6052bf1db29fe3c16950c50c3987 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py   
 
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/148719

>From 42671b8760e87f31f851c7d60004f9d1759c81f9 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Mon, 14 Jul 2025 20:27:42 +
Subject: [PATCH] [IR2Vec] Adding documentation for llvm-ir2vec tool

---
 llvm/docs/CommandGuide/index.rst   |   1 +
 llvm/docs/CommandGuide/llvm-ir2vec.rst | 170 +
 llvm/docs/MLGO.rst |  12 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp |   8 +-
 4 files changed, 184 insertions(+), 7 deletions(-)
 create mode 100644 llvm/docs/CommandGuide/llvm-ir2vec.rst

diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst
index 88fc1fd326b76..f85f32a1fdd51 100644
--- a/llvm/docs/CommandGuide/index.rst
+++ b/llvm/docs/CommandGuide/index.rst
@@ -27,6 +27,7 @@ Basic Commands
llvm-dis
llvm-dwarfdump
llvm-dwarfutil
+   llvm-ir2vec
llvm-lib
llvm-libtool-darwin
llvm-link
diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
new file mode 100644
index 0..13fe4996b968f
--- /dev/null
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -0,0 +1,170 @@
+llvm-ir2vec - IR2Vec Embedding Generation Tool
+==============================================
+
+.. program:: llvm-ir2vec
+
+SYNOPSIS
+--------
+
+:program:`llvm-ir2vec` [*options*] *input-file*
+
+DESCRIPTION
+-----------
+
+:program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
+generates IR2Vec embeddings for LLVM IR and supports triplet generation 
+for vocabulary training. It provides two main operation modes:
+
+1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+   training from LLVM IR.
+
+2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+   at different granularity levels (instruction, basic block, or function).
+
+The tool is designed to facilitate machine learning applications that work with
+LLVM IR by converting the IR into numerical representations that can be used by
+ML models.
+
+.. note::
+
+   For information about using IR2Vec programmatically within LLVM passes and 
+   the C++ API, see the `IR2Vec Embeddings <https://llvm.org/docs/MLGO.html#ir2vec-embeddings>`_
+   section in the MLGO documentation.
+
+OPERATION MODES
+---------------
+
+Triplet Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
+consisting of opcodes, types, and operands. These triplets can be used to train
+vocabularies for embedding generation.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+
+Embedding Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In embedding mode, :program:`llvm-ir2vec` uses a pre-trained vocabulary to
+generate numerical embeddings for LLVM IR at different levels of granularity.
+
+Example Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json --level=func 
input.bc -o embeddings.txt
+
+OPTIONS
+-------
+
+.. option:: --mode=<mode>
+
+ Specify the operation mode. Valid values are:
+
+ * ``triplets`` - Generate triplets for vocabulary training
+ * ``embeddings`` - Generate embeddings using trained vocabulary (default)
+
+.. option:: --level=<level>
+
+ Specify the embedding generation level. Valid values are:
+
+ * ``inst`` - Generate instruction-level embeddings
+ * ``bb`` - Generate basic block-level embeddings  
+ * ``func`` - Generate function-level embeddings (default)
+
+.. option:: --function=<name>
+
+ Process only the specified function instead of all functions in the module.
+
+.. option:: --ir2vec-vocab-path=<path>
+
+ Specify the path to the vocabulary file (required for embedding mode).
+ The vocabulary file should be in JSON format and contain the trained
+ vocabulary for embedding generation. See `llvm/lib/Analysis/models`
+ for pre-trained vocabulary files.
+
+.. option:: --ir2vec-opc-weight=<weight>
+
+ Specify the weight for opcode embeddings (default: 1.0). This controls
+ the relative importance of instruction opcodes in the final embedding.
+
+.. option:: --ir2vec-type-weight=<weight>
+
+ Specify the weight for type embeddings (default: 0.5). This controls
+ the relative importance of type information in the final embedding.
+
+.. option:: --ir2vec-arg-weight=<weight>
+
+ Specify the weight for argument embeddings (default: 0.2). This controls
+ the relative importance of operand information in the final embedding.
+
+.. option:: -o <filename>
+
+ Specify the output filename. Use ``-`` to write to standard output (default).
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. note::
+
+   ``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
+   ``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
+   mode. These options are ignored in triplet mode.
+
+INPUT FILE FORMAT
+-----------------
+
+:program:`llvm-ir2vec` accepts LLVM bitcode files (``.b

[llvm-branch-commits] [llvm] [IR2Vec][NFC] Add helper methods for numeric ID mapping in Vocabulary (PR #149212)

2025-07-16 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149212

>From 68ae9f559439dd1b486713536c925f900afdfbad Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 21:49:05 +
Subject: [PATCH] exposing-new-methods

---
 llvm/include/llvm/Analysis/IR2Vec.h|  9 
 llvm/lib/Analysis/IR2Vec.cpp   | 20 +++-
 llvm/unittests/Analysis/IR2VecTest.cpp | 63 ++
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 3d7edf08c8807..d87457cac7642 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,6 +170,10 @@ class Vocabulary {
   unsigned getDimension() const;
   size_t size() const;
 
+  static size_t expectedSize() {
+return MaxOpcodes + MaxTypeIDs + MaxOperandKinds;
+  }
+
   /// Helper function to get vocabulary key for a given Opcode
   static StringRef getVocabKeyForOpcode(unsigned Opcode);
 
@@ -182,6 +186,11 @@ class Vocabulary {
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
+  /// Helpers to return the IDs of a given Opcode, TypeID, or OperandKind
+  static unsigned getNumericID(unsigned Opcode);
+  static unsigned getNumericID(Type::TypeID TypeID);
+  static unsigned getNumericID(const Value *Op);
+
   /// Accessors to get the embedding for a given entity.
   const ir2vec::Embedding &operator[](unsigned Opcode) const;
   const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 898bf5b202feb..95f30fd3f4275 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -215,7 +215,7 @@ Vocabulary::Vocabulary(VocabVector &&Vocab)
 : Vocab(std::move(Vocab)), Valid(true) {}
 
 bool Vocabulary::isValid() const {
-  return Vocab.size() == (MaxOpcodes + MaxTypeIDs + MaxOperandKinds) && Valid;
+  return Vocab.size() == Vocabulary::expectedSize() && Valid;
 }
 
 size_t Vocabulary::size() const {
@@ -324,8 +324,24 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const 
Value *Op) {
   return OperandKind::VariableID;
 }
 
+unsigned Vocabulary::getNumericID(unsigned Opcode) {
+  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+  return Opcode - 1; // Convert to zero-based index
+}
+
+unsigned Vocabulary::getNumericID(Type::TypeID TypeID) {
+  assert(static_cast(TypeID) < MaxTypeIDs && "Invalid type ID");
+  return MaxOpcodes + static_cast(TypeID);
+}
+
+unsigned Vocabulary::getNumericID(const Value *Op) {
+  unsigned Index = static_cast(getOperandKind(Op));
+  assert(Index < MaxOperandKinds && "Invalid OperandKind");
+  return MaxOpcodes + MaxTypeIDs + Index;
+}
+
 StringRef Vocabulary::getStringKey(unsigned Pos) {
-  assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds &&
+  assert(Pos < Vocabulary::expectedSize() &&
  "Position out of bounds in vocabulary");
   // Opcode
   if (Pos < MaxOpcodes)
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index cb6d633306a81..7c9a5464bfe1d 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -396,6 +396,69 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) {
   }
 }
 
+TEST(IR2VecVocabularyTest, NumericIDMap) {
+  // Test getNumericID for opcodes
+  EXPECT_EQ(Vocabulary::getNumericID(1u), 0u);
+  EXPECT_EQ(Vocabulary::getNumericID(13u), 12u);
+  EXPECT_EQ(Vocabulary::getNumericID(MaxOpcodes), MaxOpcodes - 1);
+
+  // Test getNumericID for Type IDs
+  EXPECT_EQ(Vocabulary::getNumericID(Type::VoidTyID),
+MaxOpcodes + static_cast(Type::VoidTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::HalfTyID),
+MaxOpcodes + static_cast(Type::HalfTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::FloatTyID),
+MaxOpcodes + static_cast(Type::FloatTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::IntegerTyID),
+MaxOpcodes + static_cast(Type::IntegerTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::PointerTyID),
+MaxOpcodes + static_cast(Type::PointerTyID));
+
+  // Test getNumericID for Value operands
+  LLVMContext Ctx;
+  Module M("TestM", Ctx);
+  FunctionType *FTy =
+  FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "testFunc", 
M);
+
+  // Test Function operand
+  EXPECT_EQ(Vocabulary::getNumericID(F),
+MaxOpcodes + MaxTypeIDs + 0u); // Function = 0
+
+  // Test Constant operand
+  Constant *C = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
+  EXPECT_EQ(Vocabulary::getNumericID(C),
+MaxOpcodes + MaxTypeIDs + 2u); // Constant = 2
+
+  // Test Pointer operand
+  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+  AllocaInst *PtrVal = new AllocaInst(Type::getInt32Ty(Ctx), 0, "ptr", BB);
+  EXPECT

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (PR #149213)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -283,10 +284,24 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
-  std::unique_ptr M = parseIRFile(InputFilename, Err, Context);
+  std::unique_ptr M;
+
+  if (InputFilename == "-") {

svkeerthy wrote:

Thanks! It works indeed. 

https://github.com/llvm/llvm-project/pull/149213
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Revamp triplet generation and add entity mapping mode (PR #149214)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149214

>From 83bba52eba431f776cdb1e051bad073b19aa9763 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:03:56 +
Subject: [PATCH 1/2] revamp-triplet-gen

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|  79 -
 llvm/test/tools/llvm-ir2vec/entities.ll   |  95 ++
 llvm/test/tools/llvm-ir2vec/triplets.ll   |  51 ++-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 204 
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 5 files changed, 627 insertions(+), 93 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/entities.ll
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 13fe4996b968f..56ece4f509f6e 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -13,17 +13,21 @@ DESCRIPTION
 
 :program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
 generates IR2Vec embeddings for LLVM IR and supports triplet generation 
-for vocabulary training. It provides two main operation modes:
+for vocabulary training. It provides three main operation modes:
 
-1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+1. **Triplet Mode**: Generates numeric triplets in train2id format for 
vocabulary
training from LLVM IR.
 
-2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+2. **Entity Mode**: Generates entity mapping files (entity2id.txt) for 
vocabulary 
+   training.
+
+3. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
at different granularity levels (instruction, basic block, or function).
 
 The tool is designed to facilitate machine learning applications that work with
 LLVM IR by converting the IR into numerical representations that can be used by
-ML models.
+ML models. The triplet mode generates numeric IDs directly instead of string 
+triplets, streamlining the training data preparation workflow.
 
 .. note::
 
@@ -34,18 +38,46 @@ ML models.
 OPERATION MODES
 ---
 
+Triplet Generation and Entity Mapping Modes are used for preparing
+vocabulary and training data for knowledge graph embeddings. The Embedding Mode
+is used for generating embeddings from LLVM IR using a pre-trained vocabulary.
+
+The Seed Embedding Vocabulary of IR2Vec is trained on a large corpus of LLVM IR
+by modeling the relationships between opcodes, types, and operands as a 
knowledge
+graph. For this purpose, Triplet Generation and Entity Mapping Modes generate
+triplets and entity mappings in the standard format used for knowledge graph
+embedding training (see 
+<https://github.com/thunlp/OpenKE/tree/OpenKE-PyTorch?tab=readme-ov-file#data-format>
+for details).
+
 Triplet Generation Mode
 ~~~
 
-In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
-consisting of opcodes, types, and operands. These triplets can be used to train
-vocabularies for embedding generation.
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts numeric
+triplets consisting of opcode IDs, type IDs, and operand IDs. These triplets 
+are generated in train2id format. The tool outputs numeric IDs directly using 
+the ir2vec::Vocabulary mapping infrastructure, eliminating the need for 
+string-to-ID preprocessing.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets_train2id.txt
+
+Entity Mapping Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In entity mode, :program:`llvm-ir2vec` generates the entity mappings supported 
by
+IR2Vec in entity2id format. This mode outputs all supported entities (opcodes, 
+types, and operands) with their corresponding numeric IDs, and is not specific to
+an LLVM IR file.
 
 Usage:
 
 .. code-block:: bash
 
-   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+   llvm-ir2vec --mode=entities -o entity2id.txt
 
 Embedding Generation Mode
 ~~
@@ -67,6 +99,7 @@ OPTIONS
  Specify the operation mode. Valid values are:
 
  * ``triplets`` - Generate triplets for vocabulary training
+ * ``entities`` - Generate entity mappings for vocabulary training
  * ``embeddings`` - Generate embeddings using trained vocabulary (default)
 
 .. option:: --level=
@@ -115,7 +148,7 @@ OPTIONS
 
``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
-   mode. These options are ignored in triplet mode.
+   mode. These options are ignored in triplet and entity modes.
 
 INPUT FILE FORMAT
 -
@@ -129,14 +162,34 @@ OUTPUT FORMAT
 Triplet Mode Output
 ~~~
 
-In triplet mode, the output consists of lines containing space-separated 
triplets:
+In triplet mode, the o

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (PR #149213)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149213

>From 7fee5899ce6d4bd3d44483c40ef90a7666116f22 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:01:47 +
Subject: [PATCH] support-stdin-input-llvm-ir2vec

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 4e88282e85c14..e3aa7bd1b3b1e 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -48,10 +48,10 @@ namespace ir2vec {
 
 static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
 
-static cl::opt InputFilename(cl::Positional,
-  cl::desc(""),
-  cl::Required,
-  cl::cat(IR2VecToolCategory));
+static cl::opt
+InputFilename(cl::Positional,
+  cl::desc(""),
+  cl::init("-"), cl::cat(IR2VecToolCategory));
 
 static cl::opt OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"),
@@ -287,10 +287,14 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
-  std::unique_ptr M = parseIRFile(InputFilename, Err, Context);
+  std::unique_ptr M;
+
+  // Read from file or stdin
+  M = parseIRFile(InputFilename, Err, Context);
+
   if (!M) {
 Err.print(argv[0], errs());
 return 1;

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From b2e9297fbc0bc9452efbbd66e04ecb12a3c578c1 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec][NFC] Add helper methods for numeric ID mapping in Vocabulary (PR #149212)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149212

>From 1d7ca8076757401353b403256f03ae9498dbe404 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 21:49:05 +
Subject: [PATCH] exposing-new-methods

---
 llvm/include/llvm/Analysis/IR2Vec.h|  9 
 llvm/lib/Analysis/IR2Vec.cpp   | 20 +++-
 llvm/unittests/Analysis/IR2VecTest.cpp | 63 ++
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 3d7edf08c8807..d87457cac7642 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,6 +170,10 @@ class Vocabulary {
   unsigned getDimension() const;
   size_t size() const;
 
+  static size_t expectedSize() {
+return MaxOpcodes + MaxTypeIDs + MaxOperandKinds;
+  }
+
   /// Helper function to get vocabulary key for a given Opcode
   static StringRef getVocabKeyForOpcode(unsigned Opcode);
 
@@ -182,6 +186,11 @@ class Vocabulary {
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
+  /// Helpers to return the IDs of a given Opcode, TypeID, or OperandKind
+  static unsigned getNumericID(unsigned Opcode);
+  static unsigned getNumericID(Type::TypeID TypeID);
+  static unsigned getNumericID(const Value *Op);
+
   /// Accessors to get the embedding for a given entity.
   const ir2vec::Embedding &operator[](unsigned Opcode) const;
   const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 898bf5b202feb..95f30fd3f4275 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -215,7 +215,7 @@ Vocabulary::Vocabulary(VocabVector &&Vocab)
 : Vocab(std::move(Vocab)), Valid(true) {}
 
 bool Vocabulary::isValid() const {
-  return Vocab.size() == (MaxOpcodes + MaxTypeIDs + MaxOperandKinds) && Valid;
+  return Vocab.size() == Vocabulary::expectedSize() && Valid;
 }
 
 size_t Vocabulary::size() const {
@@ -324,8 +324,24 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const 
Value *Op) {
   return OperandKind::VariableID;
 }
 
+unsigned Vocabulary::getNumericID(unsigned Opcode) {
+  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+  return Opcode - 1; // Convert to zero-based index
+}
+
+unsigned Vocabulary::getNumericID(Type::TypeID TypeID) {
+  assert(static_cast(TypeID) < MaxTypeIDs && "Invalid type ID");
+  return MaxOpcodes + static_cast(TypeID);
+}
+
+unsigned Vocabulary::getNumericID(const Value *Op) {
+  unsigned Index = static_cast(getOperandKind(Op));
+  assert(Index < MaxOperandKinds && "Invalid OperandKind");
+  return MaxOpcodes + MaxTypeIDs + Index;
+}
+
 StringRef Vocabulary::getStringKey(unsigned Pos) {
-  assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds &&
+  assert(Pos < Vocabulary::expectedSize() &&
  "Position out of bounds in vocabulary");
   // Opcode
   if (Pos < MaxOpcodes)
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index cb6d633306a81..7c9a5464bfe1d 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -396,6 +396,69 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) {
   }
 }
 
+TEST(IR2VecVocabularyTest, NumericIDMap) {
+  // Test getNumericID for opcodes
+  EXPECT_EQ(Vocabulary::getNumericID(1u), 0u);
+  EXPECT_EQ(Vocabulary::getNumericID(13u), 12u);
+  EXPECT_EQ(Vocabulary::getNumericID(MaxOpcodes), MaxOpcodes - 1);
+
+  // Test getNumericID for Type IDs
+  EXPECT_EQ(Vocabulary::getNumericID(Type::VoidTyID),
+MaxOpcodes + static_cast(Type::VoidTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::HalfTyID),
+MaxOpcodes + static_cast(Type::HalfTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::FloatTyID),
+MaxOpcodes + static_cast(Type::FloatTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::IntegerTyID),
+MaxOpcodes + static_cast(Type::IntegerTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::PointerTyID),
+MaxOpcodes + static_cast(Type::PointerTyID));
+
+  // Test getNumericID for Value operands
+  LLVMContext Ctx;
+  Module M("TestM", Ctx);
+  FunctionType *FTy =
+  FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "testFunc", 
M);
+
+  // Test Function operand
+  EXPECT_EQ(Vocabulary::getNumericID(F),
+MaxOpcodes + MaxTypeIDs + 0u); // Function = 0
+
+  // Test Constant operand
+  Constant *C = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
+  EXPECT_EQ(Vocabulary::getNumericID(C),
+MaxOpcodes + MaxTypeIDs + 2u); // Constant = 2
+
+  // Test Pointer operand
+  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+  AllocaInst *PtrVal = new AllocaInst(Type::getInt32Ty(Ctx), 0, "ptr", BB);
+  EXPECT

[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/148719

>From a395af507d3ca470e7dfcb20a8078de556bb34d6 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Mon, 14 Jul 2025 20:27:42 +
Subject: [PATCH] [IR2Vec] Adding documentation for llvm-ir2vec tool

---
 llvm/docs/CommandGuide/index.rst   |   1 +
 llvm/docs/CommandGuide/llvm-ir2vec.rst | 170 +
 llvm/docs/MLGO.rst |  12 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp |   8 +-
 4 files changed, 184 insertions(+), 7 deletions(-)
 create mode 100644 llvm/docs/CommandGuide/llvm-ir2vec.rst

diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst
index 88fc1fd326b76..f85f32a1fdd51 100644
--- a/llvm/docs/CommandGuide/index.rst
+++ b/llvm/docs/CommandGuide/index.rst
@@ -27,6 +27,7 @@ Basic Commands
llvm-dis
llvm-dwarfdump
llvm-dwarfutil
+   llvm-ir2vec
llvm-lib
llvm-libtool-darwin
llvm-link
diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
new file mode 100644
index 0..13fe4996b968f
--- /dev/null
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -0,0 +1,170 @@
+llvm-ir2vec - IR2Vec Embedding Generation Tool
+==============================================
+
+.. program:: llvm-ir2vec
+
+SYNOPSIS
+--------
+
+:program:`llvm-ir2vec` [*options*] *input-file*
+
+DESCRIPTION
+-----------
+
+:program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
+generates IR2Vec embeddings for LLVM IR and supports triplet generation 
+for vocabulary training. It provides two main operation modes:
+
+1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+   training from LLVM IR.
+
+2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+   at different granularity levels (instruction, basic block, or function).
+
+The tool is designed to facilitate machine learning applications that work with
+LLVM IR by converting the IR into numerical representations that can be used by
+ML models.
+
+.. note::
+
+   For information about using IR2Vec programmatically within LLVM passes and 
+   the C++ API, see the `IR2Vec Embeddings <https://llvm.org/docs/MLGO.html#ir2vec-embeddings>`_
+   section in the MLGO documentation.
+
+OPERATION MODES
+---------------
+
+Triplet Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
+consisting of opcodes, types, and operands. These triplets can be used to train
+vocabularies for embedding generation.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+
+Embedding Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In embedding mode, :program:`llvm-ir2vec` uses a pre-trained vocabulary to
+generate numerical embeddings for LLVM IR at different levels of granularity.
+
+Example Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json --level=func 
input.bc -o embeddings.txt
+
+OPTIONS
+-------
+
+.. option:: --mode=<mode>
+
+ Specify the operation mode. Valid values are:
+
+ * ``triplets`` - Generate triplets for vocabulary training
+ * ``embeddings`` - Generate embeddings using trained vocabulary (default)
+
+.. option:: --level=<level>
+
+ Specify the embedding generation level. Valid values are:
+
+ * ``inst`` - Generate instruction-level embeddings
+ * ``bb`` - Generate basic block-level embeddings  
+ * ``func`` - Generate function-level embeddings (default)
+
+.. option:: --function=<name>
+
+ Process only the specified function instead of all functions in the module.
+
+.. option:: --ir2vec-vocab-path=<path>
+
+ Specify the path to the vocabulary file (required for embedding mode).
+ The vocabulary file should be in JSON format and contain the trained
+ vocabulary for embedding generation. See `llvm/lib/Analysis/models`
+ for pre-trained vocabulary files.
+
+.. option:: --ir2vec-opc-weight=<weight>
+
+ Specify the weight for opcode embeddings (default: 1.0). This controls
+ the relative importance of instruction opcodes in the final embedding.
+
+.. option:: --ir2vec-type-weight=<weight>
+
+ Specify the weight for type embeddings (default: 0.5). This controls
+ the relative importance of type information in the final embedding.
+
+.. option:: --ir2vec-arg-weight=<weight>
+
+ Specify the weight for argument embeddings (default: 0.2). This controls
+ the relative importance of operand information in the final embedding.
+
+.. option:: -o <filename>
+
+ Specify the output filename. Use ``-`` to write to standard output (default).
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. note::
+
+   ``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
+   ``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
+   mode. These options are ignored in triplet mode.
+
+INPUT FILE FORMAT
+-----------------
+
+:program:`llvm-ir2vec` accepts LLVM bitcode files (``.b

[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From c0360c730270ac3f51d7a887f430584f488f459c Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 194 --
 4 files changed, 260 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Revamp triplet generation and add entity mapping mode (PR #149214)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149214

>From db6db83e5ee2ce1503bd041cbb975b36c0fc59c9 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:03:56 +
Subject: [PATCH 1/2] revamp-triplet-gen

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|  79 -
 llvm/test/tools/llvm-ir2vec/entities.ll   |  95 ++
 llvm/test/tools/llvm-ir2vec/triplets.ll   |  51 ++-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 204 
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 5 files changed, 627 insertions(+), 93 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/entities.ll
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 13fe4996b968f..56ece4f509f6e 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -13,17 +13,21 @@ DESCRIPTION
 
 :program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
 generates IR2Vec embeddings for LLVM IR and supports triplet generation 
-for vocabulary training. It provides two main operation modes:
+for vocabulary training. It provides three main operation modes:
 
-1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+1. **Triplet Mode**: Generates numeric triplets in train2id format for 
vocabulary
training from LLVM IR.
 
-2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+2. **Entity Mode**: Generates entity mapping files (entity2id.txt) for 
vocabulary 
+   training.
+
+3. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
at different granularity levels (instruction, basic block, or function).
 
 The tool is designed to facilitate machine learning applications that work with
 LLVM IR by converting the IR into numerical representations that can be used by
-ML models.
+ML models. The triplet mode generates numeric IDs directly instead of string 
+triplets, streamlining the training data preparation workflow.
 
 .. note::
 
@@ -34,18 +38,46 @@ ML models.
 OPERATION MODES
 ---
 
+Triplet Generation and Entity Mapping Modes are used for preparing
+vocabulary and training data for knowledge graph embeddings. The Embedding Mode
+is used for generating embeddings from LLVM IR using a pre-trained vocabulary.
+
+The Seed Embedding Vocabulary of IR2Vec is trained on a large corpus of LLVM IR
+by modeling the relationships between opcodes, types, and operands as a 
knowledge
+graph. For this purpose, Triplet Generation and Entity Mapping Modes generate
+triplets and entity mappings in the standard format used for knowledge graph
+embedding training (see 
+
 
+for details).
+
 Triplet Generation Mode
 ~~~
 
-In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
-consisting of opcodes, types, and operands. These triplets can be used to train
-vocabularies for embedding generation.
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts numeric
+triplets consisting of opcode IDs, type IDs, and operand IDs. These triplets 
+are generated in train2id format. The tool outputs numeric IDs directly using 
+the ir2vec::Vocabulary mapping infrastructure, eliminating the need for 
+string-to-ID preprocessing.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets_train2id.txt
+
+Entity Mapping Generation Mode
+~~~
+
+In entity mode, :program:`llvm-ir2vec` generates the entity mappings supported 
by
+IR2Vec in entity2id format. This mode outputs all supported entities (opcodes, 
+types, and operands) with their corresponding numeric IDs, and is not specific to
+an LLVM IR file.
 
 Usage:
 
 .. code-block:: bash
 
-   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+   llvm-ir2vec --mode=entities -o entity2id.txt
 
 Embedding Generation Mode
 ~~
@@ -67,6 +99,7 @@ OPTIONS
  Specify the operation mode. Valid values are:
 
  * ``triplets`` - Generate triplets for vocabulary training
+ * ``entities`` - Generate entity mappings for vocabulary training
  * ``embeddings`` - Generate embeddings using trained vocabulary (default)
 
 .. option:: --level=
@@ -115,7 +148,7 @@ OPTIONS
 
``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
-   mode. These options are ignored in triplet mode.
+   mode. These options are ignored in triplet and entity modes.
 
 INPUT FILE FORMAT
 -
@@ -129,14 +162,34 @@ OUTPUT FORMAT
 Triplet Mode Output
 ~~~
 
-In triplet mode, the output consists of lines containing space-separated 
triplets:
+In triplet mode, the o

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Revamp triplet generation and add entity mapping mode (PR #149214)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149214

>From db6db83e5ee2ce1503bd041cbb975b36c0fc59c9 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:03:56 +
Subject: [PATCH 1/2] revamp-triplet-gen

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|  79 -
 llvm/test/tools/llvm-ir2vec/entities.ll   |  95 ++
 llvm/test/tools/llvm-ir2vec/triplets.ll   |  51 ++-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 204 
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 5 files changed, 627 insertions(+), 93 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/entities.ll
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 13fe4996b968f..56ece4f509f6e 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -13,17 +13,21 @@ DESCRIPTION
 
 :program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
 generates IR2Vec embeddings for LLVM IR and supports triplet generation 
-for vocabulary training. It provides two main operation modes:
+for vocabulary training. It provides three main operation modes:
 
-1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+1. **Triplet Mode**: Generates numeric triplets in train2id format for 
vocabulary
training from LLVM IR.
 
-2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+2. **Entity Mode**: Generates entity mapping files (entity2id.txt) for 
vocabulary 
+   training.
+
+3. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
at different granularity levels (instruction, basic block, or function).
 
 The tool is designed to facilitate machine learning applications that work with
 LLVM IR by converting the IR into numerical representations that can be used by
-ML models.
+ML models. The triplet mode generates numeric IDs directly instead of string 
+triplets, streamlining the training data preparation workflow.
 
 .. note::
 
@@ -34,18 +38,46 @@ ML models.
 OPERATION MODES
 ---
 
+Triplet Generation and Entity Mapping Modes are used for preparing
+vocabulary and training data for knowledge graph embeddings. The Embedding Mode
+is used for generating embeddings from LLVM IR using a pre-trained vocabulary.
+
+The Seed Embedding Vocabulary of IR2Vec is trained on a large corpus of LLVM IR
+by modeling the relationships between opcodes, types, and operands as a 
knowledge
+graph. For this purpose, Triplet Generation and Entity Mapping Modes generate
+triplets and entity mappings in the standard format used for knowledge graph
+embedding training (see 
+
 
+for details).
+
 Triplet Generation Mode
 ~~~
 
-In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
-consisting of opcodes, types, and operands. These triplets can be used to train
-vocabularies for embedding generation.
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts numeric
+triplets consisting of opcode IDs, type IDs, and operand IDs. These triplets 
+are generated in train2id format. The tool outputs numeric IDs directly using 
+the ir2vec::Vocabulary mapping infrastructure, eliminating the need for 
+string-to-ID preprocessing.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets_train2id.txt
+
+Entity Mapping Generation Mode
+~~~
+
+In entity mode, :program:`llvm-ir2vec` generates the entity mappings supported 
by
+IR2Vec in entity2id format. This mode outputs all supported entities (opcodes, 
+types, and operands) with their corresponding numeric IDs, and is not specific to
+an LLVM IR file.
 
 Usage:
 
 .. code-block:: bash
 
-   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+   llvm-ir2vec --mode=entities -o entity2id.txt
 
 Embedding Generation Mode
 ~~
@@ -67,6 +99,7 @@ OPTIONS
  Specify the operation mode. Valid values are:
 
  * ``triplets`` - Generate triplets for vocabulary training
+ * ``entities`` - Generate entity mappings for vocabulary training
  * ``embeddings`` - Generate embeddings using trained vocabulary (default)
 
 .. option:: --level=
@@ -115,7 +148,7 @@ OPTIONS
 
``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
-   mode. These options are ignored in triplet mode.
+   mode. These options are ignored in triplet and entity modes.
 
 INPUT FILE FORMAT
 -
@@ -129,14 +162,34 @@ OUTPUT FORMAT
 Triplet Mode Output
 ~~~
 
-In triplet mode, the output consists of lines containing space-separated 
triplets:
+In triplet mode, the o

[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/148719

>From a395af507d3ca470e7dfcb20a8078de556bb34d6 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Mon, 14 Jul 2025 20:27:42 +
Subject: [PATCH] [IR2Vec] Adding documentation for llvm-ir2vec tool

---
 llvm/docs/CommandGuide/index.rst   |   1 +
 llvm/docs/CommandGuide/llvm-ir2vec.rst | 170 +
 llvm/docs/MLGO.rst |  12 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp |   8 +-
 4 files changed, 184 insertions(+), 7 deletions(-)
 create mode 100644 llvm/docs/CommandGuide/llvm-ir2vec.rst

diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst
index 88fc1fd326b76..f85f32a1fdd51 100644
--- a/llvm/docs/CommandGuide/index.rst
+++ b/llvm/docs/CommandGuide/index.rst
@@ -27,6 +27,7 @@ Basic Commands
llvm-dis
llvm-dwarfdump
llvm-dwarfutil
+   llvm-ir2vec
llvm-lib
llvm-libtool-darwin
llvm-link
diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
new file mode 100644
index 0..13fe4996b968f
--- /dev/null
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -0,0 +1,170 @@
+llvm-ir2vec - IR2Vec Embedding Generation Tool
+==============================================
+
+.. program:: llvm-ir2vec
+
+SYNOPSIS
+--------
+
+:program:`llvm-ir2vec` [*options*] *input-file*
+
+DESCRIPTION
+-----------
+
+:program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
+generates IR2Vec embeddings for LLVM IR and supports triplet generation 
+for vocabulary training. It provides two main operation modes:
+
+1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+   training from LLVM IR.
+
+2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+   at different granularity levels (instruction, basic block, or function).
+
+The tool is designed to facilitate machine learning applications that work with
+LLVM IR by converting the IR into numerical representations that can be used by
+ML models.
+
+.. note::
+
+   For information about using IR2Vec programmatically within LLVM passes and 
+   the C++ API, see the `IR2Vec Embeddings <https://llvm.org/docs/MLGO.html#ir2vec-embeddings>`_ 
+   section in the MLGO documentation.
+
+OPERATION MODES
+---------------
+
+Triplet Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
+consisting of opcodes, types, and operands. These triplets can be used to train
+vocabularies for embedding generation.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+
+Embedding Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In embedding mode, :program:`llvm-ir2vec` uses a pre-trained vocabulary to
+generate numerical embeddings for LLVM IR at different levels of granularity.
+
+Example Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json --level=func 
input.bc -o embeddings.txt
+
+OPTIONS
+-------
+
+.. option:: --mode=<mode>
+
+ Specify the operation mode. Valid values are:
+
+ * ``triplets`` - Generate triplets for vocabulary training
+ * ``embeddings`` - Generate embeddings using trained vocabulary (default)
+
+.. option:: --level=<level>
+
+ Specify the embedding generation level. Valid values are:
+
+ * ``inst`` - Generate instruction-level embeddings
+ * ``bb`` - Generate basic block-level embeddings  
+ * ``func`` - Generate function-level embeddings (default)
+
+.. option:: --function=<name>
+
+ Process only the specified function instead of all functions in the module.
+
+.. option:: --ir2vec-vocab-path=<path>
+
+ Specify the path to the vocabulary file (required for embedding mode).
+ The vocabulary file should be in JSON format and contain the trained
+ vocabulary for embedding generation. See `llvm/lib/Analysis/models`
+ for pre-trained vocabulary files.
+
+.. option:: --ir2vec-opc-weight=<weight>
+
+ Specify the weight for opcode embeddings (default: 1.0). This controls
+ the relative importance of instruction opcodes in the final embedding.
+
+.. option:: --ir2vec-type-weight=<weight>
+
+ Specify the weight for type embeddings (default: 0.5). This controls
+ the relative importance of type information in the final embedding.
+
+.. option:: --ir2vec-arg-weight=<weight>
+
+ Specify the weight for argument embeddings (default: 0.2). This controls
+ the relative importance of operand information in the final embedding.
+
+.. option:: -o <filename>
+
+ Specify the output filename. Use ``-`` to write to standard output (default).
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. note::
+
+   ``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
+   ``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
+   mode. These options are ignored in triplet mode.
+
+INPUT FILE FORMAT
+-----------------
+
+:program:`llvm-ir2vec` accepts LLVM bitcode files (``.b

[llvm-branch-commits] [llvm] [IR2Vec][NFC] Add helper methods for numeric ID mapping in Vocabulary (PR #149212)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149212

>From 1d7ca8076757401353b403256f03ae9498dbe404 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 21:49:05 +
Subject: [PATCH] exposing-new-methods

---
 llvm/include/llvm/Analysis/IR2Vec.h|  9 
 llvm/lib/Analysis/IR2Vec.cpp   | 20 +++-
 llvm/unittests/Analysis/IR2VecTest.cpp | 63 ++
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 3d7edf08c8807..d87457cac7642 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,6 +170,10 @@ class Vocabulary {
   unsigned getDimension() const;
   size_t size() const;
 
+  static size_t expectedSize() {
+return MaxOpcodes + MaxTypeIDs + MaxOperandKinds;
+  }
+
   /// Helper function to get vocabulary key for a given Opcode
   static StringRef getVocabKeyForOpcode(unsigned Opcode);
 
@@ -182,6 +186,11 @@ class Vocabulary {
   /// Helper function to classify an operand into OperandKind
   static OperandKind getOperandKind(const Value *Op);
 
+  /// Helpers to return the IDs of a given Opcode, TypeID, or OperandKind
+  static unsigned getNumericID(unsigned Opcode);
+  static unsigned getNumericID(Type::TypeID TypeID);
+  static unsigned getNumericID(const Value *Op);
+
   /// Accessors to get the embedding for a given entity.
   const ir2vec::Embedding &operator[](unsigned Opcode) const;
   const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 898bf5b202feb..95f30fd3f4275 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -215,7 +215,7 @@ Vocabulary::Vocabulary(VocabVector &&Vocab)
 : Vocab(std::move(Vocab)), Valid(true) {}
 
 bool Vocabulary::isValid() const {
-  return Vocab.size() == (MaxOpcodes + MaxTypeIDs + MaxOperandKinds) && Valid;
+  return Vocab.size() == Vocabulary::expectedSize() && Valid;
 }
 
 size_t Vocabulary::size() const {
@@ -324,8 +324,24 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const 
Value *Op) {
   return OperandKind::VariableID;
 }
 
+unsigned Vocabulary::getNumericID(unsigned Opcode) {
+  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+  return Opcode - 1; // Convert to zero-based index
+}
+
+unsigned Vocabulary::getNumericID(Type::TypeID TypeID) {
+  assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
+  return MaxOpcodes + static_cast<unsigned>(TypeID);
+}
+
+unsigned Vocabulary::getNumericID(const Value *Op) {
+  unsigned Index = static_cast<unsigned>(getOperandKind(Op));
+  assert(Index < MaxOperandKinds && "Invalid OperandKind");
+  return MaxOpcodes + MaxTypeIDs + Index;
+}
+
 StringRef Vocabulary::getStringKey(unsigned Pos) {
-  assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds &&
+  assert(Pos < Vocabulary::expectedSize() &&
  "Position out of bounds in vocabulary");
   // Opcode
   if (Pos < MaxOpcodes)
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index cb6d633306a81..7c9a5464bfe1d 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -396,6 +396,69 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) {
   }
 }
 
+TEST(IR2VecVocabularyTest, NumericIDMap) {
+  // Test getNumericID for opcodes
+  EXPECT_EQ(Vocabulary::getNumericID(1u), 0u);
+  EXPECT_EQ(Vocabulary::getNumericID(13u), 12u);
+  EXPECT_EQ(Vocabulary::getNumericID(MaxOpcodes), MaxOpcodes - 1);
+
+  // Test getNumericID for Type IDs
+  EXPECT_EQ(Vocabulary::getNumericID(Type::VoidTyID),
+MaxOpcodes + static_cast<unsigned>(Type::VoidTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::HalfTyID),
+MaxOpcodes + static_cast<unsigned>(Type::HalfTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::FloatTyID),
+MaxOpcodes + static_cast<unsigned>(Type::FloatTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::IntegerTyID),
+MaxOpcodes + static_cast<unsigned>(Type::IntegerTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::PointerTyID),
+MaxOpcodes + static_cast<unsigned>(Type::PointerTyID));
+
+  // Test getNumericID for Value operands
+  LLVMContext Ctx;
+  Module M("TestM", Ctx);
+  FunctionType *FTy =
+  FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "testFunc", 
M);
+
+  // Test Function operand
+  EXPECT_EQ(Vocabulary::getNumericID(F),
+MaxOpcodes + MaxTypeIDs + 0u); // Function = 0
+
+  // Test Constant operand
+  Constant *C = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
+  EXPECT_EQ(Vocabulary::getNumericID(C),
+MaxOpcodes + MaxTypeIDs + 2u); // Constant = 2
+
+  // Test Pointer operand
+  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+  AllocaInst *PtrVal = new AllocaInst(Type::getInt32Ty(Ctx), 0, "ptr", BB);
+  EXPECT

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (PR #149213)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149213

>From 090355226c63ebe3a010061d2dab545f217edf5c Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:01:47 +
Subject: [PATCH] support-stdin-input-llvm-ir2vec

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 4e88282e85c14..e3aa7bd1b3b1e 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -48,10 +48,10 @@ namespace ir2vec {
 
 static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
 
-static cl::opt<std::string> InputFilename(cl::Positional,
-  cl::desc(""),
-  cl::Required,
-  cl::cat(IR2VecToolCategory));
+static cl::opt<std::string>
+InputFilename(cl::Positional,
+  cl::desc(""),
+  cl::init("-"), cl::cat(IR2VecToolCategory));
 
 static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"),
@@ -287,10 +287,14 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
-  std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+  std::unique_ptr<Module> M;
+
+  // Read from file or stdin
+  M = parseIRFile(InputFilename, Err, Context);
+
   if (!M) {
 Err.print(argv[0], errs());
 return 1;

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From 0007c062b403d12347b54e28494c5037a0d21cfd Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From 0007c062b403d12347b54e28494c5037a0d21cfd Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> 
<ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/147844

>From c0360c730270ac3f51d7a887f430584f488f459c Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 9 Jul 2025 22:44:03 +
Subject: [PATCH] IR2Vec Tool Enhancements

---
 llvm/test/lit.cfg.py  |   7 +
 llvm/test/tools/llvm-ir2vec/embeddings.ll |  73 
 llvm/test/tools/llvm-ir2vec/triplets.ll   |   2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 194 --
 4 files changed, 260 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 771d9245368b1..8a1b001695edc 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -93,6 +93,13 @@ def get_asan_rtlib():
 config.substitutions.append(("%exeext", config.llvm_exe_ext))
 config.substitutions.append(("%llvm_src_root", config.llvm_src_root))
 
+# Add IR2Vec test vocabulary path substitution
+config.substitutions.append(
+(
+"%ir2vec_test_vocab_dir",
+os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"),
+)
+)
 
 lli_args = []
 # The target triple used by default by lli is the process target triple (some
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll 
b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 
2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat 
--ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | 
FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) 
#0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 878.00  889.00  900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00  889.00  900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  
92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 
 98.00  99.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (PR #149213)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149213

>From 090355226c63ebe3a010061d2dab545f217edf5c Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:01:47 +
Subject: [PATCH] support-stdin-input-llvm-ir2vec

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 4e88282e85c14..e3aa7bd1b3b1e 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -48,10 +48,10 @@ namespace ir2vec {
 
 static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
 
-static cl::opt<std::string> InputFilename(cl::Positional,
-  cl::desc(""),
-  cl::Required,
-  cl::cat(IR2VecToolCategory));
+static cl::opt<std::string>
+InputFilename(cl::Positional,
+  cl::desc(""),
+  cl::init("-"), cl::cat(IR2VecToolCategory));
 
 static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"),
@@ -287,10 +287,14 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
-  std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+  std::unique_ptr<Module> M;
+
+  // Read from file or stdin
+  M = parseIRFile(InputFilename, Err, Context);
+
   if (!M) {
 Err.print(argv[0], errs());
 return 1;

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From c4122999df1f8a2a4b4203fdad206a17d787c3d0 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> 
<ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From e088eb8b169eb292c17ebe33b0d2106f628dce6d Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> 
<ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From e088eb8b169eb292c17ebe33b0d2106f628dce6d Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py <llvm_build_dir> <num_optimizations> 
<ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (PR #149213)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149213

>From 36ecab51e81c5715ca22912d18b5ae5fac33c52c Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:01:47 +
Subject: [PATCH] support-stdin-input-llvm-ir2vec

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 4e88282e85c14..5743ec0f4c1e9 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -48,10 +48,10 @@ namespace ir2vec {
 
 static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
 
-static cl::opt<std::string> InputFilename(cl::Positional,
-  cl::desc(""),
-  cl::Required,
-  cl::cat(IR2VecToolCategory));
+static cl::opt<std::string>
+InputFilename(cl::Positional,
+  cl::desc(""),
+  cl::init("-"), cl::cat(IR2VecToolCategory));
 
 static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"),
@@ -287,10 +287,11 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
   std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+
   if (!M) {
 Err.print(argv[0], errs());
 return 1;

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Revamp triplet generation and add entity mapping mode (PR #149214)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149214

>From 1212c724f1e93daefada8ce591aba0b8390ea6d1 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:03:56 +
Subject: [PATCH 1/2] revamp-triplet-gen

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|  79 -
 llvm/test/tools/llvm-ir2vec/entities.ll   |  95 ++
 llvm/test/tools/llvm-ir2vec/triplets.ll   |  51 ++-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 204 
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 5 files changed, 627 insertions(+), 93 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/entities.ll
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 13fe4996b968f..56ece4f509f6e 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -13,17 +13,21 @@ DESCRIPTION
 
 :program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
 generates IR2Vec embeddings for LLVM IR and supports triplet generation 
-for vocabulary training. It provides two main operation modes:
+for vocabulary training. It provides three main operation modes:
 
-1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+1. **Triplet Mode**: Generates numeric triplets in train2id format for 
vocabulary
training from LLVM IR.
 
-2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+2. **Entity Mode**: Generates entity mapping files (entity2id.txt) for 
vocabulary 
+   training.
+
+3. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
at different granularity levels (instruction, basic block, or function).
 
 The tool is designed to facilitate machine learning applications that work with
 LLVM IR by converting the IR into numerical representations that can be used by
-ML models.
+ML models. The triplet mode generates numeric IDs directly instead of string 
+triplets, streamlining the training data preparation workflow.
 
 .. note::
 
@@ -34,18 +38,46 @@ ML models.
 OPERATION MODES
 ---
 
+Triplet Generation and Entity Mapping Modes are used for preparing
+vocabulary and training data for knowledge graph embeddings. The Embedding Mode
+is used for generating embeddings from LLVM IR using a pre-trained vocabulary.
+
+The Seed Embedding Vocabulary of IR2Vec is trained on a large corpus of LLVM IR
+by modeling the relationships between opcodes, types, and operands as a 
knowledge
+graph. For this purpose, Triplet Generation and Entity Mapping Modes generate
+triplets and entity mappings in the standard format used for knowledge graph
+embedding training (see 
+
 
+for details).
+
 Triplet Generation Mode
 ~~~
 
-In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
-consisting of opcodes, types, and operands. These triplets can be used to train
-vocabularies for embedding generation.
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts numeric
+triplets consisting of opcode IDs, type IDs, and operand IDs. These triplets 
+are generated in train2id format. The tool outputs numeric IDs directly using 
+the ir2vec::Vocabulary mapping infrastructure, eliminating the need for 
+string-to-ID preprocessing.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets_train2id.txt
+
+Entity Mapping Generation Mode
+~~~
+
+In entity mode, :program:`llvm-ir2vec` generates the entity mappings supported 
by
+IR2Vec in entity2id format. This mode outputs all supported entities (opcodes, 
+types, and operands) with their corresponding numeric IDs, and is not specific 
to 
+an LLVM IR file.
 
 Usage:
 
 .. code-block:: bash
 
-   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+   llvm-ir2vec --mode=entities -o entity2id.txt
 
 Embedding Generation Mode
 ~~
@@ -67,6 +99,7 @@ OPTIONS
  Specify the operation mode. Valid values are:
 
  * ``triplets`` - Generate triplets for vocabulary training
+ * ``entities`` - Generate entity mappings for vocabulary training
  * ``embeddings`` - Generate embeddings using trained vocabulary (default)
 
 .. option:: --level=
@@ -115,7 +148,7 @@ OPTIONS
 
``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
-   mode. These options are ignored in triplet mode.
+   mode. These options are ignored in triplet and entity modes.
 
 INPUT FILE FORMAT
 -
@@ -129,14 +162,34 @@ OUTPUT FORMAT
 Triplet Mode Output
 ~~~
 
-In triplet mode, the output consists of lines containing space-separated 
triplets:
+In triplet mode, the o

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Revamp triplet generation and add entity mapping mode (PR #149214)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149214

>From 1212c724f1e93daefada8ce591aba0b8390ea6d1 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:03:56 +
Subject: [PATCH 1/2] revamp-triplet-gen

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|  79 -
 llvm/test/tools/llvm-ir2vec/entities.ll   |  95 ++
 llvm/test/tools/llvm-ir2vec/triplets.ll   |  51 ++-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 204 
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 5 files changed, 627 insertions(+), 93 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/entities.ll
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 13fe4996b968f..56ece4f509f6e 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -13,17 +13,21 @@ DESCRIPTION
 
 :program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
 generates IR2Vec embeddings for LLVM IR and supports triplet generation 
-for vocabulary training. It provides two main operation modes:
+for vocabulary training. It provides three main operation modes:
 
-1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+1. **Triplet Mode**: Generates numeric triplets in train2id format for 
vocabulary
training from LLVM IR.
 
-2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+2. **Entity Mode**: Generates entity mapping files (entity2id.txt) for 
vocabulary 
+   training.
+
+3. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
at different granularity levels (instruction, basic block, or function).
 
 The tool is designed to facilitate machine learning applications that work with
 LLVM IR by converting the IR into numerical representations that can be used by
-ML models.
+ML models. The triplet mode generates numeric IDs directly instead of string 
+triplets, streamlining the training data preparation workflow.
 
 .. note::
 
@@ -34,18 +38,46 @@ ML models.
 OPERATION MODES
 ---
 
+Triplet Generation and Entity Mapping Modes are used for preparing
+vocabulary and training data for knowledge graph embeddings. The Embedding Mode
+is used for generating embeddings from LLVM IR using a pre-trained vocabulary.
+
+The Seed Embedding Vocabulary of IR2Vec is trained on a large corpus of LLVM IR
+by modeling the relationships between opcodes, types, and operands as a 
knowledge
+graph. For this purpose, Triplet Generation and Entity Mapping Modes generate
+triplets and entity mappings in the standard format used for knowledge graph
+embedding training (see 
+
 
+for details).
+
 Triplet Generation Mode
 ~~~
 
-In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
-consisting of opcodes, types, and operands. These triplets can be used to train
-vocabularies for embedding generation.
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts numeric
+triplets consisting of opcode IDs, type IDs, and operand IDs. These triplets 
+are generated in train2id format. The tool outputs numeric IDs directly using 
+the ir2vec::Vocabulary mapping infrastructure, eliminating the need for 
+string-to-ID preprocessing.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets_train2id.txt
+
+Entity Mapping Generation Mode
+~~~
+
+In entity mode, :program:`llvm-ir2vec` generates the entity mappings supported 
by
+IR2Vec in entity2id format. This mode outputs all supported entities (opcodes, 
+types, and operands) with their corresponding numeric IDs, and is not specific 
to 
+an LLVM IR file.
 
 Usage:
 
 .. code-block:: bash
 
-   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+   llvm-ir2vec --mode=entities -o entity2id.txt
 
 Embedding Generation Mode
 ~~
@@ -67,6 +99,7 @@ OPTIONS
  Specify the operation mode. Valid values are:
 
  * ``triplets`` - Generate triplets for vocabulary training
+ * ``entities`` - Generate entity mappings for vocabulary training
  * ``embeddings`` - Generate embeddings using trained vocabulary (default)
 
 .. option:: --level=
@@ -115,7 +148,7 @@ OPTIONS
 
``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
-   mode. These options are ignored in triplet mode.
+   mode. These options are ignored in triplet and entity modes.
 
 INPUT FILE FORMAT
 -
@@ -129,14 +162,34 @@ OUTPUT FORMAT
 Triplet Mode Output
 ~~~
 
-In triplet mode, the output consists of lines containing space-separated 
triplets:
+In triplet mode, the o

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (PR #149213)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149213

>From 36ecab51e81c5715ca22912d18b5ae5fac33c52c Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:01:47 +
Subject: [PATCH] support-stdin-input-llvm-ir2vec

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 4e88282e85c14..5743ec0f4c1e9 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -48,10 +48,10 @@ namespace ir2vec {
 
 static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
 
-static cl::opt<std::string> InputFilename(cl::Positional,
-  cl::desc(""),
-  cl::Required,
-  cl::cat(IR2VecToolCategory));
+static cl::opt<std::string>
+InputFilename(cl::Positional,
+  cl::desc(""),
+  cl::init("-"), cl::cat(IR2VecToolCategory));
 
 static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"),
@@ -287,10 +287,11 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
   std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+
   if (!M) {
 Err.print(argv[0], errs());
 return 1;

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Adding documentation for llvm-ir2vec tool (PR #148719)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

svkeerthy wrote:

### Merge activity

* **Jul 17, 6:58 PM UTC**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/148719).


https://github.com/llvm/llvm-project/pull/148719
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (PR #147844)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

svkeerthy wrote:

### Merge activity

* **Jul 17, 6:58 PM UTC**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/147844).


https://github.com/llvm/llvm-project/pull/147844
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From d19f53d35c186d98c11cf093445254a41853bcae Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py   
 
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (PR #149213)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149213

>From 32275ce1e046510a779a9f6a1bd2cd64a242c1b2 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:01:47 +
Subject: [PATCH] support-stdin-input-llvm-ir2vec

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp 
b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 4e88282e85c14..e1e5fad13f413 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -48,10 +48,10 @@ namespace ir2vec {
 
 static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
 
-static cl::opt<std::string> InputFilename(cl::Positional,
-  cl::desc(""),
-  cl::Required,
-  cl::cat(IR2VecToolCategory));
+static cl::opt<std::string>
+InputFilename(cl::Positional,
+  cl::desc(""),
+  cl::init("-"), cl::cat(IR2VecToolCategory));
 
 static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"),
@@ -287,7 +287,7 @@ int main(int argc, char **argv) {
   if (Mode == TripletMode && Level.getNumOccurrences() > 0)
 errs() << "Warning: --level option is ignored in triplet mode\n";
 
-  // Parse the input LLVM IR file
+  // Parse the input LLVM IR file or stdin
   SMDiagnostic Err;
   LLVMContext Context;
   std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Revamp triplet generation and add entity mapping mode (PR #149214)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149214

>From 5d93b96d4bb6e6849b3ba293dce90b98b8bed468 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:03:56 +
Subject: [PATCH 1/2] revamp-triplet-gen

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|  79 -
 llvm/test/tools/llvm-ir2vec/entities.ll   |  95 ++
 llvm/test/tools/llvm-ir2vec/triplets.ll   |  51 ++-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp| 204 
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 5 files changed, 627 insertions(+), 93 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/entities.ll
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 13fe4996b968f..56ece4f509f6e 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -13,17 +13,21 @@ DESCRIPTION
 
 :program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
 generates IR2Vec embeddings for LLVM IR and supports triplet generation 
-for vocabulary training. It provides two main operation modes:
+for vocabulary training. It provides three main operation modes:
 
-1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary
+1. **Triplet Mode**: Generates numeric triplets in train2id format for 
vocabulary
training from LLVM IR.
 
-2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
+2. **Entity Mode**: Generates entity mapping files (entity2id.txt) for 
vocabulary 
+   training.
+
+3. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary
at different granularity levels (instruction, basic block, or function).
 
 The tool is designed to facilitate machine learning applications that work with
 LLVM IR by converting the IR into numerical representations that can be used by
-ML models.
+ML models. The triplet mode generates numeric IDs directly instead of string 
+triplets, streamlining the training data preparation workflow.
 
 .. note::
 
@@ -34,18 +38,46 @@ ML models.
 OPERATION MODES
 ---
 
+Triplet Generation and Entity Mapping Modes are used for preparing
+vocabulary and training data for knowledge graph embeddings. The Embedding Mode
+is used for generating embeddings from LLVM IR using a pre-trained vocabulary.
+
+The Seed Embedding Vocabulary of IR2Vec is trained on a large corpus of LLVM IR
+by modeling the relationships between opcodes, types, and operands as a 
knowledge
+graph. For this purpose, Triplet Generation and Entity Mapping Modes generate
+triplets and entity mappings in the standard format used for knowledge graph
+embedding training (see 
+
 
+for details).
+
 Triplet Generation Mode
 ~~~
 
-In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
-consisting of opcodes, types, and operands. These triplets can be used to train
-vocabularies for embedding generation.
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts numeric
+triplets consisting of opcode IDs, type IDs, and operand IDs. These triplets 
+are generated in train2id format. The tool outputs numeric IDs directly using 
+the ir2vec::Vocabulary mapping infrastructure, eliminating the need for 
+string-to-ID preprocessing.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets_train2id.txt
+
+Entity Mapping Generation Mode
+~~~
+
+In entity mode, :program:`llvm-ir2vec` generates the entity mappings supported 
by
+IR2Vec in entity2id format. This mode outputs all supported entities (opcodes, 
+types, and operands) with their corresponding numeric IDs, and is not specific 
to 
+an LLVM IR file.
 
 Usage:
 
 .. code-block:: bash
 
-   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+   llvm-ir2vec --mode=entities -o entity2id.txt
 
 Embedding Generation Mode
 ~~
@@ -67,6 +99,7 @@ OPTIONS
  Specify the operation mode. Valid values are:
 
  * ``triplets`` - Generate triplets for vocabulary training
+ * ``entities`` - Generate entity mappings for vocabulary training
  * ``embeddings`` - Generate embeddings using trained vocabulary (default)
 
 .. option:: --level=
@@ -115,7 +148,7 @@ OPTIONS
 
``--level``, ``--function``, ``--ir2vec-vocab-path``, 
``--ir2vec-opc-weight``, 
``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in 
embedding 
-   mode. These options are ignored in triplet mode.
+   mode. These options are ignored in triplet and entity modes.
 
 INPUT FILE FORMAT
 -
@@ -129,14 +162,34 @@ OUTPUT FORMAT
 Triplet Mode Output
 ~~~
 
-In triplet mode, the output consists of lines containing space-separated 
triplets:
+In triplet mode, the o

[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)

2025-07-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/149215

>From d19f53d35c186d98c11cf093445254a41853bcae Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Wed, 16 Jul 2025 22:45:36 +
Subject: [PATCH] triplet-ext-script

---
 llvm/docs/CommandGuide/llvm-ir2vec.rst|   3 +
 .../mlgo-utils/IR2Vec/generateTriplets.py | 291 ++
 2 files changed, 294 insertions(+)
 create mode 100644 llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py

diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst 
b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 56ece4f509f6e..e39a663e3be5a 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -50,6 +50,9 @@ embedding training (see
 

 
 for details).
 
+See `llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py` for more details on how
+these two modes are used to generate the triplets and entity mappings.
+
 Triplet Generation Mode
 ~~~
 
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py 
b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
new file mode 100644
index 0..0858d10ce0138
--- /dev/null
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR 
files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+python generateTriplets.py   
 
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+"""Result from processing a single LLVM IR file"""
+
+__slots__ = ["triplets", "max_relation"]
+
+def __init__(self, triplets: Set[str], max_relation: int):
+self.triplets = triplets
+self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+"""Main class for generating IR2Vec triplets"""
+
+def __init__(
+self,
+llvm_build_dir: Path,
+num_optimizations: int,
+output_dir: Path,
+max_workers: int = DEFAULT_MAX_WORKERS,
+):
+self.llvm_build_dir = llvm_build_dir
+self.num_optimizations = num_optimizations
+self.output_dir = output_dir
+self.max_workers = max_workers
+
+# Tool paths
+self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+self._validate_setup()
+
+def _validate_setup(self):
+"""Validate that all required tools and paths exist"""
+if not self.llvm_build_dir.exists():
+raise FileNotFoundError(
+f"LLVM build directory not found: {self.llvm_build_dir}"
+)
+
+if not os.path.isfile(self.opt_binary) or not os.access(
+self.opt_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"opt binary not found or not executable: {self.opt_binary}"
+)
+
+if not os.path.isfile(self.ir2vec_binary) or not os.access(
+self.ir2vec_binary, os.X_OK
+):
+raise FileNotFoundError(
+f"llvm-ir2vec binary not found or not executable: 
{self.ir2vec_binary}"
+)
+
+if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+raise ValueError(
+f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+)
+
+self.output_dir.mkdir(parents=True, exist_ok=True)
+
+def _select_optimization_levels(self) -> List[str]:
+"""Select unique random optimization levels"""
+return random.sample(OPT_LEVELS, self.num_optimizations)
+
+def _process_single_file(self, input_file: Path) -> TripletResult:
+"""Process a single LLVM IR file with multiple optimization levels"""
+all_triplets = set()
+max_relation = 1
+opt_levels = self._select_optimization_levels()
+
+for opt_level in opt_levels:
+try:
+triplets, file_max_relation = self._run_pipeline(input_file, 
opt_level)
+if triplets:
+all_triplets.update(triplets)
+max_relation = max(max_relation, file_max_relation)
+logger.debug(
+f"Generated {len(triplets)

[llvm-branch-commits] [llvm] [IR2Vec] Simplifying creation of Embedder (PR #143999)

2025-06-20 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/143999

>From 8b8932b55c8a6a087d516e174e1d57c9908259bd Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Thu, 12 Jun 2025 23:54:10 +
Subject: [PATCH] Simplifying creation of Embedder

---
 llvm/docs/MLGO.rst|  7 +--
 llvm/include/llvm/Analysis/IR2Vec.h   |  4 +-
 .../Analysis/FunctionPropertiesAnalysis.cpp   | 10 ++---
 llvm/lib/Analysis/IR2Vec.cpp  | 17 +++
 .../FunctionPropertiesAnalysisTest.cpp|  7 ++-
 llvm/unittests/Analysis/IR2VecTest.cpp| 44 +++
 6 files changed, 33 insertions(+), 56 deletions(-)

diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
index 28095447f6a5a..0b849f3382f63 100644
--- a/llvm/docs/MLGO.rst
+++ b/llvm/docs/MLGO.rst
@@ -482,14 +482,9 @@ embeddings can be computed and accessed via an 
``ir2vec::Embedder`` instance.
 
   // Assuming F is an llvm::Function&
   // For example, using IR2VecKind::Symbolic:
-  Expected<std::unique_ptr<ir2vec::Embedder>> EmbOrErr =
+  std::unique_ptr<ir2vec::Embedder> Emb =
   ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary);
 
-  if (auto Err = EmbOrErr.takeError()) {
-// Handle error in embedder creation
-return;
-  }
-  std::unique_ptr<ir2vec::Embedder> Emb = std::move(*EmbOrErr);
 
 3. **Compute and Access Embeddings**:
Call ``getFunctionVector()`` to get the embedding for the function. 
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 2a7a6edda70a8..06312562060aa 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,8 +170,8 @@ class Embedder {
   virtual ~Embedder() = default;
 
   /// Factory method to create an Embedder object.
-  static Expected<std::unique_ptr<Embedder>>
-  create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary);
+  static std::unique_ptr<Embedder> create(IR2VecKind Mode, const Function &F,
+  const Vocab &Vocabulary);
 
   /// Returns a map containing instructions and the corresponding embeddings 
for
   /// the function F if it has been computed. If not, it computes the 
embeddings
diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp 
b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
index 29d3aaf46dc06..dd4eb7f0df053 100644
--- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
+++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
@@ -204,16 +204,12 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock 
&BB,
 // We instantiate the IR2Vec embedder each time, as having an unique
 // pointer to the embedder as member of the class would make it
 // non-copyable. Instantiating the embedder in itself is not costly.
-auto EmbOrErr = ir2vec::Embedder::create(IR2VecKind::Symbolic,
+auto Embedder = ir2vec::Embedder::create(IR2VecKind::Symbolic,
  *BB.getParent(), *IR2VecVocab);
-if (Error Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-BB.getContext().emitError("Error creating IR2Vec embeddings: " +
-  EI.message());
-  });
+if (!Embedder) {
+  BB.getContext().emitError("Error creating IR2Vec embeddings");
   return;
 }
-auto Embedder = std::move(*EmbOrErr);
 const auto &BBEmbedding = Embedder->getBBVector(BB);
 // Subtract BBEmbedding from Function embedding if the direction is -1,
 // and add it if the direction is +1.
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 7ff7acebedf4e..27cc2a4109879 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -123,13 +123,14 @@ Embedder::Embedder(const Function &F, const Vocab 
&Vocabulary)
   Dimension(Vocabulary.begin()->second.size()), OpcWeight(::OpcWeight),
   TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {}
 
-Expected<std::unique_ptr<Embedder>>
-Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) {
+std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
+   const Vocab &Vocabulary) {
   switch (Mode) {
   case IR2VecKind::Symbolic:
 return std::make_unique<SymbolicEmbedder>(F, Vocabulary);
   }
-  return make_error<StringError>("Unknown IR2VecKind", errc::invalid_argument);
+  llvm_unreachable("Unknown IR2Vec kind");
+  return nullptr;
 }
 
 // FIXME: Currently lookups are string based. Use numeric Keys
@@ -384,17 +385,13 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
 
   auto Vocab = IR2VecVocabResult.getVocabulary();
   for (Function &F : M) {
-Expected<std::unique_ptr<Embedder>> EmbOrErr =
+std::unique_ptr<Embedder> Emb =
 Embedder::create(IR2VecKind::Symbolic, F, Vocab);
-if (auto Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-OS << "Error creating IR2Vec embeddings: " << EI.message() << "\n";
-  });
+if (!Emb) {
+  OS << "Error creating I

[llvm-branch-commits] [llvm] [NFC] Formatting PassRegistry.def (PR #144139)

2025-06-20 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/144139

>From 7fa87f2e42378d656ba743a4971e5c2ffaee8492 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 13 Jun 2025 18:22:10 +
Subject: [PATCH] [NFC] Formatting PassRegistry.def

---
 llvm/lib/Passes/PassRegistry.def | 40 ++--
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index ec14c6a9211d9..5256f1378b64c 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -63,7 +63,8 @@ MODULE_PASS("coro-early", CoroEarlyPass())
 MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass())
 MODULE_PASS("ctx-instr-gen",
 PGOInstrumentationGen(PGOInstrumentationType::CTXPROF))
-MODULE_PASS("ctx-prof-flatten", 
PGOCtxProfFlatteningPass(/*IsPreThinlink=*/false))
+MODULE_PASS("ctx-prof-flatten",
+PGOCtxProfFlatteningPass(/*IsPreThinlink=*/false))
 MODULE_PASS("ctx-prof-flatten-prethinlink",
 PGOCtxProfFlatteningPass(/*IsPreThinlink=*/true))
 MODULE_PASS("noinline-nonprevailing", NoinlineNonPrevailing())
@@ -74,7 +75,8 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
 MODULE_PASS("dxil-upgrade", DXILUpgradePass())
 MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
 MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
-MODULE_PASS("expand-variadics", 
ExpandVariadicsPass(ExpandVariadicsMode::Disable))
+MODULE_PASS("expand-variadics",
+ExpandVariadicsPass(ExpandVariadicsMode::Disable))
 MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
 MODULE_PASS("global-merge-func", GlobalMergeFuncPass())
@@ -104,7 +106,10 @@ MODULE_PASS("lower-ifunc", LowerIFuncPass())
 MODULE_PASS("simplify-type-tests", SimplifyTypeTestsPass())
 MODULE_PASS("lowertypetests", LowerTypeTestsPass())
 MODULE_PASS("fatlto-cleanup", FatLtoCleanup())
-MODULE_PASS("pgo-force-function-attrs", PGOForceFunctionAttrsPass(PGOOpt ? 
PGOOpt->ColdOptType : PGOOptions::ColdFuncOpt::Default))
+MODULE_PASS("pgo-force-function-attrs",
+PGOForceFunctionAttrsPass(PGOOpt
+  ? PGOOpt->ColdOptType
+  : PGOOptions::ColdFuncOpt::Default))
 MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation())
 MODULE_PASS("memprof-module", ModuleMemProfilerPass())
 MODULE_PASS("mergefunc", MergeFunctionsPass())
@@ -178,7 +183,7 @@ MODULE_PASS_WITH_PARAMS(
 parseASanPassOptions, "kernel")
 MODULE_PASS_WITH_PARAMS(
 "cg-profile", "CGProfilePass",
-[](bool InLTOPostLink) { return CGProfilePass(InLTOPostLink);},
+[](bool InLTOPostLink) { return CGProfilePass(InLTOPostLink); },
 parseCGProfilePassOptions, "in-lto-post-link")
 MODULE_PASS_WITH_PARAMS(
 "global-merge", "GlobalMergePass",
@@ -287,7 +292,8 @@ CGSCC_PASS_WITH_PARAMS(
 FUNCTION_ANALYSIS("aa", AAManager())
 FUNCTION_ANALYSIS("access-info", LoopAccessAnalysis())
 FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis())
-FUNCTION_ANALYSIS("bb-sections-profile-reader", 
BasicBlockSectionsProfileReaderAnalysis(TM))
+FUNCTION_ANALYSIS("bb-sections-profile-reader",
+  BasicBlockSectionsProfileReaderAnalysis(TM))
 FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis())
 FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis())
 FUNCTION_ANALYSIS("cycles", CycleAnalysis())
@@ -377,7 +383,7 @@ FUNCTION_PASS("expand-large-div-rem", 
ExpandLargeDivRemPass(TM))
 FUNCTION_PASS("expand-fp", ExpandFpPass(TM))
 FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
 FUNCTION_PASS("extra-vector-passes",
-  ExtraFunctionPassManager<ShouldRunExtraVectorPasses>())
+  ExtraFunctionPassManager<ShouldRunExtraVectorPasses>())
 FUNCTION_PASS("fix-irreducible", FixIrreduciblePass())
 FUNCTION_PASS("flatten-cfg", FlattenCFGPass())
 FUNCTION_PASS("float2int", Float2IntPass())
@@ -548,8 +554,7 @@ FUNCTION_PASS_WITH_PARAMS(
 "max-iterations=N")
 FUNCTION_PASS_WITH_PARAMS(
 "lint", "LintPass",
-[](bool AbortOnError) { return LintPass(AbortOnError); },
-parseLintOptions,
+[](bool AbortOnError) { return LintPass(AbortOnError); }, parseLintOptions,
 "abort-on-error")
 FUNCTION_PASS_WITH_PARAMS(
 "loop-unroll", "LoopUnrollPass",
@@ -576,7 +581,8 @@ FUNCTION_PASS_WITH_PARAMS(
 "normalize", "IRNormalizerPass",
 [](IRNormalizerOptions Options) { return IRNormalizerPass(Options); },
 parseIRNormalizerPassOptions,
-
"no-preserve-order;preserve-order;no-rename-all;rename-all;no-fold-all;fold-all;no-reorder-operands;reorder-operands")
+"no-preserve-order;preserve-order;no-rename-all;rename-all;no-fold-all;"
+"fold-all;no-reorder-operands;reorder-operands")
 FUNCTION_PASS_WITH_PARAMS(
 "mldst-motion", "MergedLoadStoreMotionPass",
 [](MergedLoadStoreMotionOptions Opts) {
@@ -590,7 +596,7 @@ FUNCTION_PASS_WITH_PARAMS(
 },
 [](StringRe

[llvm-branch-commits] [llvm] [IR2Vec] Simplifying creation of Embedder (PR #143999)

2025-06-20 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/143999

>From 8b8932b55c8a6a087d516e174e1d57c9908259bd Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Thu, 12 Jun 2025 23:54:10 +
Subject: [PATCH] Simplifying creation of Embedder

---
 llvm/docs/MLGO.rst|  7 +--
 llvm/include/llvm/Analysis/IR2Vec.h   |  4 +-
 .../Analysis/FunctionPropertiesAnalysis.cpp   | 10 ++---
 llvm/lib/Analysis/IR2Vec.cpp  | 17 +++
 .../FunctionPropertiesAnalysisTest.cpp|  7 ++-
 llvm/unittests/Analysis/IR2VecTest.cpp| 44 +++
 6 files changed, 33 insertions(+), 56 deletions(-)

diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
index 28095447f6a5a..0b849f3382f63 100644
--- a/llvm/docs/MLGO.rst
+++ b/llvm/docs/MLGO.rst
@@ -482,14 +482,9 @@ embeddings can be computed and accessed via an 
``ir2vec::Embedder`` instance.
 
   // Assuming F is an llvm::Function&
   // For example, using IR2VecKind::Symbolic:
-  Expected<std::unique_ptr<ir2vec::Embedder>> EmbOrErr =
+  std::unique_ptr<ir2vec::Embedder> Emb =
   ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary);
 
-  if (auto Err = EmbOrErr.takeError()) {
-// Handle error in embedder creation
-return;
-  }
-  std::unique_ptr<ir2vec::Embedder> Emb = std::move(*EmbOrErr);
 
 3. **Compute and Access Embeddings**:
Call ``getFunctionVector()`` to get the embedding for the function. 
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 2a7a6edda70a8..06312562060aa 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,8 +170,8 @@ class Embedder {
   virtual ~Embedder() = default;
 
   /// Factory method to create an Embedder object.
-  static Expected>
-  create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary);
+  static std::unique_ptr create(IR2VecKind Mode, const Function &F,
+  const Vocab &Vocabulary);
 
   /// Returns a map containing instructions and the corresponding embeddings 
for
   /// the function F if it has been computed. If not, it computes the 
embeddings
diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp 
b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
index 29d3aaf46dc06..dd4eb7f0df053 100644
--- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
+++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
@@ -204,16 +204,12 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock 
&BB,
 // We instantiate the IR2Vec embedder each time, as having an unique
 // pointer to the embedder as member of the class would make it
 // non-copyable. Instantiating the embedder in itself is not costly.
-auto EmbOrErr = ir2vec::Embedder::create(IR2VecKind::Symbolic,
+auto Embedder = ir2vec::Embedder::create(IR2VecKind::Symbolic,
  *BB.getParent(), *IR2VecVocab);
-if (Error Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-BB.getContext().emitError("Error creating IR2Vec embeddings: " +
-  EI.message());
-  });
+if (!Embedder) {
+  BB.getContext().emitError("Error creating IR2Vec embeddings");
   return;
 }
-auto Embedder = std::move(*EmbOrErr);
 const auto &BBEmbedding = Embedder->getBBVector(BB);
 // Subtract BBEmbedding from Function embedding if the direction is -1,
 // and add it if the direction is +1.
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 7ff7acebedf4e..27cc2a4109879 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -123,13 +123,14 @@ Embedder::Embedder(const Function &F, const Vocab 
&Vocabulary)
   Dimension(Vocabulary.begin()->second.size()), OpcWeight(::OpcWeight),
   TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {}
 
-Expected>
-Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) {
+std::unique_ptr Embedder::create(IR2VecKind Mode, const Function &F,
+   const Vocab &Vocabulary) {
   switch (Mode) {
   case IR2VecKind::Symbolic:
 return std::make_unique(F, Vocabulary);
   }
-  return make_error("Unknown IR2VecKind", errc::invalid_argument);
+  llvm_unreachable("Unknown IR2Vec kind");
+  return nullptr;
 }
 
 // FIXME: Currently lookups are string based. Use numeric Keys
@@ -384,17 +385,13 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
 
   auto Vocab = IR2VecVocabResult.getVocabulary();
   for (Function &F : M) {
-Expected> EmbOrErr =
+std::unique_ptr Emb =
 Embedder::create(IR2VecKind::Symbolic, F, Vocab);
-if (auto Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-OS << "Error creating IR2Vec embeddings: " << EI.message() << "\n";
-  });
+if (!Emb) {
+  OS << "Error creating I

[llvm-branch-commits] [llvm] [NFC] Formatting PassRegistry.def (PR #144139)

2025-06-20 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/144139

>From 7fa87f2e42378d656ba743a4971e5c2ffaee8492 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 13 Jun 2025 18:22:10 +
Subject: [PATCH] [NFC] Formatting PassRegistry.def

---
 llvm/lib/Passes/PassRegistry.def | 40 ++--
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index ec14c6a9211d9..5256f1378b64c 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -63,7 +63,8 @@ MODULE_PASS("coro-early", CoroEarlyPass())
 MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass())
 MODULE_PASS("ctx-instr-gen",
 PGOInstrumentationGen(PGOInstrumentationType::CTXPROF))
-MODULE_PASS("ctx-prof-flatten", 
PGOCtxProfFlatteningPass(/*IsPreThinlink=*/false))
+MODULE_PASS("ctx-prof-flatten",
+PGOCtxProfFlatteningPass(/*IsPreThinlink=*/false))
 MODULE_PASS("ctx-prof-flatten-prethinlink",
 PGOCtxProfFlatteningPass(/*IsPreThinlink=*/true))
 MODULE_PASS("noinline-nonprevailing", NoinlineNonPrevailing())
@@ -74,7 +75,8 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
 MODULE_PASS("dxil-upgrade", DXILUpgradePass())
 MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
 MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
-MODULE_PASS("expand-variadics", 
ExpandVariadicsPass(ExpandVariadicsMode::Disable))
+MODULE_PASS("expand-variadics",
+ExpandVariadicsPass(ExpandVariadicsMode::Disable))
 MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
 MODULE_PASS("global-merge-func", GlobalMergeFuncPass())
@@ -104,7 +106,10 @@ MODULE_PASS("lower-ifunc", LowerIFuncPass())
 MODULE_PASS("simplify-type-tests", SimplifyTypeTestsPass())
 MODULE_PASS("lowertypetests", LowerTypeTestsPass())
 MODULE_PASS("fatlto-cleanup", FatLtoCleanup())
-MODULE_PASS("pgo-force-function-attrs", PGOForceFunctionAttrsPass(PGOOpt ? 
PGOOpt->ColdOptType : PGOOptions::ColdFuncOpt::Default))
+MODULE_PASS("pgo-force-function-attrs",
+PGOForceFunctionAttrsPass(PGOOpt
+  ? PGOOpt->ColdOptType
+  : PGOOptions::ColdFuncOpt::Default))
 MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation())
 MODULE_PASS("memprof-module", ModuleMemProfilerPass())
 MODULE_PASS("mergefunc", MergeFunctionsPass())
@@ -178,7 +183,7 @@ MODULE_PASS_WITH_PARAMS(
 parseASanPassOptions, "kernel")
 MODULE_PASS_WITH_PARAMS(
 "cg-profile", "CGProfilePass",
-[](bool InLTOPostLink) { return CGProfilePass(InLTOPostLink);},
+[](bool InLTOPostLink) { return CGProfilePass(InLTOPostLink); },
 parseCGProfilePassOptions, "in-lto-post-link")
 MODULE_PASS_WITH_PARAMS(
 "global-merge", "GlobalMergePass",
@@ -287,7 +292,8 @@ CGSCC_PASS_WITH_PARAMS(
 FUNCTION_ANALYSIS("aa", AAManager())
 FUNCTION_ANALYSIS("access-info", LoopAccessAnalysis())
 FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis())
-FUNCTION_ANALYSIS("bb-sections-profile-reader", 
BasicBlockSectionsProfileReaderAnalysis(TM))
+FUNCTION_ANALYSIS("bb-sections-profile-reader",
+  BasicBlockSectionsProfileReaderAnalysis(TM))
 FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis())
 FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis())
 FUNCTION_ANALYSIS("cycles", CycleAnalysis())
@@ -377,7 +383,7 @@ FUNCTION_PASS("expand-large-div-rem", 
ExpandLargeDivRemPass(TM))
 FUNCTION_PASS("expand-fp", ExpandFpPass(TM))
 FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
 FUNCTION_PASS("extra-vector-passes",
-  ExtraFunctionPassManager())
+  ExtraFunctionPassManager())
 FUNCTION_PASS("fix-irreducible", FixIrreduciblePass())
 FUNCTION_PASS("flatten-cfg", FlattenCFGPass())
 FUNCTION_PASS("float2int", Float2IntPass())
@@ -548,8 +554,7 @@ FUNCTION_PASS_WITH_PARAMS(
 "max-iterations=N")
 FUNCTION_PASS_WITH_PARAMS(
 "lint", "LintPass",
-[](bool AbortOnError) { return LintPass(AbortOnError); },
-parseLintOptions,
+[](bool AbortOnError) { return LintPass(AbortOnError); }, parseLintOptions,
 "abort-on-error")
 FUNCTION_PASS_WITH_PARAMS(
 "loop-unroll", "LoopUnrollPass",
@@ -576,7 +581,8 @@ FUNCTION_PASS_WITH_PARAMS(
 "normalize", "IRNormalizerPass",
 [](IRNormalizerOptions Options) { return IRNormalizerPass(Options); },
 parseIRNormalizerPassOptions,
-
"no-preserve-order;preserve-order;no-rename-all;rename-all;no-fold-all;fold-all;no-reorder-operands;reorder-operands")
+"no-preserve-order;preserve-order;no-rename-all;rename-all;no-fold-all;"
+"fold-all;no-reorder-operands;reorder-operands")
 FUNCTION_PASS_WITH_PARAMS(
 "mldst-motion", "MergedLoadStoreMotionPass",
 [](MergedLoadStoreMotionOptions Opts) {
@@ -590,7 +596,7 @@ FUNCTION_PASS_WITH_PARAMS(
 },
 [](StringRe

[llvm-branch-commits] [llvm] Increasing tolerance in ApproximatelyEquals (PR #145117)

2025-06-20 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy created 
https://github.com/llvm/llvm-project/pull/145117

None

>From d05856c47337b3b6e9086a5ee06b7c39412d9103 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 20 Jun 2025 22:56:46 +
Subject: [PATCH] Increasing tolerance in ApproximatelyEquals

---
 llvm/include/llvm/Analysis/IR2Vec.h| 2 +-
 llvm/unittests/Analysis/IR2VecTest.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 06312562060aa..480b834077b86 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -116,7 +116,7 @@ struct Embedding {
 
   /// Returns true if the embedding is approximately equal to the RHS embedding
   /// within the specified tolerance.
-  bool approximatelyEquals(const Embedding &RHS, double Tolerance = 1e-6) 
const;
+  bool approximatelyEquals(const Embedding &RHS, double Tolerance = 1e-4) 
const;
 
   void print(raw_ostream &OS) const;
 };
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index 05af55b59323b..33ac16828eb6c 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -154,14 +154,14 @@ TEST(EmbeddingTest, ApproximatelyEqual) {
   EXPECT_TRUE(E1.approximatelyEquals(E2)); // Diff = 1e-7
 
   Embedding E3 = {1.2, 2.2, 3.2}; // Diff = 2e-5
-  EXPECT_FALSE(E1.approximatelyEquals(E3));
+  EXPECT_FALSE(E1.approximatelyEquals(E3, 1e-6));
   EXPECT_TRUE(E1.approximatelyEquals(E3, 3e-5));
 
   Embedding E_clearly_within = {1.005, 2.005, 3.005}; // Diff = 
5e-7
   EXPECT_TRUE(E1.approximatelyEquals(E_clearly_within));
 
   Embedding E_clearly_outside = {1.1, 2.1, 3.1}; // Diff = 1e-5
-  EXPECT_FALSE(E1.approximatelyEquals(E_clearly_outside));
+  EXPECT_FALSE(E1.approximatelyEquals(E_clearly_outside, 1e-6));
 
   Embedding E4 = {1.0, 2.0, 3.5}; // Large diff
   EXPECT_FALSE(E1.approximatelyEquals(E4, 0.01));

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] Overloading operator+ for Embeddings (PR #145118)

2025-06-20 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy created 
https://github.com/llvm/llvm-project/pull/145118

None

>From cbd2c6e77eefb4ba7b8acbf6ea12f21486e7dbc8 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 20 Jun 2025 23:00:40 +
Subject: [PATCH] Overloading operator+ for Embeddings

---
 llvm/include/llvm/Analysis/IR2Vec.h|  1 +
 llvm/lib/Analysis/IR2Vec.cpp   |  8 
 llvm/unittests/Analysis/IR2VecTest.cpp | 18 ++
 3 files changed, 27 insertions(+)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 480b834077b86..f6c40d36f8026 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -106,6 +106,7 @@ struct Embedding {
   const std::vector &getData() const { return Data; }
 
   /// Arithmetic operators
+  Embedding operator+(const Embedding &RHS) const;
   Embedding &operator+=(const Embedding &RHS);
   Embedding &operator-=(const Embedding &RHS);
   Embedding &operator*=(double Factor);
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 27cc2a4109879..d5d27db8bd2bf 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -71,6 +71,14 @@ inline bool fromJSON(const llvm::json::Value &E, Embedding 
&Out,
 // Embedding
 
//===--===//
 
+Embedding Embedding::operator+(const Embedding &RHS) const {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  Embedding Result(*this);
+  std::transform(this->begin(), this->end(), RHS.begin(), Result.begin(),
+ std::plus());
+  return Result;
+}
+
 Embedding &Embedding::operator+=(const Embedding &RHS) {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp 
b/llvm/unittests/Analysis/IR2VecTest.cpp
index 33ac16828eb6c..50eb7f73c6f50 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -109,6 +109,18 @@ TEST(EmbeddingTest, ConstructorsAndAccessors) {
   }
 }
 
+TEST(EmbeddingTest, AddVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
+
+  Embedding E3 = E1 + E2;
+  EXPECT_THAT(E3, ElementsAre(1.5, 3.5, 2.0));
+
+  // Check that E1 and E2 are unchanged
+  EXPECT_THAT(E1, ElementsAre(1.0, 2.0, 3.0));
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
+}
+
 TEST(EmbeddingTest, AddVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
@@ -180,6 +192,12 @@ TEST(EmbeddingTest, AccessOutOfBounds) {
   EXPECT_DEATH(E[4] = 4.0, "Index out of bounds");
 }
 
+TEST(EmbeddingTest, MismatchedDimensionsAddVectorsOutOfPlace) {
+  Embedding E1 = {1.0, 2.0};
+  Embedding E2 = {1.0};
+  EXPECT_DEATH(E1 + E2, "Vectors must have the same dimension");
+}
+
 TEST(EmbeddingTest, MismatchedDimensionsAddVectors) {
   Embedding E1 = {1.0, 2.0};
   Embedding E2 = {1.0};

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] Increasing tolerance in ApproximatelyEquals (PR #145117)

2025-06-20 Thread S. VenkataKeerthy via llvm-branch-commits

svkeerthy wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack
> <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/145117?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>.
> <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a>

* **#145119** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/145119?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#145118** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/145118?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#145117** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/145117?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/145117?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a>
* **#143999** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/143999?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#143986** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/143986?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#143479** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/143479?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>: 1 other dependent PR 
([#144139](https://github.com/llvm/llvm-project/pull/144139) <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/144139?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>)
* **#143476** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/143476?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#143200** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/143200?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* **#143197** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/143197?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a>
* `main`




This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn 
more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>.


https://github.com/llvm/llvm-project/pull/145117
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Scale embeddings once in vocab analysis instead of repetitive scaling (PR #143986)

2025-06-13 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -104,7 +106,10 @@ MODULE_PASS("lower-ifunc", LowerIFuncPass())
 MODULE_PASS("simplify-type-tests", SimplifyTypeTestsPass())
 MODULE_PASS("lowertypetests", LowerTypeTestsPass())
 MODULE_PASS("fatlto-cleanup", FatLtoCleanup())
-MODULE_PASS("pgo-force-function-attrs", PGOForceFunctionAttrsPass(PGOOpt ? 
PGOOpt->ColdOptType : PGOOptions::ColdFuncOpt::Default))
+MODULE_PASS("pgo-force-function-attrs",
+PGOForceFunctionAttrsPass(PGOOpt

svkeerthy wrote:

Yeah, will do. Missed the unrelated formatting changes.

https://github.com/llvm/llvm-project/pull/143986
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR2Vec] Scale embeddings once in vocab analysis instead of repetitive scaling (PR #143986)

2025-06-13 Thread S. VenkataKeerthy via llvm-branch-commits


@@ -259,32 +306,40 @@ Error IR2VecVocabAnalysis::readVocabulary() {
 return createFileError(VocabFile, BufOrError.getError());
 
   auto Content = BufOrError.get()->getBuffer();
-  json::Path::Root Path("");
+
   Expected ParsedVocabValue = json::parse(Content);
   if (!ParsedVocabValue)
 return ParsedVocabValue.takeError();
 
-  bool Res = json::fromJSON(*ParsedVocabValue, Vocabulary, Path);
-  if (!Res)
-return createStringError(errc::illegal_byte_sequence,
- "Unable to parse the vocabulary");
+  ir2vec::Vocab OpcodeVocab, TypeVocab, ArgVocab;
+  unsigned OpcodeDim, TypeDim, ArgDim;
+  if (auto Err = parseVocabSection("Opcodes", *ParsedVocabValue, OpcodeVocab,

svkeerthy wrote:

Correct. Will put it in the doc.

https://github.com/llvm/llvm-project/pull/143986
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [MLGO][IR2Vec] Integrating IR2Vec with MLInliner (PR #143479)

2025-06-13 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/143479

>From a2bec77ad03e20cd76b6870149863049a96c4f9e Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Tue, 10 Jun 2025 05:40:38 +
Subject: [PATCH] [MLInliner][IR2Vec] Integrating IR2Vec with MLInliner

---
 .../Analysis/FunctionPropertiesAnalysis.h |  26 +++-
 llvm/include/llvm/Analysis/InlineAdvisor.h|   4 +
 .../llvm/Analysis/InlineModelFeatureMaps.h|   8 +-
 llvm/include/llvm/Analysis/MLInlineAdvisor.h  |   1 +
 .../Analysis/FunctionPropertiesAnalysis.cpp   | 115 +-
 llvm/lib/Analysis/InlineAdvisor.cpp   |  29 
 llvm/lib/Analysis/MLInlineAdvisor.cpp |  34 +++-
 .../FunctionPropertiesAnalysisTest.cpp| 145 --
 8 files changed, 338 insertions(+), 24 deletions(-)

diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h 
b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h
index babb6d9d6cf0c..06dbfc35a5294 100644
--- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h
+++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h
@@ -15,6 +15,7 @@
 #define LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H
 
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/IR2Vec.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Compiler.h"
@@ -32,17 +33,19 @@ class FunctionPropertiesInfo {
   void updateAggregateStats(const Function &F, const LoopInfo &LI);
   void reIncludeBB(const BasicBlock &BB);
 
+  ir2vec::Embedding FunctionEmbedding = ir2vec::Embedding(0.0);
+  std::optional IR2VecVocab;
+
 public:
   LLVM_ABI static FunctionPropertiesInfo
   getFunctionPropertiesInfo(const Function &F, const DominatorTree &DT,
-const LoopInfo &LI);
+const LoopInfo &LI,
+const IR2VecVocabResult *VocabResult);
 
   LLVM_ABI static FunctionPropertiesInfo
   getFunctionPropertiesInfo(Function &F, FunctionAnalysisManager &FAM);
 
-  bool operator==(const FunctionPropertiesInfo &FPI) const {
-return std::memcmp(this, &FPI, sizeof(FunctionPropertiesInfo)) == 0;
-  }
+  bool operator==(const FunctionPropertiesInfo &FPI) const;
 
   bool operator!=(const FunctionPropertiesInfo &FPI) const {
 return !(*this == FPI);
@@ -137,6 +140,19 @@ class FunctionPropertiesInfo {
   int64_t CallReturnsVectorPointerCount = 0;
   int64_t CallWithManyArgumentsCount = 0;
   int64_t CallWithPointerArgumentCount = 0;
+
+  const ir2vec::Embedding &getFunctionEmbedding() const {
+return FunctionEmbedding;
+  }
+
+  const std::optional &getIR2VecVocab() const {
+return IR2VecVocab;
+  }
+
+  // Helper intended to be useful for unittests
+  void setFunctionEmbeddingForTest(const ir2vec::Embedding &Embedding) {
+FunctionEmbedding = Embedding;
+  }
 };
 
 // Analysis pass
@@ -192,7 +208,7 @@ class FunctionPropertiesUpdater {
 
   DominatorTree &getUpdatedDominatorTree(FunctionAnalysisManager &FAM) const;
 
-  DenseSet Successors;
+  DenseSet Successors, CallUsers;
 
   // Edges we might potentially need to remove from the dominator tree.
   SmallVector DomTreeUpdates;
diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h 
b/llvm/include/llvm/Analysis/InlineAdvisor.h
index 9d15136e81d10..50ba3c13da70f 100644
--- a/llvm/include/llvm/Analysis/InlineAdvisor.h
+++ b/llvm/include/llvm/Analysis/InlineAdvisor.h
@@ -331,6 +331,10 @@ class InlineAdvisorAnalysis : public 
AnalysisInfoMixin {
   };
 
   Result run(Module &M, ModuleAnalysisManager &MAM) { return Result(M, MAM); }
+
+private:
+  static bool initializeIR2VecVocabIfRequested(Module &M,
+   ModuleAnalysisManager &MAM);
 };
 
 /// Printer pass for the InlineAdvisorAnalysis results.
diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h 
b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
index 961d5091bf9f3..a166621243cad 100644
--- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
+++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
@@ -142,6 +142,12 @@ enum class FeatureIndex : size_t {
   INLINE_FEATURE_ITERATOR(POPULATE_INDICES)
 #undef POPULATE_INDICES
 
+// IR2Vec embeddings
+// Dimensions of embeddings are not known in the compile time (until vocab is 
+// read). Hence macros cannot be used here.
+  callee_embedding,
+  caller_embedding,
+
   NumberOfFeatures
 };
 // clang-format on
@@ -154,7 +160,7 @@ inlineCostFeatureToMlFeature(InlineCostFeatureIndex 
Feature) {
 constexpr size_t NumberOfFeatures =
 static_cast(FeatureIndex::NumberOfFeatures);
 
-LLVM_ABI extern const std::vector FeatureMap;
+LLVM_ABI extern std::vector FeatureMap;
 
 LLVM_ABI extern const char *const DecisionName;
 LLVM_ABI extern const TensorSpec InlineDecisionSpec;
diff --git a/llvm/include/llvm/Analysis/MLInlineAdvisor.h 
b/llvm/include/llvm/Analysis/MLInlineAdvisor.h
index 580dd5e95d760..8262dd0846ede 100644
--- a/llvm/include/llvm/Analysis/MLInlin

[llvm-branch-commits] [llvm] [IR2Vec] Simplifying creation of Embedder (PR #143999)

2025-06-13 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/143999

>From d71dd503f4794abf8a396ddb8a5deeafe0d75c83 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Thu, 12 Jun 2025 23:54:10 +
Subject: [PATCH] Simplifying creation of Embedder

---
 llvm/docs/MLGO.rst|  7 +--
 llvm/include/llvm/Analysis/IR2Vec.h   |  4 +-
 .../Analysis/FunctionPropertiesAnalysis.cpp   | 10 ++---
 llvm/lib/Analysis/IR2Vec.cpp  | 17 +++
 .../FunctionPropertiesAnalysisTest.cpp|  7 ++-
 llvm/unittests/Analysis/IR2VecTest.cpp| 44 +++
 6 files changed, 33 insertions(+), 56 deletions(-)

diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
index 28095447f6a5a..0b849f3382f63 100644
--- a/llvm/docs/MLGO.rst
+++ b/llvm/docs/MLGO.rst
@@ -482,14 +482,9 @@ embeddings can be computed and accessed via an 
``ir2vec::Embedder`` instance.
 
   // Assuming F is an llvm::Function&
   // For example, using IR2VecKind::Symbolic:
-  Expected> EmbOrErr =
+  std::unique_ptr Emb =
   ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary);
 
-  if (auto Err = EmbOrErr.takeError()) {
-// Handle error in embedder creation
-return;
-  }
-  std::unique_ptr Emb = std::move(*EmbOrErr);
 
 3. **Compute and Access Embeddings**:
Call ``getFunctionVector()`` to get the embedding for the function. 
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 2a7a6edda70a8..06312562060aa 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,8 +170,8 @@ class Embedder {
   virtual ~Embedder() = default;
 
   /// Factory method to create an Embedder object.
-  static Expected>
-  create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary);
+  static std::unique_ptr create(IR2VecKind Mode, const Function &F,
+  const Vocab &Vocabulary);
 
   /// Returns a map containing instructions and the corresponding embeddings 
for
   /// the function F if it has been computed. If not, it computes the 
embeddings
diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp 
b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
index 29d3aaf46dc06..dd4eb7f0df053 100644
--- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
+++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
@@ -204,16 +204,12 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock 
&BB,
 // We instantiate the IR2Vec embedder each time, as having an unique
 // pointer to the embedder as member of the class would make it
 // non-copyable. Instantiating the embedder in itself is not costly.
-auto EmbOrErr = ir2vec::Embedder::create(IR2VecKind::Symbolic,
+auto Embedder = ir2vec::Embedder::create(IR2VecKind::Symbolic,
  *BB.getParent(), *IR2VecVocab);
-if (Error Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-BB.getContext().emitError("Error creating IR2Vec embeddings: " +
-  EI.message());
-  });
+if (!Embedder) {
+  BB.getContext().emitError("Error creating IR2Vec embeddings");
   return;
 }
-auto Embedder = std::move(*EmbOrErr);
 const auto &BBEmbedding = Embedder->getBBVector(BB);
 // Subtract BBEmbedding from Function embedding if the direction is -1,
 // and add it if the direction is +1.
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 7ff7acebedf4e..27cc2a4109879 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -123,13 +123,14 @@ Embedder::Embedder(const Function &F, const Vocab 
&Vocabulary)
   Dimension(Vocabulary.begin()->second.size()), OpcWeight(::OpcWeight),
   TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {}
 
-Expected>
-Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) {
+std::unique_ptr Embedder::create(IR2VecKind Mode, const Function &F,
+   const Vocab &Vocabulary) {
   switch (Mode) {
   case IR2VecKind::Symbolic:
 return std::make_unique(F, Vocabulary);
   }
-  return make_error("Unknown IR2VecKind", errc::invalid_argument);
+  llvm_unreachable("Unknown IR2Vec kind");
+  return nullptr;
 }
 
 // FIXME: Currently lookups are string based. Use numeric Keys
@@ -384,17 +385,13 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
 
   auto Vocab = IR2VecVocabResult.getVocabulary();
   for (Function &F : M) {
-Expected> EmbOrErr =
+std::unique_ptr Emb =
 Embedder::create(IR2VecKind::Symbolic, F, Vocab);
-if (auto Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-OS << "Error creating IR2Vec embeddings: " << EI.message() << "\n";
-  });
+if (!Emb) {
+  OS << "Error creating I

[llvm-branch-commits] [llvm] [MLGO][IR2Vec] Integrating IR2Vec with MLInliner (PR #143479)

2025-06-13 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/143479

>From a2bec77ad03e20cd76b6870149863049a96c4f9e Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Tue, 10 Jun 2025 05:40:38 +
Subject: [PATCH] [MLInliner][IR2Vec] Integrating IR2Vec with MLInliner

---
 .../Analysis/FunctionPropertiesAnalysis.h |  26 +++-
 llvm/include/llvm/Analysis/InlineAdvisor.h|   4 +
 .../llvm/Analysis/InlineModelFeatureMaps.h|   8 +-
 llvm/include/llvm/Analysis/MLInlineAdvisor.h  |   1 +
 .../Analysis/FunctionPropertiesAnalysis.cpp   | 115 +-
 llvm/lib/Analysis/InlineAdvisor.cpp   |  29 
 llvm/lib/Analysis/MLInlineAdvisor.cpp |  34 +++-
 .../FunctionPropertiesAnalysisTest.cpp| 145 --
 8 files changed, 338 insertions(+), 24 deletions(-)

diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h 
b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h
index babb6d9d6cf0c..06dbfc35a5294 100644
--- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h
+++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h
@@ -15,6 +15,7 @@
 #define LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H
 
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/IR2Vec.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Compiler.h"
@@ -32,17 +33,19 @@ class FunctionPropertiesInfo {
   void updateAggregateStats(const Function &F, const LoopInfo &LI);
   void reIncludeBB(const BasicBlock &BB);
 
+  ir2vec::Embedding FunctionEmbedding = ir2vec::Embedding(0.0);
+  std::optional IR2VecVocab;
+
 public:
   LLVM_ABI static FunctionPropertiesInfo
   getFunctionPropertiesInfo(const Function &F, const DominatorTree &DT,
-const LoopInfo &LI);
+const LoopInfo &LI,
+const IR2VecVocabResult *VocabResult);
 
   LLVM_ABI static FunctionPropertiesInfo
   getFunctionPropertiesInfo(Function &F, FunctionAnalysisManager &FAM);
 
-  bool operator==(const FunctionPropertiesInfo &FPI) const {
-return std::memcmp(this, &FPI, sizeof(FunctionPropertiesInfo)) == 0;
-  }
+  bool operator==(const FunctionPropertiesInfo &FPI) const;
 
   bool operator!=(const FunctionPropertiesInfo &FPI) const {
 return !(*this == FPI);
@@ -137,6 +140,19 @@ class FunctionPropertiesInfo {
   int64_t CallReturnsVectorPointerCount = 0;
   int64_t CallWithManyArgumentsCount = 0;
   int64_t CallWithPointerArgumentCount = 0;
+
+  const ir2vec::Embedding &getFunctionEmbedding() const {
+return FunctionEmbedding;
+  }
+
+  const std::optional &getIR2VecVocab() const {
+return IR2VecVocab;
+  }
+
+  // Helper intended to be useful for unittests
+  void setFunctionEmbeddingForTest(const ir2vec::Embedding &Embedding) {
+FunctionEmbedding = Embedding;
+  }
 };
 
 // Analysis pass
@@ -192,7 +208,7 @@ class FunctionPropertiesUpdater {
 
   DominatorTree &getUpdatedDominatorTree(FunctionAnalysisManager &FAM) const;
 
-  DenseSet Successors;
+  DenseSet Successors, CallUsers;
 
   // Edges we might potentially need to remove from the dominator tree.
   SmallVector DomTreeUpdates;
diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h 
b/llvm/include/llvm/Analysis/InlineAdvisor.h
index 9d15136e81d10..50ba3c13da70f 100644
--- a/llvm/include/llvm/Analysis/InlineAdvisor.h
+++ b/llvm/include/llvm/Analysis/InlineAdvisor.h
@@ -331,6 +331,10 @@ class InlineAdvisorAnalysis : public 
AnalysisInfoMixin<InlineAdvisorAnalysis> {
   };
 
   Result run(Module &M, ModuleAnalysisManager &MAM) { return Result(M, MAM); }
+
+private:
+  static bool initializeIR2VecVocabIfRequested(Module &M,
+   ModuleAnalysisManager &MAM);
 };
 
 /// Printer pass for the InlineAdvisorAnalysis results.
diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h 
b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
index 961d5091bf9f3..a166621243cad 100644
--- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
+++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
@@ -142,6 +142,12 @@ enum class FeatureIndex : size_t {
   INLINE_FEATURE_ITERATOR(POPULATE_INDICES)
 #undef POPULATE_INDICES
 
+// IR2Vec embeddings
+// Dimensions of embeddings are not known in the compile time (until vocab is 
+// read). Hence macros cannot be used here.
+  callee_embedding,
+  caller_embedding,
+
   NumberOfFeatures
 };
 // clang-format on
@@ -154,7 +160,7 @@ inlineCostFeatureToMlFeature(InlineCostFeatureIndex 
Feature) {
 constexpr size_t NumberOfFeatures =
 static_cast<size_t>(FeatureIndex::NumberOfFeatures);
 
-LLVM_ABI extern const std::vector<TensorSpec> FeatureMap;
+LLVM_ABI extern std::vector<TensorSpec> FeatureMap;
 
 LLVM_ABI extern const char *const DecisionName;
 LLVM_ABI extern const TensorSpec InlineDecisionSpec;
diff --git a/llvm/include/llvm/Analysis/MLInlineAdvisor.h 
b/llvm/include/llvm/Analysis/MLInlineAdvisor.h
index 580dd5e95d760..8262dd0846ede 100644
--- a/llvm/include/llvm/Analysis/MLInlin

[llvm-branch-commits] [llvm] [IR2Vec] Simplifying creation of Embedder (PR #143999)

2025-06-13 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/143999

>From d71dd503f4794abf8a396ddb8a5deeafe0d75c83 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Thu, 12 Jun 2025 23:54:10 +
Subject: [PATCH] Simplifying creation of Embedder

---
 llvm/docs/MLGO.rst|  7 +--
 llvm/include/llvm/Analysis/IR2Vec.h   |  4 +-
 .../Analysis/FunctionPropertiesAnalysis.cpp   | 10 ++---
 llvm/lib/Analysis/IR2Vec.cpp  | 17 +++
 .../FunctionPropertiesAnalysisTest.cpp|  7 ++-
 llvm/unittests/Analysis/IR2VecTest.cpp| 44 +++
 6 files changed, 33 insertions(+), 56 deletions(-)

diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
index 28095447f6a5a..0b849f3382f63 100644
--- a/llvm/docs/MLGO.rst
+++ b/llvm/docs/MLGO.rst
@@ -482,14 +482,9 @@ embeddings can be computed and accessed via an 
``ir2vec::Embedder`` instance.
 
   // Assuming F is an llvm::Function&
   // For example, using IR2VecKind::Symbolic:
-  Expected<std::unique_ptr<ir2vec::Embedder>> EmbOrErr =
+  std::unique_ptr<ir2vec::Embedder> Emb =
   ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary);
 
-  if (auto Err = EmbOrErr.takeError()) {
-// Handle error in embedder creation
-return;
-  }
-  std::unique_ptr<ir2vec::Embedder> Emb = std::move(*EmbOrErr);
 
 3. **Compute and Access Embeddings**:
Call ``getFunctionVector()`` to get the embedding for the function. 
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 2a7a6edda70a8..06312562060aa 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,8 +170,8 @@ class Embedder {
   virtual ~Embedder() = default;
 
   /// Factory method to create an Embedder object.
-  static Expected<std::unique_ptr<Embedder>>
-  create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary);
+  static std::unique_ptr<Embedder> create(IR2VecKind Mode, const Function &F,
+  const Vocab &Vocabulary);
 
   /// Returns a map containing instructions and the corresponding embeddings 
for
   /// the function F if it has been computed. If not, it computes the 
embeddings
diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp 
b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
index 29d3aaf46dc06..dd4eb7f0df053 100644
--- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
+++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
@@ -204,16 +204,12 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock 
&BB,
 // We instantiate the IR2Vec embedder each time, as having an unique
 // pointer to the embedder as member of the class would make it
 // non-copyable. Instantiating the embedder in itself is not costly.
-auto EmbOrErr = ir2vec::Embedder::create(IR2VecKind::Symbolic,
+auto Embedder = ir2vec::Embedder::create(IR2VecKind::Symbolic,
  *BB.getParent(), *IR2VecVocab);
-if (Error Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-BB.getContext().emitError("Error creating IR2Vec embeddings: " +
-  EI.message());
-  });
+if (!Embedder) {
+  BB.getContext().emitError("Error creating IR2Vec embeddings");
   return;
 }
-auto Embedder = std::move(*EmbOrErr);
 const auto &BBEmbedding = Embedder->getBBVector(BB);
 // Subtract BBEmbedding from Function embedding if the direction is -1,
 // and add it if the direction is +1.
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 7ff7acebedf4e..27cc2a4109879 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -123,13 +123,14 @@ Embedder::Embedder(const Function &F, const Vocab 
&Vocabulary)
   Dimension(Vocabulary.begin()->second.size()), OpcWeight(::OpcWeight),
   TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {}
 
-Expected<std::unique_ptr<Embedder>>
-Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) {
+std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
+   const Vocab &Vocabulary) {
   switch (Mode) {
   case IR2VecKind::Symbolic:
 return std::make_unique<SymbolicEmbedder>(F, Vocabulary);
   }
-  return make_error<StringError>("Unknown IR2VecKind", errc::invalid_argument);
+  llvm_unreachable("Unknown IR2Vec kind");
+  return nullptr;
 }
 
 // FIXME: Currently lookups are string based. Use numeric Keys
@@ -384,17 +385,13 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
 
   auto Vocab = IR2VecVocabResult.getVocabulary();
   for (Function &F : M) {
-Expected<std::unique_ptr<Embedder>> EmbOrErr =
+std::unique_ptr<Embedder> Emb =
 Embedder::create(IR2VecKind::Symbolic, F, Vocab);
-if (auto Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-OS << "Error creating IR2Vec embeddings: " << EI.message() << "\n";
-  });
+if (!Emb) {
+  OS << "Error creating I

[llvm-branch-commits] [llvm] [IR2Vec] Simplifying creation of Embedder (PR #143999)

2025-06-13 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/143999

>From 0d921416a0f81e5634705dc9dfc5363d721a55bf Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Thu, 12 Jun 2025 23:54:10 +
Subject: [PATCH] Simplifying creation of Embedder

---
 llvm/docs/MLGO.rst|  7 +--
 llvm/include/llvm/Analysis/IR2Vec.h   |  4 +-
 .../Analysis/FunctionPropertiesAnalysis.cpp   | 10 ++---
 llvm/lib/Analysis/IR2Vec.cpp  | 17 +++
 .../FunctionPropertiesAnalysisTest.cpp|  7 ++-
 llvm/unittests/Analysis/IR2VecTest.cpp| 44 +++
 6 files changed, 33 insertions(+), 56 deletions(-)

diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
index 4f8fb3f59ca19..e7bba9995b75b 100644
--- a/llvm/docs/MLGO.rst
+++ b/llvm/docs/MLGO.rst
@@ -479,14 +479,9 @@ embeddings can be computed and accessed via an 
``ir2vec::Embedder`` instance.
 
   // Assuming F is an llvm::Function&
   // For example, using IR2VecKind::Symbolic:
-  Expected<std::unique_ptr<ir2vec::Embedder>> EmbOrErr =
+  std::unique_ptr<ir2vec::Embedder> Emb =
   ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary);
 
-  if (auto Err = EmbOrErr.takeError()) {
-// Handle error in embedder creation
-return;
-  }
-  std::unique_ptr<ir2vec::Embedder> Emb = std::move(*EmbOrErr);
 
 3. **Compute and Access Embeddings**:
Call ``getFunctionVector()`` to get the embedding for the function. 
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index f1aaf4cd2e013..6efa6eac56af9 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,8 +170,8 @@ class Embedder {
   virtual ~Embedder() = default;
 
   /// Factory method to create an Embedder object.
-  static Expected<std::unique_ptr<Embedder>>
-  create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary);
+  static std::unique_ptr<Embedder> create(IR2VecKind Mode, const Function &F,
+  const Vocab &Vocabulary);
 
   /// Returns a map containing instructions and the corresponding embeddings 
for
   /// the function F if it has been computed. If not, it computes the 
embeddings
diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp 
b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
index 29d3aaf46dc06..dd4eb7f0df053 100644
--- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
+++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
@@ -204,16 +204,12 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock 
&BB,
 // We instantiate the IR2Vec embedder each time, as having an unique
 // pointer to the embedder as member of the class would make it
 // non-copyable. Instantiating the embedder in itself is not costly.
-auto EmbOrErr = ir2vec::Embedder::create(IR2VecKind::Symbolic,
+auto Embedder = ir2vec::Embedder::create(IR2VecKind::Symbolic,
  *BB.getParent(), *IR2VecVocab);
-if (Error Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-BB.getContext().emitError("Error creating IR2Vec embeddings: " +
-  EI.message());
-  });
+if (!Embedder) {
+  BB.getContext().emitError("Error creating IR2Vec embeddings");
   return;
 }
-auto Embedder = std::move(*EmbOrErr);
 const auto &BBEmbedding = Embedder->getBBVector(BB);
 // Subtract BBEmbedding from Function embedding if the direction is -1,
 // and add it if the direction is +1.
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index de9c2db9531e8..308c3d86a7668 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -123,13 +123,14 @@ Embedder::Embedder(const Function &F, const Vocab 
&Vocabulary)
   Dimension(Vocabulary.begin()->second.size()), OpcWeight(::OpcWeight),
   TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {}
 
-Expected<std::unique_ptr<Embedder>>
-Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) {
+std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
+   const Vocab &Vocabulary) {
   switch (Mode) {
   case IR2VecKind::Symbolic:
 return std::make_unique<SymbolicEmbedder>(F, Vocabulary);
   }
-  return make_error<StringError>("Unknown IR2VecKind", errc::invalid_argument);
+  llvm_unreachable("Unknown IR2Vec kind");
+  return nullptr;
 }
 
 // FIXME: Currently lookups are string based. Use numeric Keys
@@ -388,17 +389,13 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
 
   auto Vocab = IR2VecVocabResult.getVocabulary();
   for (Function &F : M) {
-Expected<std::unique_ptr<Embedder>> EmbOrErr =
+std::unique_ptr<Embedder> Emb =
 Embedder::create(IR2VecKind::Symbolic, F, Vocab);
-if (auto Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-OS << "Error creating IR2Vec embeddings: " << EI.message() << "\n";
-  });
+if (!Emb) {
+  OS << "Error creating I

[llvm-branch-commits] [llvm] [IR2Vec] Simplifying creation of Embedder (PR #143999)

2025-06-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/143999

>From ea224dfb11b37573f5dbdd34ca118fee5a9808c1 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Thu, 12 Jun 2025 23:54:10 +
Subject: [PATCH] Simplifying creation of Embedder

---
 llvm/docs/MLGO.rst|  7 +--
 llvm/include/llvm/Analysis/IR2Vec.h   |  4 +-
 .../Analysis/FunctionPropertiesAnalysis.cpp   | 10 ++---
 llvm/lib/Analysis/IR2Vec.cpp  | 17 +++
 .../FunctionPropertiesAnalysisTest.cpp|  7 ++-
 llvm/unittests/Analysis/IR2VecTest.cpp| 44 +++
 6 files changed, 33 insertions(+), 56 deletions(-)

diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
index 28095447f6a5a..0b849f3382f63 100644
--- a/llvm/docs/MLGO.rst
+++ b/llvm/docs/MLGO.rst
@@ -482,14 +482,9 @@ embeddings can be computed and accessed via an 
``ir2vec::Embedder`` instance.
 
   // Assuming F is an llvm::Function&
   // For example, using IR2VecKind::Symbolic:
-  Expected<std::unique_ptr<ir2vec::Embedder>> EmbOrErr =
+  std::unique_ptr<ir2vec::Embedder> Emb =
   ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary);
 
-  if (auto Err = EmbOrErr.takeError()) {
-// Handle error in embedder creation
-return;
-  }
-  std::unique_ptr<ir2vec::Embedder> Emb = std::move(*EmbOrErr);
 
 3. **Compute and Access Embeddings**:
Call ``getFunctionVector()`` to get the embedding for the function. 
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 2a7a6edda70a8..06312562060aa 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -170,8 +170,8 @@ class Embedder {
   virtual ~Embedder() = default;
 
   /// Factory method to create an Embedder object.
-  static Expected<std::unique_ptr<Embedder>>
-  create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary);
+  static std::unique_ptr<Embedder> create(IR2VecKind Mode, const Function &F,
+  const Vocab &Vocabulary);
 
   /// Returns a map containing instructions and the corresponding embeddings 
for
   /// the function F if it has been computed. If not, it computes the 
embeddings
diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp 
b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
index 29d3aaf46dc06..dd4eb7f0df053 100644
--- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
+++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
@@ -204,16 +204,12 @@ void FunctionPropertiesInfo::updateForBB(const BasicBlock 
&BB,
 // We instantiate the IR2Vec embedder each time, as having an unique
 // pointer to the embedder as member of the class would make it
 // non-copyable. Instantiating the embedder in itself is not costly.
-auto EmbOrErr = ir2vec::Embedder::create(IR2VecKind::Symbolic,
+auto Embedder = ir2vec::Embedder::create(IR2VecKind::Symbolic,
  *BB.getParent(), *IR2VecVocab);
-if (Error Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-BB.getContext().emitError("Error creating IR2Vec embeddings: " +
-  EI.message());
-  });
+if (!Embedder) {
+  BB.getContext().emitError("Error creating IR2Vec embeddings");
   return;
 }
-auto Embedder = std::move(*EmbOrErr);
 const auto &BBEmbedding = Embedder->getBBVector(BB);
 // Subtract BBEmbedding from Function embedding if the direction is -1,
 // and add it if the direction is +1.
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 7ff7acebedf4e..27cc2a4109879 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -123,13 +123,14 @@ Embedder::Embedder(const Function &F, const Vocab 
&Vocabulary)
   Dimension(Vocabulary.begin()->second.size()), OpcWeight(::OpcWeight),
   TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {}
 
-Expected<std::unique_ptr<Embedder>>
-Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) {
+std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
+   const Vocab &Vocabulary) {
   switch (Mode) {
   case IR2VecKind::Symbolic:
 return std::make_unique<SymbolicEmbedder>(F, Vocabulary);
   }
-  return make_error<StringError>("Unknown IR2VecKind", errc::invalid_argument);
+  llvm_unreachable("Unknown IR2Vec kind");
+  return nullptr;
 }
 
 // FIXME: Currently lookups are string based. Use numeric Keys
@@ -384,17 +385,13 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
 
   auto Vocab = IR2VecVocabResult.getVocabulary();
   for (Function &F : M) {
-Expected<std::unique_ptr<Embedder>> EmbOrErr =
+std::unique_ptr<Embedder> Emb =
 Embedder::create(IR2VecKind::Symbolic, F, Vocab);
-if (auto Err = EmbOrErr.takeError()) {
-  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-OS << "Error creating IR2Vec embeddings: " << EI.message() << "\n";
-  });
+if (!Emb) {
+  OS << "Error creating I

[llvm-branch-commits] [llvm] [NFC] Formatting PassRegistry.def (PR #144139)

2025-06-17 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/144139

>From cd6a0f4fbfa87df8bed4efcdf066530523f5ec0d Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 13 Jun 2025 18:22:10 +
Subject: [PATCH] [NFC] Formatting PassRegistry.def

---
 llvm/lib/Passes/PassRegistry.def | 40 ++--
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index f761d0dab09a8..b1570162d3434 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -63,7 +63,8 @@ MODULE_PASS("coro-early", CoroEarlyPass())
 MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass())
 MODULE_PASS("ctx-instr-gen",
 PGOInstrumentationGen(PGOInstrumentationType::CTXPROF))
-MODULE_PASS("ctx-prof-flatten", 
PGOCtxProfFlatteningPass(/*IsPreThinlink=*/false))
+MODULE_PASS("ctx-prof-flatten",
+PGOCtxProfFlatteningPass(/*IsPreThinlink=*/false))
 MODULE_PASS("ctx-prof-flatten-prethinlink",
 PGOCtxProfFlatteningPass(/*IsPreThinlink=*/true))
 MODULE_PASS("noinline-nonprevailing", NoinlineNonPrevailing())
@@ -74,7 +75,8 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
 MODULE_PASS("dxil-upgrade", DXILUpgradePass())
 MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
 MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
-MODULE_PASS("expand-variadics", 
ExpandVariadicsPass(ExpandVariadicsMode::Disable))
+MODULE_PASS("expand-variadics",
+ExpandVariadicsPass(ExpandVariadicsMode::Disable))
 MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
 MODULE_PASS("global-merge-func", GlobalMergeFuncPass())
@@ -104,7 +106,10 @@ MODULE_PASS("lower-ifunc", LowerIFuncPass())
 MODULE_PASS("simplify-type-tests", SimplifyTypeTestsPass())
 MODULE_PASS("lowertypetests", LowerTypeTestsPass())
 MODULE_PASS("fatlto-cleanup", FatLtoCleanup())
-MODULE_PASS("pgo-force-function-attrs", PGOForceFunctionAttrsPass(PGOOpt ? 
PGOOpt->ColdOptType : PGOOptions::ColdFuncOpt::Default))
+MODULE_PASS("pgo-force-function-attrs",
+PGOForceFunctionAttrsPass(PGOOpt
+  ? PGOOpt->ColdOptType
+  : PGOOptions::ColdFuncOpt::Default))
 MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation())
 MODULE_PASS("memprof-module", ModuleMemProfilerPass())
 MODULE_PASS("mergefunc", MergeFunctionsPass())
@@ -178,7 +183,7 @@ MODULE_PASS_WITH_PARAMS(
 parseASanPassOptions, "kernel")
 MODULE_PASS_WITH_PARAMS(
 "cg-profile", "CGProfilePass",
-[](bool InLTOPostLink) { return CGProfilePass(InLTOPostLink);},
+[](bool InLTOPostLink) { return CGProfilePass(InLTOPostLink); },
 parseCGProfilePassOptions, "in-lto-post-link")
 MODULE_PASS_WITH_PARAMS(
 "global-merge", "GlobalMergePass",
@@ -287,7 +292,8 @@ CGSCC_PASS_WITH_PARAMS(
 FUNCTION_ANALYSIS("aa", AAManager())
 FUNCTION_ANALYSIS("access-info", LoopAccessAnalysis())
 FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis())
-FUNCTION_ANALYSIS("bb-sections-profile-reader", 
BasicBlockSectionsProfileReaderAnalysis(TM))
+FUNCTION_ANALYSIS("bb-sections-profile-reader",
+  BasicBlockSectionsProfileReaderAnalysis(TM))
 FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis())
 FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis())
 FUNCTION_ANALYSIS("cycles", CycleAnalysis())
@@ -377,7 +383,7 @@ FUNCTION_PASS("expand-large-div-rem", 
ExpandLargeDivRemPass(TM))
 FUNCTION_PASS("expand-fp", ExpandFpPass(TM))
 FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
 FUNCTION_PASS("extra-vector-passes",
-  ExtraFunctionPassManager())
+  ExtraFunctionPassManager())
 FUNCTION_PASS("fix-irreducible", FixIrreduciblePass())
 FUNCTION_PASS("flatten-cfg", FlattenCFGPass())
 FUNCTION_PASS("float2int", Float2IntPass())
@@ -548,8 +554,7 @@ FUNCTION_PASS_WITH_PARAMS(
 "max-iterations=N")
 FUNCTION_PASS_WITH_PARAMS(
 "lint", "LintPass",
-[](bool AbortOnError) { return LintPass(AbortOnError); },
-parseLintOptions,
+[](bool AbortOnError) { return LintPass(AbortOnError); }, parseLintOptions,
 "abort-on-error")
 FUNCTION_PASS_WITH_PARAMS(
 "loop-unroll", "LoopUnrollPass",
@@ -576,7 +581,8 @@ FUNCTION_PASS_WITH_PARAMS(
 "normalize", "IRNormalizerPass",
 [](IRNormalizerOptions Options) { return IRNormalizerPass(Options); },
 parseIRNormalizerPassOptions,
-
"no-preserve-order;preserve-order;no-rename-all;rename-all;no-fold-all;fold-all;no-reorder-operands;reorder-operands")
+"no-preserve-order;preserve-order;no-rename-all;rename-all;no-fold-all;"
+"fold-all;no-reorder-operands;reorder-operands")
 FUNCTION_PASS_WITH_PARAMS(
 "mldst-motion", "MergedLoadStoreMotionPass",
 [](MergedLoadStoreMotionOptions Opts) {
@@ -590,7 +596,7 @@ FUNCTION_PASS_WITH_PARAMS(
 },
 [](StringRe

[llvm-branch-commits] [llvm] [IR2Vec] Minor vocab changes and exposing weights (PR #143200)

2025-06-09 Thread S. VenkataKeerthy via llvm-branch-commits

https://github.com/svkeerthy updated 
https://github.com/llvm/llvm-project/pull/143200

>From d3468ab37c05d4796661f13d61fffe31bcf47ba5 Mon Sep 17 00:00:00 2001
From: svkeerthy 
Date: Fri, 6 Jun 2025 20:32:32 +
Subject: [PATCH] Vocab changes1

---
 llvm/include/llvm/Analysis/IR2Vec.h|  10 ++
 llvm/lib/Analysis/IR2Vec.cpp   |  82 +--
 llvm/unittests/Analysis/IR2VecTest.cpp | 137 ++---
 3 files changed, 163 insertions(+), 66 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h 
b/llvm/include/llvm/Analysis/IR2Vec.h
index 3e7e31aac0f6d..dbce8ebe5e103 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -31,7 +31,9 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/JSON.h"
 #include 
 
 namespace llvm {
@@ -43,6 +45,7 @@ class Function;
 class Type;
 class Value;
 class raw_ostream;
+class LLVMContext;
 
 /// IR2Vec computes two kinds of embeddings: Symbolic and Flow-aware.
 /// Symbolic embeddings capture the "syntactic" and "statistical correlation"
@@ -53,6 +56,11 @@ class raw_ostream;
 enum class IR2VecKind { Symbolic };
 
 namespace ir2vec {
+
+LLVM_ABI extern cl::opt<float> OpcWeight;
+LLVM_ABI extern cl::opt<float> TypeWeight;
+LLVM_ABI extern cl::opt<float> ArgWeight;
+
 /// Embedding is a ADT that wraps std::vector. It provides
 /// additional functionality for arithmetic and comparison operations.
 /// It is meant to be used *like* std::vector but is more restrictive
@@ -226,10 +234,12 @@ class IR2VecVocabResult {
 class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> {
   ir2vec::Vocab Vocabulary;
   Error readVocabulary();
+  void emitError(Error Err, LLVMContext &Ctx);
 
 public:
   static AnalysisKey Key;
   IR2VecVocabAnalysis() = default;
+  explicit IR2VecVocabAnalysis(ir2vec::Vocab &&Vocab);
   using Result = IR2VecVocabResult;
   Result run(Module &M, ModuleAnalysisManager &MAM);
 };
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 25ce35d4ace37..2ad65c2f40c33 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -16,13 +16,11 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
-#include "llvm/Support/JSON.h"
 #include "llvm/Support/MemoryBuffer.h"
 
 using namespace llvm;
@@ -33,6 +31,8 @@ using namespace ir2vec;
 STATISTIC(VocabMissCounter,
   "Number of lookups to entites not present in the vocabulary");
 
+namespace llvm {
+namespace ir2vec {
 static cl::OptionCategory IR2VecCategory("IR2Vec Options");
 
 // FIXME: Use a default vocab when not specified
@@ -40,18 +40,20 @@ static cl::opt
 VocabFile("ir2vec-vocab-path", cl::Optional,
   cl::desc("Path to the vocabulary file for IR2Vec"), cl::init(""),
   cl::cat(IR2VecCategory));
-static cl::opt<float> OpcWeight("ir2vec-opc-weight", cl::Optional,
-cl::init(1.0),
-cl::desc("Weight for opcode embeddings"),
-cl::cat(IR2VecCategory));
-static cl::opt<float> TypeWeight("ir2vec-type-weight", cl::Optional,
- cl::init(0.5),
- cl::desc("Weight for type embeddings"),
- cl::cat(IR2VecCategory));
-static cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional,
-cl::init(0.2),
-cl::desc("Weight for argument embeddings"),
-cl::cat(IR2VecCategory));
+LLVM_ABI cl::opt<float> OpcWeight("ir2vec-opc-weight", cl::Optional,
+  cl::init(1.0),
+  cl::desc("Weight for opcode embeddings"),
+  cl::cat(IR2VecCategory));
+LLVM_ABI cl::opt<float> TypeWeight("ir2vec-type-weight", cl::Optional,
+   cl::init(0.5),
+   cl::desc("Weight for type embeddings"),
+   cl::cat(IR2VecCategory));
+LLVM_ABI cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional,
+  cl::init(0.2),
+  cl::desc("Weight for argument embeddings"),
+  cl::cat(IR2VecCategory));
+} // namespace ir2vec
+} // namespace llvm
 
 AnalysisKey IR2VecVocabAnalysis::Key;
 
@@ -251,9 +253,9 @@ bool IR2VecVocabResult::invalidate(
 // by auto-generating a default vocabulary during the build time.
 Error IR2VecVocabAnalysis::readVocabulary() {
   auto BufOrError = MemoryBuffer::getFileOrSTDIN(VocabFile, /*IsText=*/true);
-  if (!BufOrError) {
+  if (!B

  1   2   3   >