[PATCH] D83814: [clangd] Add Random Forest runtime for code completion.

UTKARSH SAXENA via Phabricator via cfe-commits Tue, 14 Jul 2020 14:33:13 -0700

usaxena95 created this revision.
Herald added subscribers: cfe-commits, kadircet, arphaman, jkorous, MaskRay, 
ilya-biryukov, mgorny.
Herald added a project: clang.


[WIP]

- Proposes a JSON format for representing a Random Forest model.
- Proposes a way to test the generated runtime using a test model.

TODO:

- Add generated source code snippet for easier review.
- Fix unused label warning.
- Figure out the required using declarations for CATEGORICAL columns from 
features.json.
- Make the necessary Google3-internal modifications for Blaze before landing.
- Add documentation for format of the model.
- Document more.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D83814

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/CompletionModelCodegen.py
  clang-tools-extra/clangd/model/features.json
  clang-tools-extra/clangd/model/tree.json
  clang-tools-extra/clangd/unittests/CMakeLists.txt
  clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
  clang-tools-extra/clangd/unittests/model/features.json
  clang-tools-extra/clangd/unittests/model/tree.json

Index: clang-tools-extra/clangd/unittests/model/tree.json
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/unittests/model/tree.json
@@ -0,0 +1,52 @@
+[
+    {
+        "operation": "if_greater",
+        "feature": "NumReferences",
+        "threshold": 200.0,
+        "then": {
+            "operation": "if_greater",
+            "feature": "FileProximityDistance",
+            "threshold": -1,
+            "then": {
+                "operation": "boost",
+                "score": 10.0
+            },
+            "else": {
+                "operation": "boost",
+                "score": -20.0
+            }
+        },
+        "else": {
+            "operation": "if_member",
+            "feature": "ContextKind",
+            "set": [
+                "Kind::CCC_DotMemberAccess",
+                "Kind::CCC_ArrowMemberAccess"
+            ],
+            "then": {
+                "operation": "boost",
+                "score": 3.0
+            },
+            "else": {
+                "operation": "boost",
+                "score": -4.0
+            }
+        }
+    },
+    {
+        "operation": "if_member",
+        "feature": "ContextKind",
+        "set": [
+            "Kind::CCC_Namespace",
+            "Kind::CCC_ArrowMemberAccess"
+        ],
+        "then": {
+            "operation": "boost",
+            "score": 5.0
+        },
+        "else": {
+            "operation": "boost",
+            "score": -6.0
+        }
+    }
+]
\ No newline at end of file
Index: clang-tools-extra/clangd/unittests/model/features.json
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/unittests/model/features.json
@@ -0,0 +1,14 @@
+[
+    {
+        "name": "NumReferences",
+        "type": "NUMERICAL"
+    },
+    {
+        "name": "FileProximityDistance",
+        "type": "NUMERICAL"
+    },
+    {
+        "name": "ContextKind",
+        "type": "CATEGORICAL"
+    }
+]
\ No newline at end of file
Index: clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
+++ clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
@@ -10,6 +10,8 @@
 #include "ClangdServer.h"
 #include "CodeComplete.h"
 #include "Compiler.h"
+#include "CompletionModel.h"
+#include "CompletionModelTest.h"
 #include "Matchers.h"
 #include "Protocol.h"
 #include "Quality.h"
@@ -47,6 +49,7 @@
 using ::testing::IsEmpty;
 using ::testing::Not;
 using ::testing::UnorderedElementsAre;
+using ContextKind = CodeCompletionContext::Kind;
 
 // GMock helpers for matching completion items.
 MATCHER_P(Named, Name, "") { return arg.Name == Name; }
@@ -161,6 +164,36 @@
   return S;
 }
 
+TEST(DecisionForestRuntime, Evaluate) {
+  using Example = clangd::test::Example;
+  using clangd::test::Evaluate;
+
+  Example E;
+  E.SetNumReferences(200);                              // True
+  E.SetFileProximityDistance(0);                        // True: +10.0
+  E.SetContextKind(ContextKind::CCC_ArrowMemberAccess); // True: +5.0
+  EXPECT_EQ(Evaluate(E), 15.0);
+
+  E.SetNumReferences(200);                      // True
+  E.SetFileProximityDistance(-2);               // False: -20.0
+  E.SetContextKind(ContextKind::CCC_Namespace); // True: +5.0
+  EXPECT_EQ(Evaluate(E), -15.0);
+
+  E.SetNumReferences(100);                            // False
+  E.SetContextKind(ContextKind::CCC_DotMemberAccess); // True: +3.0, False: -6.0
+  EXPECT_EQ(Evaluate(E), -3.0);
+}
+
+TEST(DecisionForestRuntime, SanityTest) {
+  using Example = clangd::Example;
+  using clangd::Evaluate;
+  Example E1;
+  E1.SetContextKind(ContextKind::CCC_ArrowMemberAccess);
+  Example E2;
+  E2.SetContextKind(ContextKind::CCC_SymbolOrNewName);
+  EXPECT_GT(Evaluate(E1), Evaluate(E2));
+}
+
 TEST(CompletionTest, Limit) {
   clangd::CodeCompleteOptions Opts;
   Opts.Limit = 2;
Index: clang-tools-extra/clangd/unittests/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/unittests/CMakeLists.txt
+++ clang-tools-extra/clangd/unittests/CMakeLists.txt
@@ -28,6 +28,27 @@
   set(REMOTE_TEST_SOURCES remote/MarshallingTests.cpp)
 endif()
 
+
+set(output_dir ${CMAKE_BINARY_DIR}/generated/test)
+set(output_name CompletionModelTest)
+set(model_h ${output_dir}/${output_name}.h)
+set(model_cpp ${output_dir}/${output_name}.cpp)
+
+set(model_gen ${CMAKE_CURRENT_SOURCE_DIR}/../CompletionModelCodegen.py)
+set(model_json ${CMAKE_CURRENT_SOURCE_DIR}/model/tree.json)
+set(features_json ${CMAKE_CURRENT_SOURCE_DIR}/model/features.json)
+
+add_custom_command(OUTPUT ${model_cpp} ${model_h}
+  COMMAND "${Python3_EXECUTABLE}" ${model_gen} --model=${model_json}  --features=${features_json} --output_dir=${output_dir} --output_name=${output_name} --test
+  COMMENT "Generating code completion model runtime..."
+  DEPENDS ${model_gen} ${model_json}
+  VERBATIM )
+
+set_source_files_properties("${model_h}"
+  PROPERTIES GENERATED TRUE)
+set_source_files_properties("${model_cpp}"
+  PROPERTIES GENERATED TRUE)
+
 add_custom_target(ClangdUnitTests)
 add_unittest(ClangdUnitTests ClangdTests
   Annotations.cpp
@@ -87,6 +108,7 @@
   TweakTesting.cpp
   URITests.cpp
   XRefsTests.cpp
+  ${model_cpp}
 
   support/CancellationTests.cpp
   support/ContextTests.cpp
@@ -101,6 +123,10 @@
   $<TARGET_OBJECTS:obj.clangDaemonTweaks>
   )
 
+target_include_directories(ClangdTests PUBLIC
+  $<BUILD_INTERFACE:${output_dir}>
+)
+
 clang_target_link_libraries(ClangdTests
   PRIVATE
   clangAST
Index: clang-tools-extra/clangd/model/tree.json
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/model/tree.json
@@ -0,0 +1,18 @@
+[
+    {
+        "operation": "if_member",
+        "feature": "ContextKind",
+        "set": [
+            "Kind::CCC_DotMemberAccess",
+            "Kind::CCC_ArrowMemberAccess"
+        ],
+        "then": {
+            "operation": "boost",
+            "score": 3.0
+        },
+        "else": {
+            "operation": "boost",
+            "score": 1.0
+        }
+    }
+]
\ No newline at end of file
Index: clang-tools-extra/clangd/model/features.json
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/model/features.json
@@ -0,0 +1,6 @@
+[
+    {
+        "name": "ContextKind",
+        "type": "CATEGORICAL"
+    }
+]
\ No newline at end of file
Index: clang-tools-extra/clangd/CompletionModelCodegen.py
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/CompletionModelCodegen.py
@@ -0,0 +1,240 @@
+import argparse
+import json
+import struct
+from dataclasses import dataclass
+
+
+@dataclass
+class Feature:
+    name: str
+    kind: str
+
+
+def gen_header_code(features_json, is_test=False):
+    gaurd = "LLVM_CLANG_TOOLS_EXTRA_CLANGD_COMPLETION_MODEL_{}H".format(
+        "TEST_" if is_test else "")
+    code = """#ifndef {gaurd}
+#define {gaurd}""".format(gaurd=gaurd)
+    code += """
+#include <cstdint>
+#include <cstring>
+#include <cstdio>
+#include <limits>
+
+namespace clang {
+namespace clangd {
+"""
+    if is_test:
+        code += "namespace test {\n"
+    code += """namespace {
+template <typename To, typename From>
+To BitCast(From F) {
+  static_assert(sizeof(To) == sizeof(From), "bad bit_cast");
+  To Result;
+  std::memcpy(&Result, &F, sizeof(From));
+  return Result;
+}
+
+// Produces an integer that sorts in the same order as F.
+// That is: a < b <==> orderEncode(a) < orderEncode(b).
+inline uint32_t OrderEncode(float F) {
+  static_assert(std::numeric_limits<float>::is_iec559, "");
+  constexpr uint32_t TopBit = ~(~uint32_t{0} >> 1);
+
+  // Get the bits of the float. Endianness is the same as for integers.
+  uint32_t U = BitCast<uint32_t>(F);
+  std::memcpy(&U, &F, sizeof(U));
+  // IEEE 754 floats compare like sign-magnitude integers.
+  if (U & TopBit)    // Negative float.
+    return 0 - U;    // Map onto the low half of integers, order reversed.
+  return U + TopBit; // Positive floats map onto the high half of integers.
+}
+} // namespace
+
+"""
+
+    features = []
+    assert isinstance(features_json, list)
+    for feature in features_json:
+        features.append(Feature(feature['name'], feature['type']))
+
+    setters = []
+    class_members = []
+    for feature in features:
+        if feature.kind == "NUMERICAL":
+            setters.append(
+                "void Set{feature}(float V) {{ {feature} = OrderEncode(V); }}".
+                format(feature=feature.name))
+        elif feature.kind == "CATEGORICAL":
+            setters.append(
+                "void Set{feature}(unsigned V) {{ {feature} = 1<<V; }}".format(
+                    feature=feature.name))
+        else:
+            raise ValueError("Unkown type of feature: ", feature.kind)
+        class_members.append(
+            "uint32_t {feature} = 0;".format(feature=feature.name))
+
+    code += "class Example {\n"
+    code += "public:\n"
+    code += "  " + "\n  ".join(setters) + "\n"
+    code += "\n"
+    code += "private:\n"
+    code += "  " + "\n  ".join(class_members) + "\n"
+
+    code += "  friend float Evaluate(const Example&);\n"
+    code += "};\n"
+    code += "float Evaluate(const Example&);" + "\n"
+
+    if is_test:
+        code += "} // namespace test\n"
+    code += "} // namespace clangd\n"
+    code += "} // namespace clang\n"
+    code += "#endif // {gaurd}".format(gaurd=gaurd)
+    return code
+
+
+def order_encode(v: float):
+    i = struct.unpack('<I', struct.pack('<f', v))[0]
+    TopBit = 1 << 31
+    # IEEE 754 floats compare like sign-magnitude integers.
+    if (i & TopBit):  # Negative float
+        return (1 << 32) - i  # low half of integers, order reversed.
+    return TopBit + i  # top half of integers
+
+
+class Tree:
+    def __init__(self, json_tree, tree_num: int, node_num: int):
+        self.operation = json_tree['operation']
+        self.tree_num = tree_num
+        self.node_num = node_num
+        self.label = "t{0}_n{1}".format(tree_num, node_num)
+
+        if self.operation == 'boost':
+            self.score = json_tree['score']
+            self.size = 1
+            return
+
+        self.feature = json_tree['feature']
+        if self.operation == 'if_greater':
+            self.threshold = json_tree['threshold']
+        elif self.operation == 'if_member':
+            self.members = json_tree['set']
+            assert isinstance(self.members, list)
+        else:
+            raise ValueError("Unknown value for operation: ", self.operation)
+
+        self.false = Tree(json_tree['else'],
+                          tree_num=tree_num,
+                          node_num=node_num + 1)
+        self.true = Tree(json_tree['then'],
+                         tree_num=tree_num,
+                         node_num=node_num + self.false.size + 1)
+        self.size = 1 + self.true.size + self.false.size
+
+    def codegen(self):
+        code = []
+        if self.node_num == 0:
+            code.append("tree_{0}:".format(self.tree_num))
+
+        if self.operation == "boost":
+            code.append(
+                "{label}: Score += {score}; goto tree_{next_tree};".format(
+                    label=self.label,
+                    score=self.score,
+                    next_tree=self.tree_num + 1))
+            return code
+
+        if self.operation == "if_greater":
+            code.append(
+                "{label}: if(E.{feature} >= {encoded} /*{threshold}*/) goto {true_label};"
+                .format(label=self.label,
+                        feature=self.feature,
+                        encoded=order_encode(self.threshold),
+                        threshold=self.threshold,
+                        true_label=self.true.label))
+        if self.operation == "if_member":
+            members = '|'.join(
+                ["BIT({})".format(member) for member in self.members])
+            code.append(
+                "{label}: if(E.{feature} & ({members})) goto {true_label};".
+                format(label=self.label,
+                       feature=self.feature,
+                       members=members,
+                       true_label=self.true.label))
+        return code + self.false.codegen() + self.true.codegen()
+
+
+def gen_evaluate_func(forest_json):
+    assert isinstance(forest_json, list)
+    # Generate code for Random Forest.
+    code = "float Evaluate(const Example& E) {\n"
+    tree_num = 0
+    lines = []
+    lines.append("float Score = 0;")
+    for tree_json in forest_json:
+        lines += Tree(tree_json, tree_num=tree_num, node_num=0).codegen()
+        tree_num += 1
+    lines.append("tree_{}: // No such tree.".format(tree_num))
+    lines.append("return Score;")
+
+    code += "  " + "\n  ".join(lines)
+    code += "\n}"
+    return code
+
+
+def gen_cpp_code(forest_json, header, is_test):
+    code = ""
+    # Headers
+    code = '#include "{header}"\n'.format(header=header)
+    code += """#include "clang/Sema/CodeCompleteConsumer.h"
+#define BIT(X) (1<<X)
+
+namespace clang {
+namespace clangd {
+"""
+    # Namespaces
+    if is_test:
+        code += "namespace test {\n"
+    code += "using Kind=CodeCompletionContext::Kind;\n"
+    code += "\n"
+
+    code += gen_evaluate_func(forest_json) + "\n"
+    if is_test:
+        code += "} // namespace test\n"
+    code += "} // namespace clangd\n"
+    code += "} // namespace clang\n"
+    return code
+
+
+def main():
+    parser = argparse.ArgumentParser('DecisionForestCodegen')
+    parser.add_argument('--output_name', help='output name')
+    parser.add_argument('--output_dir', help='output directory')
+    parser.add_argument('--model', help='input json model filename')
+    parser.add_argument('--features', help='input json features filename')
+    parser.add_argument('--test',
+                        action='store_true',
+                        help='generate code for unittests.',
+                        default=False)
+    ns = parser.parse_args()
+
+    output_dir = ns.output_dir
+    output_name = ns.output_name
+    header_file = "{dir}/{name}.h".format(dir=output_dir, name=output_name)
+    cpp_file = "{dir}/{name}.cpp".format(dir=output_dir, name=output_name)
+
+    with open(ns.model) as model_file:
+        with open(cpp_file, 'w+t') as output_cc:
+            output_cc.write(
+                gen_cpp_code(json.load(model_file),
+                             header="{}.h".format(output_name),
+                             is_test=ns.test))
+
+    with open(ns.features) as features_file:
+        with open(header_file, 'w+t') as output_h:
+            output_h.write(
+                gen_header_code(json.load(features_file), is_test=ns.test))
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -27,6 +27,26 @@
   FrontendOpenMP
   )
 
+set(output_dir ${CMAKE_BINARY_DIR}/generated)
+set(output_name CompletionModel)
+set(model_h ${output_dir}/${output_name}.h)
+set(model_cpp ${output_dir}/${output_name}.cpp)
+
+set(model_gen ${CMAKE_CURRENT_SOURCE_DIR}/CompletionModelCodegen.py)
+set(model_json ${CMAKE_CURRENT_SOURCE_DIR}/model/tree.json)
+set(features_json ${CMAKE_CURRENT_SOURCE_DIR}/model/features.json)
+
+add_custom_command(OUTPUT ${model_cpp} ${model_h}
+  COMMAND "${Python3_EXECUTABLE}" ${model_gen} --model=${model_json}  --features=${features_json} --output_dir=${output_dir} --output_name=${output_name}
+  COMMENT "Generating code completion model runtime..."
+  DEPENDS ${model_gen} ${model_json}
+  VERBATIM )
+
+set_source_files_properties("${model_h}"
+  PROPERTIES GENERATED TRUE)
+set_source_files_properties("${model_cpp}"
+  PROPERTIES GENERATED TRUE)
+
 add_clang_library(clangDaemon
   AST.cpp
   ClangdLSPServer.cpp
@@ -69,6 +89,7 @@
   TUScheduler.cpp
   URI.cpp
   XRefs.cpp
+  ${model_cpp}
 
   index/Background.cpp
   index/BackgroundIndexLoader.cpp
@@ -109,6 +130,10 @@
   omp_gen
   )
 
+target_include_directories(clangDaemon PUBLIC
+  $<BUILD_INTERFACE:${output_dir}>
+)
+
 clang_target_link_libraries(clangDaemon
   PRIVATE
   clangAST

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D83814: [clangd] Add Random Forest runtime for code completion.

Reply via email to