usaxena95 created this revision. Herald added subscribers: cfe-commits, kadircet, arphaman, jkorous, MaskRay, ilya-biryukov, mgorny. Herald added a project: clang.
[WIP] - Proposes a JSON format for representing a Random Forest model. - Proposes a way to test the generated runtime using a test model. TODO: - Add a generated source code snippet for easier review. - Fix the unused-label warning. - Figure out the required using-declarations for CATEGORICAL columns from features.json. - Make the necessary Google3-internal modifications for blaze before landing. - Add documentation for the format of the model. - Document more. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D83814 Files: clang-tools-extra/clangd/CMakeLists.txt clang-tools-extra/clangd/CompletionModelCodegen.py clang-tools-extra/clangd/model/features.json clang-tools-extra/clangd/model/tree.json clang-tools-extra/clangd/unittests/CMakeLists.txt clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp clang-tools-extra/clangd/unittests/model/features.json clang-tools-extra/clangd/unittests/model/tree.json
Index: clang-tools-extra/clangd/unittests/model/tree.json =================================================================== --- /dev/null +++ clang-tools-extra/clangd/unittests/model/tree.json @@ -0,0 +1,52 @@ +[ + { + "operation": "if_greater", + "feature": "NumReferences", + "threshold": 200.0, + "then": { + "operation": "if_greater", + "feature": "FileProximityDistance", + "threshold": -1, + "then": { + "operation": "boost", + "score": 10.0 + }, + "else": { + "operation": "boost", + "score": -20.0 + } + }, + "else": { + "operation": "if_member", + "feature": "ContextKind", + "set": [ + "Kind::CCC_DotMemberAccess", + "Kind::CCC_ArrowMemberAccess" + ], + "then": { + "operation": "boost", + "score": 3.0 + }, + "else": { + "operation": "boost", + "score": -4.0 + } + } + }, + { + "operation": "if_member", + "feature": "ContextKind", + "set": [ + "Kind::CCC_Namespace", + "Kind::CCC_ArrowMemberAccess" + ], + "then": { + "operation": "boost", + "score": 5.0 + }, + "else": { + "operation": "boost", + "score": -6.0 + } + } +] \ No newline at end of file Index: clang-tools-extra/clangd/unittests/model/features.json =================================================================== --- /dev/null +++ clang-tools-extra/clangd/unittests/model/features.json @@ -0,0 +1,14 @@ +[ + { + "name": "NumReferences", + "type": "NUMERICAL" + }, + { + "name": "FileProximityDistance", + "type": "NUMERICAL" + }, + { + "name": "ContextKind", + "type": "CATEGORICAL" + } +] \ No newline at end of file Index: clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp =================================================================== --- clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -10,6 +10,8 @@ #include "ClangdServer.h" #include "CodeComplete.h" #include "Compiler.h" +#include "CompletionModel.h" +#include "CompletionModelTest.h" #include "Matchers.h" #include "Protocol.h" #include "Quality.h" @@ -47,6 +49,7 @@ using 
::testing::IsEmpty; using ::testing::Not; using ::testing::UnorderedElementsAre; +using ContextKind = CodeCompletionContext::Kind; // GMock helpers for matching completion items. MATCHER_P(Named, Name, "") { return arg.Name == Name; } @@ -161,6 +164,36 @@ return S; } +TEST(DecisionForestRuntime, Evaluate) { + using Example = clangd::test::Example; + using clangd::test::Evaluate; + + Example E; + E.SetNumReferences(200); // True + E.SetFileProximityDistance(0); // True: +10.0 + E.SetContextKind(ContextKind::CCC_ArrowMemberAccess); // True: +5.0 + EXPECT_EQ(Evaluate(E), 15.0); + + E.SetNumReferences(200); // True + E.SetFileProximityDistance(-2); // False: -20.0 + E.SetContextKind(ContextKind::CCC_Namespace); // True: +5.0 + EXPECT_EQ(Evaluate(E), -15.0); + + E.SetNumReferences(100); // False + E.SetContextKind(ContextKind::CCC_DotMemberAccess); // True: +3.0, False: -6.0 + EXPECT_EQ(Evaluate(E), -3.0); +} + +TEST(DecisionForestRuntime, SanityTest) { + using Example = clangd::Example; + using clangd::Evaluate; + Example E1; + E1.SetContextKind(ContextKind::CCC_ArrowMemberAccess); + Example E2; + E2.SetContextKind(ContextKind::CCC_SymbolOrNewName); + EXPECT_GT(Evaluate(E1), Evaluate(E2)); +} + TEST(CompletionTest, Limit) { clangd::CodeCompleteOptions Opts; Opts.Limit = 2; Index: clang-tools-extra/clangd/unittests/CMakeLists.txt =================================================================== --- clang-tools-extra/clangd/unittests/CMakeLists.txt +++ clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -28,6 +28,27 @@ set(REMOTE_TEST_SOURCES remote/MarshallingTests.cpp) endif() + +set(output_dir ${CMAKE_BINARY_DIR}/generated/test) +set(output_name CompletionModelTest) +set(model_h ${output_dir}/${output_name}.h) +set(model_cpp ${output_dir}/${output_name}.cpp) + +set(model_gen ${CMAKE_CURRENT_SOURCE_DIR}/../CompletionModelCodegen.py) +set(model_json ${CMAKE_CURRENT_SOURCE_DIR}/model/tree.json) +set(features_json ${CMAKE_CURRENT_SOURCE_DIR}/model/features.json) + 
+add_custom_command(OUTPUT ${model_cpp} ${model_h} + COMMAND "${Python3_EXECUTABLE}" ${model_gen} --model=${model_json} --features=${features_json} --output_dir=${output_dir} --output_name=${output_name} --test + COMMENT "Generating code completion model runtime..." + DEPENDS ${model_gen} ${model_json} + VERBATIM ) + +set_source_files_properties("${model_h}" + PROPERTIES GENERATED TRUE) +set_source_files_properties("${model_cpp}" + PROPERTIES GENERATED TRUE) + add_custom_target(ClangdUnitTests) add_unittest(ClangdUnitTests ClangdTests Annotations.cpp @@ -87,6 +108,7 @@ TweakTesting.cpp URITests.cpp XRefsTests.cpp + ${model_cpp} support/CancellationTests.cpp support/ContextTests.cpp @@ -101,6 +123,10 @@ $<TARGET_OBJECTS:obj.clangDaemonTweaks> ) +target_include_directories(ClangdTests PUBLIC + $<BUILD_INTERFACE:${output_dir}> +) + clang_target_link_libraries(ClangdTests PRIVATE clangAST Index: clang-tools-extra/clangd/model/tree.json =================================================================== --- /dev/null +++ clang-tools-extra/clangd/model/tree.json @@ -0,0 +1,18 @@ +[ + { + "operation": "if_member", + "feature": "ContextKind", + "set": [ + "Kind::CCC_DotMemberAccess", + "Kind::CCC_ArrowMemberAccess" + ], + "then": { + "operation": "boost", + "score": 3.0 + }, + "else": { + "operation": "boost", + "score": 1.0 + } + } +] \ No newline at end of file Index: clang-tools-extra/clangd/model/features.json =================================================================== --- /dev/null +++ clang-tools-extra/clangd/model/features.json @@ -0,0 +1,6 @@ +[ + { + "name": "ContextKind", + "type": "CATEGORICAL" + } +] \ No newline at end of file Index: clang-tools-extra/clangd/CompletionModelCodegen.py =================================================================== --- /dev/null +++ clang-tools-extra/clangd/CompletionModelCodegen.py @@ -0,0 +1,240 @@ +import argparse +import json +import struct +from dataclasses import dataclass + + +@dataclass +class Feature: + 
name: str + kind: str + + +def gen_header_code(features_json, is_test=False): + gaurd = "LLVM_CLANG_TOOLS_EXTRA_CLANGD_COMPLETION_MODEL_{}H".format( + "TEST_" if is_test else "") + code = """#ifndef {gaurd} +#define {gaurd}""".format(gaurd=gaurd) + code += """ +#include <cstdint> +#include <cstring> +#include <cstdio> +#include <limits> + +namespace clang { +namespace clangd { +""" + if is_test: + code += "namespace test {\n" + code += """namespace { +template <typename To, typename From> +To BitCast(From F) { + static_assert(sizeof(To) == sizeof(From), "bad bit_cast"); + To Result; + std::memcpy(&Result, &F, sizeof(From)); + return Result; +} + +// Produces an integer that sorts in the same order as F. +// That is: a < b <==> orderEncode(a) < orderEncode(b). +inline uint32_t OrderEncode(float F) { + static_assert(std::numeric_limits<float>::is_iec559, ""); + constexpr uint32_t TopBit = ~(~uint32_t{0} >> 1); + + // Get the bits of the float. Endianness is the same as for integers. + uint32_t U = BitCast<uint32_t>(F); + std::memcpy(&U, &F, sizeof(U)); + // IEEE 754 floats compare like sign-magnitude integers. + if (U & TopBit) // Negative float. + return 0 - U; // Map onto the low half of integers, order reversed. + return U + TopBit; // Positive floats map onto the high half of integers. +} +} // namespace + +""" + + features = [] + assert isinstance(features_json, list) + for feature in features_json: + features.append(Feature(feature['name'], feature['type'])) + + setters = [] + class_members = [] + for feature in features: + if feature.kind == "NUMERICAL": + setters.append( + "void Set{feature}(float V) {{ {feature} = OrderEncode(V); }}". 
+ format(feature=feature.name)) + elif feature.kind == "CATEGORICAL": + setters.append( + "void Set{feature}(unsigned V) {{ {feature} = 1<<V; }}".format( + feature=feature.name)) + else: + raise ValueError("Unkown type of feature: ", feature.kind) + class_members.append( + "uint32_t {feature} = 0;".format(feature=feature.name)) + + code += "class Example {\n" + code += "public:\n" + code += " " + "\n ".join(setters) + "\n" + code += "\n" + code += "private:\n" + code += " " + "\n ".join(class_members) + "\n" + + code += " friend float Evaluate(const Example&);\n" + code += "};\n" + code += "float Evaluate(const Example&);" + "\n" + + if is_test: + code += "} // namespace test\n" + code += "} // namespace clangd\n" + code += "} // namespace clang\n" + code += "#endif // {gaurd}".format(gaurd=gaurd) + return code + + +def order_encode(v: float): + i = struct.unpack('<I', struct.pack('<f', v))[0] + TopBit = 1 << 31 + # IEEE 754 floats compare like sign-magnitude integers. + if (i & TopBit): # Negative float + return (1 << 32) - i # low half of integers, order reversed. 
+ return TopBit + i # top half of integers + + +class Tree: + def __init__(self, json_tree, tree_num: int, node_num: int): + self.operation = json_tree['operation'] + self.tree_num = tree_num + self.node_num = node_num + self.label = "t{0}_n{1}".format(tree_num, node_num) + + if self.operation == 'boost': + self.score = json_tree['score'] + self.size = 1 + return + + self.feature = json_tree['feature'] + if self.operation == 'if_greater': + self.threshold = json_tree['threshold'] + elif self.operation == 'if_member': + self.members = json_tree['set'] + assert isinstance(self.members, list) + else: + raise ValueError("Unknown value for operation: ", self.operation) + + self.false = Tree(json_tree['else'], + tree_num=tree_num, + node_num=node_num + 1) + self.true = Tree(json_tree['then'], + tree_num=tree_num, + node_num=node_num + self.false.size + 1) + self.size = 1 + self.true.size + self.false.size + + def codegen(self): + code = [] + if self.node_num == 0: + code.append("tree_{0}:".format(self.tree_num)) + + if self.operation == "boost": + code.append( + "{label}: Score += {score}; goto tree_{next_tree};".format( + label=self.label, + score=self.score, + next_tree=self.tree_num + 1)) + return code + + if self.operation == "if_greater": + code.append( + "{label}: if(E.{feature} >= {encoded} /*{threshold}*/) goto {true_label};" + .format(label=self.label, + feature=self.feature, + encoded=order_encode(self.threshold), + threshold=self.threshold, + true_label=self.true.label)) + if self.operation == "if_member": + members = '|'.join( + ["BIT({})".format(member) for member in self.members]) + code.append( + "{label}: if(E.{feature} & ({members})) goto {true_label};". + format(label=self.label, + feature=self.feature, + members=members, + true_label=self.true.label)) + return code + self.false.codegen() + self.true.codegen() + + +def gen_evaluate_func(forest_json): + assert isinstance(forest_json, list) + # Generate code for Random Forest. 
+ code = "float Evaluate(const Example& E) {\n" + tree_num = 0 + lines = [] + lines.append("float Score = 0;") + for tree_json in forest_json: + lines += Tree(tree_json, tree_num=tree_num, node_num=0).codegen() + tree_num += 1 + lines.append("tree_{}: // No such tree.".format(tree_num)) + lines.append("return Score;") + + code += " " + "\n ".join(lines) + code += "\n}" + return code + + +def gen_cpp_code(forest_json, header, is_test): + code = "" + # Headers + code = '#include "{header}"\n'.format(header=header) + code += """#include "clang/Sema/CodeCompleteConsumer.h" +#define BIT(X) (1<<X) + +namespace clang { +namespace clangd { +""" + # Namespaces + if is_test: + code += "namespace test {\n" + code += "using Kind=CodeCompletionContext::Kind;\n" + code += "\n" + + code += gen_evaluate_func(forest_json) + "\n" + if is_test: + code += "} // namespace test\n" + code += "} // namespace clangd\n" + code += "} // namespace clang\n" + return code + + +def main(): + parser = argparse.ArgumentParser('DecisionForestCodegen') + parser.add_argument('--output_name', help='output name') + parser.add_argument('--output_dir', help='output directory') + parser.add_argument('--model', help='input json model filename') + parser.add_argument('--features', help='input json features filename') + parser.add_argument('--test', + action='store_true', + help='generate code for unittests.', + default=False) + ns = parser.parse_args() + + output_dir = ns.output_dir + output_name = ns.output_name + header_file = "{dir}/{name}.h".format(dir=output_dir, name=output_name) + cpp_file = "{dir}/{name}.cpp".format(dir=output_dir, name=output_name) + + with open(ns.model) as model_file: + with open(cpp_file, 'w+t') as output_cc: + output_cc.write( + gen_cpp_code(json.load(model_file), + header="{}.h".format(output_name), + is_test=ns.test)) + + with open(ns.features) as features_file: + with open(header_file, 'w+t') as output_h: + output_h.write( + gen_header_code(json.load(features_file), 
is_test=ns.test)) + + +if __name__ == '__main__': + main() \ No newline at end of file Index: clang-tools-extra/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/clangd/CMakeLists.txt +++ clang-tools-extra/clangd/CMakeLists.txt @@ -27,6 +27,26 @@ FrontendOpenMP ) +set(output_dir ${CMAKE_BINARY_DIR}/generated) +set(output_name CompletionModel) +set(model_h ${output_dir}/${output_name}.h) +set(model_cpp ${output_dir}/${output_name}.cpp) + +set(model_gen ${CMAKE_CURRENT_SOURCE_DIR}/CompletionModelCodegen.py) +set(model_json ${CMAKE_CURRENT_SOURCE_DIR}/model/tree.json) +set(features_json ${CMAKE_CURRENT_SOURCE_DIR}/model/features.json) + +add_custom_command(OUTPUT ${model_cpp} ${model_h} + COMMAND "${Python3_EXECUTABLE}" ${model_gen} --model=${model_json} --features=${features_json} --output_dir=${output_dir} --output_name=${output_name} + COMMENT "Generating code completion model runtime..." + DEPENDS ${model_gen} ${model_json} + VERBATIM ) + +set_source_files_properties("${model_h}" + PROPERTIES GENERATED TRUE) +set_source_files_properties("${model_cpp}" + PROPERTIES GENERATED TRUE) + add_clang_library(clangDaemon AST.cpp ClangdLSPServer.cpp @@ -69,6 +89,7 @@ TUScheduler.cpp URI.cpp XRefs.cpp + ${model_cpp} index/Background.cpp index/BackgroundIndexLoader.cpp @@ -109,6 +130,10 @@ omp_gen ) +target_include_directories(clangDaemon PUBLIC + $<BUILD_INTERFACE:${output_dir}> +) + clang_target_link_libraries(clangDaemon PRIVATE clangAST
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits