(incubator-hugegraph-ai) 31/32: Add Apache-2.0 license, fix review comments

jin Thu, 30 Oct 2025 05:44:22 -0700

This is an automated email from the ASF dual-hosted git repository.

jin pushed a commit to branch text2gql
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git


commit 958fef3011bdb67ba6a881eb9e35b033b8810e51
Author: Lriver <[email protected]>
AuthorDate: Thu Oct 30 19:53:18 2025 +0800

    Add Apache-2.0 license, fix review comments
---
 text2gremlin/AST_Text2Gremlin/README.md            |  14 +-
 .../AST_Text2Gremlin/base/CombinationController.py |  77 ++++--
 text2gremlin/AST_Text2Gremlin/base/Config.py       |  35 ++-
 text2gremlin/AST_Text2Gremlin/base/GremlinBase.py  |  18 ++
 text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py  |  18 ++
 text2gremlin/AST_Text2Gremlin/base/GremlinParse.py |  18 ++
 .../AST_Text2Gremlin/base/GremlinTransVisitor.py   |  18 ++
 text2gremlin/AST_Text2Gremlin/base/Schema.py       |  58 ++---
 .../AST_Text2Gremlin/base/TraversalGenerator.py    | 184 +++++++-------
 text2gremlin/AST_Text2Gremlin/base/__init__.py     |  29 ++-
 text2gremlin/AST_Text2Gremlin/base/generator.py    |  57 +++--
 .../AST_Text2Gremlin/base/gremlin/GremlinLexer.py  |  20 +-
 .../base/gremlin/GremlinListener.py                |  20 +-
 .../AST_Text2Gremlin/base/gremlin/GremlinParser.py |  20 +-
 .../base/gremlin/GremlinVisitor.py                 |  20 +-
 .../AST_Text2Gremlin/base/gremlin/__init__.py      |  16 ++
 .../base/gremlin/antlr-4.13.1-complete.jar         | Bin 2139203 -> 0 bytes
 text2gremlin/AST_Text2Gremlin/config.json          |   4 +-
 text2gremlin/AST_Text2Gremlin/generate_corpus.py   | 209 ++++++++++++++++
 text2gremlin/AST_Text2Gremlin/output/README.md     |  74 ------
 .../output/SYNTAX_ANALYSIS_SUMMARY.md              | 178 -------------
 .../output/SYNTAX_DISTRIBUTION_REPORT.md           | 277 ---------------------
 .../output/syntax_distribution_stats.json          |  95 -------
 text2gremlin/AST_Text2Gremlin/requirements.txt     |   5 +-
 24 files changed, 653 insertions(+), 811 deletions(-)

diff --git a/text2gremlin/AST_Text2Gremlin/README.md b/text2gremlin/AST_Text2Gremlin/README.md
index 9542906b..7df02684 100644
--- a/text2gremlin/AST_Text2Gremlin/README.md
+++ b/text2gremlin/AST_Text2Gremlin/README.md
@@ -5,7 +5,7 @@
 ## 快速开始
 环境配置：python：3.12.10
 ```bash
-pip install requirements.txt
+pip install -r requirements.txt
 ```
 
 ```bash
@@ -31,7 +31,7 @@ python show_syntax_stats.py
 
 ## 项目结构
 
-```
+```text
 ├── generate_corpus.py                   # 主程序
 ├── gremlin_templates.csv                # 模板文件
 ├── config.json                          # 配置
@@ -80,11 +80,7 @@ print(f"生成了 {result['total_unique_queries']} 个查询")
 
 ### 3. 添加模板
 
-```bash
-python add_template.py
-```
-
-或直接编辑 `gremlin_templates.csv`
+直接编辑 `gremlin_templates.csv`即可
 
 ---
 
@@ -155,7 +151,7 @@ python visualize_syntax_distribution.py
 
 ### 1. 模板泛化
 从一个模板生成多个变体：
-```
+```text
 模板: g.V().hasLabel('person').out('acted_in')
 
 泛化:
@@ -177,7 +173,7 @@ python visualize_syntax_distribution.py
 
 ### 4. 中文翻译
 自动生成流畅的中文描述：
-```
+```text
 g.V().hasLabel('person').out('acted_in').has('title', 'Inception')
 ↓
 从图中开始查找所有顶点，过滤出'人'类型的顶点，沿'参演'边out方向遍历，其'标题'为'Inception'
diff --git a/text2gremlin/AST_Text2Gremlin/base/CombinationController.py 
b/text2gremlin/AST_Text2Gremlin/base/CombinationController.py
index 7923db01..445f8f38 100644
--- a/text2gremlin/AST_Text2Gremlin/base/CombinationController.py
+++ b/text2gremlin/AST_Text2Gremlin/base/CombinationController.py
@@ -1,3 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
 """
 组合爆炸控制器
 
@@ -20,21 +38,41 @@ class CombinationController:
         """
         self.config = config
         
-        # 链长度分类阈值
-        self.chain_thresholds = config['chain_thresholds']
-        
-        # 随机增强控制
-        self.random_enhancement = config['random_enhancement']
-        
-        # 数据填充策略
-        self.value_fill = config['value_fill_strategy']
-        
-        # 属性泛化策略
-        self.property_gen = config['property_generalization']
+        # 验证必要配置项并加载
+        try:
+            # 链长度分类阈值
+            self.chain_thresholds = config['chain_thresholds']
+            
+            # 随机增强控制
+            self.random_enhancement = config['random_enhancement']
+            
+            # 数据填充策略
+            self.value_fill = config['value_fill_strategy']
+            
+            # 属性泛化策略
+            self.property_gen = config['property_generalization']
+        except KeyError as e:
+            raise ValueError(f"缺少必要配置项: {e}") from None
         
-        # 总数限制
+        # 总数限制（可选）
         self.max_total = config.get('max_total_combinations', {})
         
+        # 验证关键类别的存在性
+        # chain_thresholds 只需要 short, medium, long（ultra 通过 else 分支隐式定义）
+        for category in ('short', 'medium', 'long'):
+            if category not in self.chain_thresholds:
+                raise ValueError(f"chain_thresholds 缺少 '{category}' 配置")
+        
+        # property_generalization 需要所有4个类别（包括 ultra）
+        for category in ('short', 'medium', 'long', 'ultra'):
+            if category not in self.property_gen:
+                raise ValueError(f"property_generalization 缺少 '{category}' 配置")
+            # 验证每个类别的必要字段
+            required_fields = ['full_coverage_threshold', 'additional_random_min', 'additional_random_max']
+            for field in required_fields:
+                if field not in self.property_gen[category]:
+                    raise ValueError(f"property_generalization.{category} 缺少 '{field}' 字段")
+        
     def get_chain_category(self, step_count: int) -> str:
         """
         根据步骤数确定链长度类别
@@ -132,7 +170,7 @@ class CombinationController:
         # 2. 判断是否全部遍历
         if len(all_options) <= strategy['full_coverage_threshold']:
             # 同级选项少，全部遍历
-            return all_options
+            return list(all_options)
         
         # 3. 同级选项多，随机选择额外的
         additional_count = random.randint(
@@ -178,9 +216,11 @@ class CombinationController:
         max_combinations = schema_config.get(chain_category, {}).get('max_combinations', 1)
         
         combinations = []
+        seen = set()  # 用于去重的集合
         
         # 1. 保留原配方组合
         combinations.append(recipe_params.copy())
+        seen.add(tuple(sorted(recipe_params)))
         
         if max_combinations <= 1:
             return combinations
@@ -210,24 +250,25 @@ class CombinationController:
             # 随机选择同数量的参数
             combo = random.sample(other_options, param_count)
             
-            # 避免重复组合
-            if combo not in combinations:
+            # 使用排序后的元组作为key进行去重（因为参数顺序不影响语义）
+            key = tuple(sorted(combo))
+            if key not in seen:
+                seen.add(key)
                 combinations.append(combo)
             
             attempts += 1
         
         return combinations
     
-    def get_multi_param_value_fill_count(self, param_count: int, is_terminal: bool) -> int:
+    def get_multi_param_value_fill_count(self, is_terminal: bool) -> int:
         """
         多参数数据值填充次数控制
         
         Args:
-            param_count: 参数个数
             is_terminal: 是否是终端步骤
             
         Returns:
-            填充次数（每次填充param_count个值）
+            填充次数（每次填充的值个数由调用方根据参数个数决定）
         """
         multi_config = self.config.get('multi_param_strategy', {})
         value_config = multi_config.get('value_fill', {})
diff --git a/text2gremlin/AST_Text2Gremlin/base/Config.py 
b/text2gremlin/AST_Text2Gremlin/base/Config.py
index 21e93eb5..dd67d42c 100644
--- a/text2gremlin/AST_Text2Gremlin/base/Config.py
+++ b/text2gremlin/AST_Text2Gremlin/base/Config.py
@@ -1,5 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
 """
-配置管理模块。
+项目配置管理模块。
 
 负责加载和管理项目配置文件，提供各模块所需的配置参数。
 """
@@ -19,8 +37,13 @@ class Config:
         self.db_id = self.config_data.get("db_id")
 
     def load_config(self):
-        with open(self.file_path, "r") as file:
-            return json.load(file)
+        try:
+            with open(self.file_path, "r", encoding="utf-8") as file:
+                return json.load(file)
+        except FileNotFoundError:
+            raise FileNotFoundError(f"配置文件不存在: {self.file_path}")
+        except json.JSONDecodeError as e:
+            raise ValueError(f"配置文件 JSON 格式错误: {self.file_path}, 错误: {e}")
 
     def get_input_query_path(self):
         return self.config_data.get("input_query_path")
@@ -59,7 +82,11 @@ class Config:
 
     def get_schema_path(self, db_id):
         schema_dict = self.config_data.get("db_schema_path")
-        return schema_dict[db_id]  # todo error check
+        if not schema_dict:
+            raise ValueError("配置中缺少 'db_schema_path' 字段")
+        if db_id not in schema_dict:
+            raise KeyError(f"未找到 db_id '{db_id}' 对应的 schema 路径")
+        return schema_dict[db_id]
 
     def get_config(self, module_name):
         return self.config_data.get(module_name)
diff --git a/text2gremlin/AST_Text2Gremlin/base/GremlinBase.py 
b/text2gremlin/AST_Text2Gremlin/base/GremlinBase.py
index 998dce9d..40fa6025 100644
--- a/text2gremlin/AST_Text2Gremlin/base/GremlinBase.py
+++ b/text2gremlin/AST_Text2Gremlin/base/GremlinBase.py
@@ -1,3 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
 """
 Gremlin翻译引擎模块。
 
diff --git a/text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py 
b/text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py
index a2420cee..1b9e4a20 100644
--- a/text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py
+++ b/text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py
@@ -1,3 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
 """
 Gremlin复杂表达式定义模块。
 
diff --git a/text2gremlin/AST_Text2Gremlin/base/GremlinParse.py 
b/text2gremlin/AST_Text2Gremlin/base/GremlinParse.py
index 07a99e42..feef3c58 100644
--- a/text2gremlin/AST_Text2Gremlin/base/GremlinParse.py
+++ b/text2gremlin/AST_Text2Gremlin/base/GremlinParse.py
@@ -1,3 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
 """
 Gremlin查询结构化表示模块。
 
diff --git a/text2gremlin/AST_Text2Gremlin/base/GremlinTransVisitor.py 
b/text2gremlin/AST_Text2Gremlin/base/GremlinTransVisitor.py
index 502b0862..18308f72 100644
--- a/text2gremlin/AST_Text2Gremlin/base/GremlinTransVisitor.py
+++ b/text2gremlin/AST_Text2Gremlin/base/GremlinTransVisitor.py
@@ -1,3 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
 """
 Gremlin查询AST访问器模块。
 
diff --git a/text2gremlin/AST_Text2Gremlin/base/Schema.py 
b/text2gremlin/AST_Text2Gremlin/base/Schema.py
index 80a74d2c..ad675fd1 100644
--- a/text2gremlin/AST_Text2Gremlin/base/Schema.py
+++ b/text2gremlin/AST_Text2Gremlin/base/Schema.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 
 """
 图数据库Schema管理模块。
@@ -7,9 +24,6 @@
 
 import os
 import json
-import pandas as pd
-from typing import List, Dict
-import json
 import random
 import pandas as pd
 from typing import List, Dict, Any, Tuple
@@ -135,8 +149,14 @@ class Schema:
 
     def get_step_result_label(self, start_label: str, step: Dict) -> 
Tuple[str, str]:
         step_name, step_param = step.get('step'), step.get('param')
-        if step_name == 'out': return self.edges[step_param]['destination'], 
'vertex'
-        if step_name == 'in': return self.edges[step_param]['source'], 'vertex'
+        if step_name == 'out':
+            if step_param not in self.edges:
+                raise KeyError(f"边标签 '{step_param}' 不存在于 schema 中")
+            return self.edges[step_param]['destination'], 'vertex'
+        if step_name == 'in':
+            if step_param not in self.edges:
+                raise KeyError(f"边标签 '{step_param}' 不存在于 schema 中")
+            return self.edges[step_param]['source'], 'vertex'
         if step_name in ['properties', 'has', 'values']: return start_label, 
'vertex'
         return None, None
 
@@ -171,7 +191,6 @@ class Schema:
         Returns:
             实例列表
         """
-        import random
         
         is_edge = label in self.edges
         data_cache = self.edge_data if is_edge else self.vertex_data
@@ -194,30 +213,3 @@ class Schema:
         # 随机采样
         sampled_df = df.sample(actual_count)
         return sampled_df.to_dict('records')
-
-# --- 单模块测试入口 ---
-if __name__ == "__main__":
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    project_root = os.path.dirname(base_dir)
-    schema_path = os.path.join(project_root, 'db_data', 'schema', 
'movie_schema.json')
-    # 【修正】将 data_path 指向包含 movie/raw_data 的上级目录
-    data_path = os.path.join(project_root, 'db_data')
-
-    if not os.path.exists(schema_path) or not os.path.exists(data_path):
-        print("错误: 找不到 Schema 或数据文件，请检查路径。")
-    else:
-        schema = Schema(schema_path, data_path)
-        print("\n--- Schema 初始化成功 (已修复CSV读取逻辑) ---")
-
-        print("\n--- 测试数据实例获取 ---")
-        random_person = schema.get_instance('person')
-        print(f"随机获取一个 'person' 实例: {random_person}")
-        
-        random_user = schema.get_instance('user')
-        print(f"随机获取一个 'user' 实例: {random_user}")
-        
-        # 验证 name 属性是否能被正确读取
-        if random_person and 'name' in random_person:
-            print(f"成功读取到 'person' 的 name: {random_person['name']}")
-        else:
-            print("错误: 未能从 'person' 实例中读取到 name 属性。")
\ No newline at end of file
diff --git a/text2gremlin/AST_Text2Gremlin/base/TraversalGenerator.py 
b/text2gremlin/AST_Text2Gremlin/base/TraversalGenerator.py
index 7a6b7177..60593e68 100644
--- a/text2gremlin/AST_Text2Gremlin/base/TraversalGenerator.py
+++ b/text2gremlin/AST_Text2Gremlin/base/TraversalGenerator.py
@@ -1,8 +1,20 @@
-"""
-Gremlin查询生成器核心引擎。
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 
-基于递归回溯算法与数据控制策略,从结构化配方生成大量多样化的Gremlin查询及其中文描述。
-"""
 
 import os
 import random
@@ -20,37 +32,21 @@ from .CombinationController import CombinationController
 
 class TraversalGenerator:
     """Gremlin查询生成器 - 分层泛化架构"""
-    
-    # ==================== 步骤分类配置 ====================
-    #
-    # 如何添加新步骤：
-    # 1. 确定步骤类别（A-J）
-    # 2. 在对应的字典中添加步骤定义
-    # 3. 如果需要特殊逻辑，在对应的处理器方法中添加实现
-    # 4. 运行测试验证
-    #
-    # 示例：添加新的简单步骤 'explain'
-    #   1. 在 SIMPLE_STEPS 中添加: 'explain': ('解释查询', 'string')
-    #   2. 无需修改 _handle_simple_step 方法（自动处理）
-    #   3. 测试: g.V().explain()
-    # ===========================================================
-    
     # A. 简单步骤（无参数，直接生成）
-    # 翻译由GremlinBase提供
     SIMPLE_STEPS = {
         'count': {'output_type': 'number'},
         'id': {'output_type': 'value'},
         'label': {'output_type': 'string'},
         'fold': {'output_type': 'list'},
-        'unfold': {'output_type': None},  # None表示保持当前类型
+        'unfold': {'output_type': None},  
         'drop': {'output_type': 'none'},
         'iterate': {'output_type': 'none'},
-        'explain': {'output_type': 'string'},  # 返回执行计划
-        'profile': {'output_type': 'map'},  # 返回性能分析
-        'loops': {'output_type': 'number'},  # 返回循环次数
-        'value': {'output_type': 'value'},  # 获取属性值（用于属性流）
-        'identity': {'output_type': None},  # 恒等映射，保持当前类型
-        'barrier': {'output_type': None}  # 屏障，等待所有遍历者
+        'explain': {'output_type': 'string'}, 
+        'profile': {'output_type': 'map'}, 
+        'loops': {'output_type': 'number'},  
+        'value': {'output_type': 'value'}, 
+        'identity': {'output_type': None},  
+        'barrier': {'output_type': None}  
     }
     
     # B. 属性访问步骤（需要Schema + 泛化）
@@ -62,7 +58,7 @@ class TraversalGenerator:
         'key': {'output_type': 'string', 'supports_params': False}
     }
     
-    # C. 数值参数步骤（需要生成合理的数值）
+    # C. 数值参数步骤
     NUMERIC_PARAM_STEPS = {
         'limit': {'range': (1, 100)},
         'skip': {'range': (0, 50)},
@@ -142,13 +138,13 @@ class TraversalGenerator:
     }
     
     
-    # M. 边修改步骤
+    # I. 边修改步骤
     EDGE_MODIFICATION_STEPS = {
         'from': {'output_type': None, 'needs_label_or_traversal': True},
         'to': {'output_type': None, 'needs_label_or_traversal': True}
     }
     
-    # I. 谓词（用于has等步骤）
+    # J. 谓词（用于has等步骤）
     # 谓词由 Visitor 解析为 Predicate/TextPredicate 对象，在 E 层过滤步骤中处理
     PREDICATES = {
         # 数值谓词
@@ -177,7 +173,29 @@ class TraversalGenerator:
         'not': {'types': ['any']}
     }
     
-    # J. 特殊步骤（需要单独实现）
+
+    # K. 图算法步骤
+    GRAPH_ALGORITHM_STEPS = {
+        'pageRank': {'output_type': None},
+        'peerPressure': {'output_type': None},
+        'connectedComponent': {'output_type': None},
+        'shortestPath': {'output_type': None}
+    }
+    
+    # L. 工具步骤
+    UTILITY_STEPS = {
+        'math': {'output_type': 'number', 'needs_expression': True},
+        'subgraph': {'output_type': None, 'needs_key': True},
+        'timeLimit': {'output_type': None, 'needs_number': True},
+        'inject': {'output_type': None, 'multi_param': True},  # 支持多参数
+        'call': {'output_type': None, 'needs_string': True},
+        'io': {'output_type': None, 'needs_string': True},
+        'mergeE': {'output_type': None},  # 合并边
+        'mergeV': {'output_type': None},  # 合并顶点
+        'with': {'output_type': None, 'multi_param': True}  # 配置选项
+    }
+    
+    # M. 特殊步骤（需要单独实现）
     SPECIAL_STEPS = {
         # 起始步骤
         'V': {'category': 'start'},
@@ -209,26 +227,6 @@ class TraversalGenerator:
         'map': {'category': 'higher_order'},
         'local': {'category': 'higher_order'}  # 本地作用域遍历
     }
-    # K. 图算法步骤
-    GRAPH_ALGORITHM_STEPS = {
-        'pageRank': {'output_type': None},
-        'peerPressure': {'output_type': None},
-        'connectedComponent': {'output_type': None},
-        'shortestPath': {'output_type': None}
-    }
-    
-    # L. 工具步骤
-    UTILITY_STEPS = {
-        'math': {'output_type': 'number', 'needs_expression': True},
-        'subgraph': {'output_type': None, 'needs_key': True},
-        'timeLimit': {'output_type': None, 'needs_number': True},
-        'inject': {'output_type': None, 'multi_param': True},  # 支持多参数
-        'call': {'output_type': None, 'needs_string': True},
-        'io': {'output_type': None, 'needs_string': True},
-        'mergeE': {'output_type': None},  # 合并边
-        'mergeV': {'output_type': None},  # 合并顶点
-        'with': {'output_type': None, 'multi_param': True}  # 配置选项
-    }
     
     def __init__(self, schema: Schema, recipe: Traversal, gremlin_base: 
GremlinBase, 
                  controller: Optional[CombinationController] = None):
@@ -248,12 +246,10 @@ class TraversalGenerator:
         
         # 集成组合控制器
         if controller is None:
-            # 尝试加载默认配置
             try:
-                # 尝试多个可能的路径
                 possible_paths = [
-                    'combination_control_config.json',  # 当前目录
-                    os.path.join(os.path.dirname(__file__), 
'combination_control_config.json'),  # TraversalGenerator.py所在目录
+                    'combination_control_config.json',  
+                    os.path.join(os.path.dirname(__file__), 
'combination_control_config.json'),  
                 ]
                 
                 config_loaded = False
@@ -281,7 +277,6 @@ class TraversalGenerator:
         # 配方路径完成标记
         self.recipe_path_completed = False
     
-    # ==================== 主生成流程 ====================
     
     def generate(self) -> List[Tuple[str, str]]:
         """
@@ -415,7 +410,7 @@ class TraversalGenerator:
                     option['new_label'], option['new_type']
                 )
     
-    # ==================== 步骤选项生成（分发器）====================
+    # 步骤选项生成（分发器）
     
     def _get_valid_options_for_step(self, step_recipe: Step, current_label: 
str, 
                                     current_type: str, remaining_steps: 
List[Step] = None) -> List[Dict]:
@@ -489,7 +484,7 @@ class TraversalGenerator:
         print(f"⚠️  未知步骤: {step_name}")
         return []
     
-    # ==================== A. 简单步骤处理器 ====================
+    #   A. 简单步骤处理器  
     
     def _handle_simple_step(self, step_name: str, current_label: str, 
                            current_type: str) -> List[Dict]:
@@ -521,7 +516,7 @@ class TraversalGenerator:
             'new_type': new_type
         }]
     
-    # ==================== F. 转换步骤处理器 ====================
+    #   F. 转换步骤处理器  
     
     def _handle_transform_step(self, step_recipe: Step, current_label: str,
                                current_type: str) -> List[Dict]:
@@ -652,7 +647,7 @@ class TraversalGenerator:
         
         return options
     
-    # ==================== G. 聚合步骤处理器 ====================
+    #   G. 聚合步骤处理器  
     
     def _handle_aggregate_step(self, step_recipe: Step, current_label: str,
                                current_type: str) -> List[Dict]:
@@ -696,7 +691,7 @@ class TraversalGenerator:
                 'new_type': new_type
             }]
     
-    # ==================== G2. 副作用步骤处理器 ====================
+    #   G2. 副作用步骤处理器  
     
     def _handle_side_effect_step(self, step_recipe: Step, current_label: str,
                                  current_type: str) -> List[Dict]:
@@ -783,7 +778,7 @@ class TraversalGenerator:
                 'new_type': new_type
             }]
     
-    # ==================== H. 终端步骤处理器 ====================
+    #   H. 终端步骤处理器  
     
     def _handle_terminal_step(self, step_recipe: Step, current_label: str,
                              current_type: str) -> List[Dict]:
@@ -830,7 +825,7 @@ class TraversalGenerator:
             'new_type': new_type
         }]
     
-    # ==================== C. 数值参数步骤处理器 ====================
+    #   C. 数值参数步骤处理器  
     
     def _handle_numeric_param_step(self, step_name: str, params: List, 
                                    current_label: str, current_type: str) -> 
List[Dict]:
@@ -870,7 +865,7 @@ class TraversalGenerator:
             'new_type': current_type
         }]
     
-    # ==================== K. 图算法步骤处理器 ====================
+    #   K. 图算法步骤处理器  
     
     def _handle_graph_algorithm_step(self, step_name: str, current_label: str,
                                      current_type: str) -> List[Dict]:
@@ -897,7 +892,7 @@ class TraversalGenerator:
             'new_type': current_type
         }]
     
-    # ==================== L. 工具步骤处理器 ====================
+    #   L. 工具步骤处理器  
     
     def _handle_utility_step(self, step_recipe: Step, current_label: str,
                             current_type: str) -> List[Dict]:
@@ -995,7 +990,7 @@ class TraversalGenerator:
             'new_type': new_type
         }]
     
-    # ==================== M. 边修改步骤处理器 ====================
+    #   I. 边修改步骤处理器
     
     def _handle_edge_modification_step(self, step_recipe: Step, current_label: 
str,
                                       current_type: str) -> List[Dict]:
@@ -1058,7 +1053,7 @@ class TraversalGenerator:
             'new_type': current_type
         }]
     
-    # ==================== B. 属性访问步骤处理器 ====================
+    #   B. 属性访问步骤处理器  
     
     def _handle_property_access_step(self, step_recipe: Step, current_label: 
str,
                                     current_type: str) -> List[Dict]:
@@ -1244,7 +1239,7 @@ class TraversalGenerator:
         
         return options
     
-    # ==================== 嵌套遍历泛化辅助方法 ====================
+    #   嵌套遍历泛化辅助方法  
     
     def _generate_nested_traversal_variants(self, anonymous_trav, 
current_depth=0):
         """
@@ -1450,7 +1445,7 @@ class TraversalGenerator:
         else:
             return "..."
     
-    # ==================== E. 过滤步骤处理器 ====================
+    #   E. 过滤步骤处理器  
     
     def _handle_filter_step(self, step_recipe: Step, current_label: str,
                            current_type: str, remaining_steps: List[Step]) -> 
List[Dict]:
@@ -1619,18 +1614,27 @@ class TraversalGenerator:
                             # has('name', 'Tom', 'Jerry') - 多个值
                             if self.controller:
                                 fill_times = 
self.controller.get_multi_param_value_fill_count(
-                                    param_count=value_count,
                                     is_terminal=is_terminal_step
                                 )
                             else:
                                 fill_times = 1
                             
-                            # 生成多次填充
-                            for _ in range(fill_times):
-                                if len(all_values) >= value_count:
-                                    selected_combo = random.sample(all_values, 
value_count)
-                                    values_str = ", ".join(repr(v) for v in 
selected_combo)
-                                    prop_desc = 
self.gremlin_base.get_schema_desc(prop_name)
+                            # fill_times is only an upper bound: combos are de-duplicated via a set and attempts are capped, so fewer may be produced
+                            if len(all_values) >= value_count:
+                                # 使用集合去重，避免生成重复组合
+                                generated_combos = set()
+                                attempts = 0
+                                max_attempts = fill_times * 10  # 避免无限循环
+                                
+                                while len(generated_combos) < fill_times and attempts < max_attempts:
+                                    selected_combo = tuple(sorted(random.sample(all_values, value_count)))
+                                    generated_combos.add(selected_combo)
+                                    attempts += 1
+                                
+                                # 生成查询选项
+                                prop_desc = 
self.gremlin_base.get_schema_desc(prop_name)
+                                for combo in generated_combos:
+                                    values_str = ", ".join(repr(v) for v in 
combo)
                                     options.append({
                                         'query_part': f".has('{prop_name}', 
{values_str})",
                                         'desc_part': 
f"，其'{prop_desc}'为{values_str}之一",
@@ -1693,17 +1697,27 @@ class TraversalGenerator:
                             # has('name', 'Tom', 'Jerry') - 多个值
                             if self.controller:
                                 fill_times = 
self.controller.get_multi_param_value_fill_count(
-                                    param_count=value_count,
                                     is_terminal=is_terminal_step
                                 )
                             else:
                                 fill_times = 1
                             
-                            for _ in range(fill_times):
-                                if len(all_values) >= value_count:
-                                    selected_combo = random.sample(all_values, 
value_count)
-                                    values_str = ", ".join(repr(v) for v in 
selected_combo)
-                                    prop_desc = 
self.gremlin_base.get_schema_desc(recipe_prop)
+                            # 仅在候选值数量足够时生成组合；去重后实际得到的组合数可能少于 fill_times
+                            if len(all_values) >= value_count:
+                                # 使用集合去重，避免生成重复组合
+                                generated_combos = set()
+                                attempts = 0
+                                max_attempts = fill_times * 10  # 避免无限循环
+                                
+                                while len(generated_combos) < fill_times and attempts < max_attempts:
+                                    selected_combo = tuple(sorted(random.sample(all_values, value_count)))
+                                    generated_combos.add(selected_combo)
+                                    attempts += 1
+                                
+                                # 生成查询选项
+                                prop_desc = self.gremlin_base.get_schema_desc(recipe_prop)
+                                for combo in generated_combos:
+                                    values_str = ", ".join(repr(v) for v in combo)
                                     options.append({
                                         'query_part': f".has('{recipe_prop}', {values_str})",
                                         'desc_part': f"，其'{prop_desc}'为{values_str}之一",
@@ -2148,7 +2162,7 @@ class TraversalGenerator:
         
         return options
     
-    # ==================== D. 导航步骤处理器 ====================
+    #   D. 导航步骤处理器  
     
     def _handle_navigation_step(self, step_recipe: Step, current_label: str,
                                current_type: str) -> List[Dict]:
@@ -2333,7 +2347,7 @@ class TraversalGenerator:
         
         return options
     
-    # ==================== J. 特殊步骤处理器 ====================
+    #   J. 特殊步骤处理器  
     
     def _handle_special_step(self, step_recipe: Step, current_label: str,
                             current_type: str, remaining_steps: List[Step]) -> 
List[Dict]:
@@ -3165,7 +3179,7 @@ class TraversalGenerator:
         
         return results
     
-    # ==================== 辅助方法 ====================
+    #   辅助方法  
     
     def _get_random_value(self, label: str, prop_info: Dict, for_update: bool 
= False) -> Any:
         """根据属性类型生成随机值"""
diff --git a/text2gremlin/AST_Text2Gremlin/base/__init__.py 
b/text2gremlin/AST_Text2Gremlin/base/__init__.py
index 89ee1a95..9d86256b 100644
--- a/text2gremlin/AST_Text2Gremlin/base/__init__.py
+++ b/text2gremlin/AST_Text2Gremlin/base/__init__.py
@@ -1,19 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
 """
 Gremlin 查询生成器包
 
 这个包提供了从模板生成 Gremlin 查询语料库的功能。
-
-主要模块:
-- generator: 主要的生成器接口
-- Config: 配置管理
-- Schema: 图数据库模式定义
-- TraversalGenerator: 遍历查询生成器
-- GremlinTransVisitor: Gremlin 语法解析器
 """
-
-__version__ = "1.0.0"
-
-# 导出主要接口
 from .generator import generate_gremlin_corpus
 
 __all__ = ['generate_gremlin_corpus']
diff --git a/text2gremlin/AST_Text2Gremlin/base/generator.py 
b/text2gremlin/AST_Text2Gremlin/base/generator.py
index 856d46e0..0c1d78cf 100644
--- a/text2gremlin/AST_Text2Gremlin/base/generator.py
+++ b/text2gremlin/AST_Text2Gremlin/base/generator.py
@@ -1,15 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
 """
-控制层-Gremlin语料库生成器主入口。
+Gremlin语料库生成器主入口脚本。
 
 从Gremlin查询模板生成大量多样化的查询-描述对，用于Text-to-Gremlin任务的训练数据。
 """
 
 import os
 import json
+from datetime import datetime
 from antlr4 import InputStream, CommonTokenStream
 from antlr4.error.ErrorListener import ErrorListener
 
-# Import all our custom modules from the gremlin_base package
 from .Config import Config
 from .Schema import Schema
 from .GremlinBase import GremlinBase
@@ -17,7 +35,6 @@ from .GremlinParse import Traversal
 from .TraversalGenerator import TraversalGenerator
 from .GremlinTransVisitor import GremlinTransVisitor
 
-# Import the ANTLR-generated components
 from .gremlin.GremlinLexer import GremlinLexer
 from .gremlin.GremlinParser import GremlinParser
 import random
@@ -54,10 +71,12 @@ def check_gremlin_syntax(query_string: str) -> tuple[bool, str]:
         parser = GremlinParser(token_stream)
         
         # 移除默认的控制台错误监听器
+        lexer.removeErrorListeners()
         parser.removeErrorListeners()
         
         # 添加自定义的监听器
         error_listener = SyntaxErrorListener()
+        lexer.addErrorListener(error_listener)
         parser.addErrorListener(error_listener)
         
         # 尝试解析查询
@@ -135,20 +154,21 @@ def generate_corpus_from_template(
         
         for query, description in corpus:
             try:
-                # 首先进行语法检查
+                # 先判重，避免对重复项做语法检查
+                if query in global_corpus_dict:
+                    duplicate_count += 1
+                    continue
+                
+                # 再进行语法检查
                 is_valid, error_msg = check_gremlin_syntax(query)
                 
                 if not is_valid:
                     syntax_error_count += 1
                     continue
-                    
-                if query not in global_corpus_dict:
-                    # 新的查询且语法正确，添加到全局字典
-                    global_corpus_dict[query] = description
-                    new_pairs_count += 1
-                else:
-                    # 重复的查询，跳过
-                    duplicate_count += 1
+                
+                # 新的查询且语法正确，添加到全局字典
+                global_corpus_dict[query] = description
+                new_pairs_count += 1
                     
             except Exception as e:
                 syntax_error_count += 1
@@ -179,18 +199,18 @@ def generate_gremlin_corpus(templates: list[str],
                            config_path: str, 
                            schema_path: str, 
                            data_path: str,
-                           output_file: str = None,
-                           num_queries: int = 100) -> dict:
+                           output_file: str = None) -> dict:
     """
     从Gremlin模板列表生成完整的语料库。
     
+    查询数量由 combination_control_config.json 中的 max_total_combinations 控制。
+    
     Args:
         templates: Gremlin查询模板列表或CSV文件路径
         config_path: 配置文件路径（必需）
         schema_path: Schema文件路径（必需）
         data_path: 数据文件路径（必需）
         output_file: 输出文件名（可选）
-        num_queries: 每个模板生成的查询数量（默认100）
         
     Returns:
         包含生成统计信息的字典
@@ -300,9 +320,12 @@ def generate_gremlin_corpus(templates: list[str],
     full_corpus = [(query, desc) for query, desc in global_corpus_dict.items()]
     
     # --- Save the full corpus to a local file (if output_file is provided) ---
-    from datetime import datetime
-    
     if output_file:
+        # 确保输出目录存在
+        out_dir = os.path.dirname(os.path.abspath(output_file))
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        
         # 确保只保存成功生成的查询-描述对
         corpus_data = {
             "metadata": {
diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinLexer.py 
b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinLexer.py
index 75d8b68b..99ad9e37 100644
--- a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinLexer.py
+++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinLexer.py
@@ -1,4 +1,22 @@
-# Generated from /root/lzj/ospp/Gremlin_Antlr4/Gremlin.g4 by ANTLR 4.13.1
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# Generated from ./Gremlin.g4 by ANTLR 4.13.1
 from antlr4 import *
 from io import StringIO
 import sys
diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinListener.py 
b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinListener.py
index b9782b1b..c3a22ff8 100644
--- a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinListener.py
+++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinListener.py
@@ -1,4 +1,22 @@
-# Generated from /root/lzj/ospp/Gremlin_Antlr4/Gremlin.g4 by ANTLR 4.13.1
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# Generated from ./Gremlin.g4 by ANTLR 4.13.1
 from antlr4 import *
 if "." in __name__:
     from .GremlinParser import GremlinParser
diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinParser.py 
b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinParser.py
index 83ff7ca1..57745b98 100644
--- a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinParser.py
+++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinParser.py
@@ -1,4 +1,22 @@
-# Generated from /root/lzj/ospp/Gremlin_Antlr4/Gremlin.g4 by ANTLR 4.13.1
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# Generated from ./Gremlin.g4 by ANTLR 4.13.1
 # encoding: utf-8
 from antlr4 import *
 from io import StringIO
diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinVisitor.py 
b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinVisitor.py
index f7684135..06e3f5a3 100644
--- a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinVisitor.py
+++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinVisitor.py
@@ -1,4 +1,22 @@
-# Generated from /root/lzj/ospp/Gremlin_Antlr4/Gremlin.g4 by ANTLR 4.13.1
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# Generated from ./Gremlin.g4 by ANTLR 4.13.1
 from antlr4 import *
 if "." in __name__:
     from .GremlinParser import GremlinParser
diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/__init__.py 
b/text2gremlin/AST_Text2Gremlin/base/gremlin/__init__.py
index e69de29b..13a83393 100644
--- a/text2gremlin/AST_Text2Gremlin/base/gremlin/__init__.py
+++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git 
a/text2gremlin/AST_Text2Gremlin/base/gremlin/antlr-4.13.1-complete.jar 
b/text2gremlin/AST_Text2Gremlin/base/gremlin/antlr-4.13.1-complete.jar
deleted file mode 100644
index f539ab04..00000000
Binary files 
a/text2gremlin/AST_Text2Gremlin/base/gremlin/antlr-4.13.1-complete.jar and 
/dev/null differ
diff --git a/text2gremlin/AST_Text2Gremlin/config.json 
b/text2gremlin/AST_Text2Gremlin/config.json
index 3eae0490..43b311b9 100644
--- a/text2gremlin/AST_Text2Gremlin/config.json
+++ b/text2gremlin/AST_Text2Gremlin/config.json
@@ -7,9 +7,9 @@
     "data_path": "./db_data/",
     "templates_file": "gremlin_templates.csv",
     "output_dir": "output",
-    "num_queries_per_template": 100,
     "schema_dict_path": [
         "./base/template/schema_dict.txt"
-    ]
+    ],
+    "_note": "查询数量由 combination_control_config.json 中的 max_total_combinations 控制"
 }
 
diff --git a/text2gremlin/AST_Text2Gremlin/generate_corpus.py 
b/text2gremlin/AST_Text2Gremlin/generate_corpus.py
new file mode 100644
index 00000000..1438f4bf
--- /dev/null
+++ b/text2gremlin/AST_Text2Gremlin/generate_corpus.py
@@ -0,0 +1,209 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""
+Gremlin 查询语料库生成脚本
+
+从模板生成 Gremlin 查询语料库的命令行工具。
+
+用法:
+    # 使用默认配置（推荐）
+    python generate_corpus.py
+    
+    # 自定义参数
+    python generate_corpus.py --templates my_templates.csv --output my_corpus.json
+"""
+
+import argparse
+import sys
+import os
+import json
+from pathlib import Path
+from datetime import datetime
+
+# 添加 base 包到 Python 路径
+sys.path.insert(0, str(Path(__file__).parent))
+
+from base import generate_gremlin_corpus
+
+
+def load_config(config_path='config.json'):
+    """加载配置文件"""
+    try:
+        with open(config_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except Exception as e:
+        print(f"⚠️  警告: 无法加载配置文件 {config_path}: {e}")
+        return {}
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='生成 Gremlin 查询语料库',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例:
+  # 使用默认配置
+  python generate_corpus.py
+  
+  # 使用自定义模板文件
+  python generate_corpus.py --templates my_templates.csv
+  
+  # 使用自定义配置文件
+  python generate_corpus.py --config my_config.json
+  
+  # 完全自定义
+  python generate_corpus.py --templates templates.csv --schema schema.json --data data/ --output output.json
+
+配置说明:
+  config.json 中的配置项：
+  - templates_file: 模板文件路径（默认: gremlin_templates.csv）
+  - db_schema_path: schema 文件路径
+  - data_path: 数据目录路径（默认: db_data/）
+  - output_dir: 输出目录（默认: output）
+  
+  查询数量控制:
+  - 由 combination_control_config.json 中的 max_total_combinations 控制
+  - 根据查询复杂度自动调整（short/medium/long/ultra）
+
+注意:
+  - 输出文件自动命名为 output/generated_corpus_YYYYMMDD_HHMMSS.json
+  - 每次运行生成新文件，不会覆盖旧文件
+        """
+    )
+    
+    parser.add_argument(
+        '--config',
+        default='config.json',
+        help='配置文件路径 (JSON格式，默认: config.json)'
+    )
+    
+    parser.add_argument(
+        '--templates', 
+        help='模板文件路径 (CSV格式，默认从 config.json 读取)'
+    )
+    
+    parser.add_argument(
+        '--schema',
+        help='图数据库模式文件路径 (JSON格式，默认从 config.json 读取)'
+    )
+    
+    parser.add_argument(
+        '--data',
+        help='数据目录路径 (默认从 config.json 读取)'
+    )
+    
+    parser.add_argument(
+        '--output',
+        help='输出文件路径 (JSON格式，默认: output/generated_corpus_YYYYMMDD_HHMMSS.json)'
+    )
+    
+
+    
+    args = parser.parse_args()
+    
+    # 加载配置文件
+    config = load_config(args.config)
+    
+    # 从配置文件或命令行参数获取值（命令行参数优先）
+    templates_file = args.templates or config.get('templates_file', 'gremlin_templates.csv')
+    db_id = config.get('db_id', 'movie')
+    schema_path = args.schema or config.get('db_schema_path', {}).get(db_id, 'db_data/schema/movie_schema.json')
+    data_path = args.data or config.get('data_path', 'db_data/')
+    output_dir = config.get('output_dir', 'output')
+    
+    # 更新 args 对象
+    args.templates = templates_file
+    args.schema = schema_path
+    args.data = data_path
+    
+    # 验证输入文件
+    if not os.path.exists(args.templates):
+        print(f"❌ 错误: 模板文件不存在: {args.templates}")
+        print(f"💡 提示: 请创建 {args.templates} 文件，或使用 --templates 指定其他文件")
+        sys.exit(1)
+    
+    if not os.path.exists(args.config):
+        print(f"❌ 错误: 配置文件不存在: {args.config}")
+        sys.exit(1)
+    
+    if not os.path.exists(args.schema):
+        print(f"❌ 错误: 模式文件不存在: {args.schema}")
+        sys.exit(1)
+    
+    if not os.path.exists(args.data):
+        print(f"❌ 错误: 数据目录不存在: {args.data}")
+        sys.exit(1)
+    
+    # 如果没有指定输出文件，使用默认路径
+    if not args.output:
+        # 确保输出目录存在
+        os.makedirs(output_dir, exist_ok=True)
+        
+        # 生成带时间戳的文件名
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        args.output = f'{output_dir}/generated_corpus_{timestamp}.json'
+    
+    try:
+        print("=" * 60)
+        print("🚀 Gremlin 查询语料库生成器")
+        print("=" * 60)
+        print(f"\n📋 配置信息:")
+        print(f"  模板文件: {args.templates}")
+        print(f"  配置文件: {args.config}")
+        print(f"  模式文件: {args.schema}")
+        print(f"  数据目录: {args.data}")
+        print(f"  输出文件: {args.output}")
+        
+        print("\n" + "-" * 60)
+        
+        # 调用生成器
+        result = generate_gremlin_corpus(
+            templates=args.templates,
+            config_path=args.config,
+            schema_path=args.schema,
+            data_path=args.data,
+            output_file=args.output
+        )
+        
+        print("\n" + "=" * 60)
+        print("✅ 生成完成！")
+        print("=" * 60)
+        print(f"\n📊 统计信息:")
+        print(f"  总模板数: {result['total_templates']}")
+        print(f"  成功处理: {result['successful_templates']}")
+        print(f"  处理失败: {result['failed_templates']}")
+        print(f"  生成查询数: {result['total_unique_queries']}")
+        
+        if 'output_file' in result:
+            print(f"\n💾 结果已保存到: {result['output_file']}")
+            print(f"\n💡 提示:")
+            print(f"  - 可以在 {args.templates} 中添加更多模板")
+            print(f"  - 查询数量由 combination_control_config.json 控制")
+        else:
+            print(f"\n生成了 {len(result['queries'])} 个查询 (未保存到文件)")
+        
+    except Exception as e:
+        print(f"❌ 错误: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/text2gremlin/AST_Text2Gremlin/output/README.md 
b/text2gremlin/AST_Text2Gremlin/output/README.md
deleted file mode 100644
index 4513e94a..00000000
--- a/text2gremlin/AST_Text2Gremlin/output/README.md
+++ /dev/null
@@ -1,74 +0,0 @@
-# Output 目录
-
-这个目录用于存放生成的 Gremlin 查询语料库文件。
-
-## 文件命名规则
-
-生成的文件会自动命名为：
-```
-generated_corpus_YYYYMMDD_HHMMSS.json
-```
-
-例如：
-- `generated_corpus_20251029_143025.json`
-- `generated_corpus_20251029_150130.json`
-
-## 文件格式
-
-每个生成的 JSON 文件包含：
-
-```json
-{
-    "metadata": {
-        "total_templates": 10,
-        "successful_templates": 9,
-        "failed_templates": 1,
-        "total_unique_queries": 450,
-        "generation_timestamp": "2025-10-29 14:30:25"
-    },
-    "corpus": [
-        {
-            "query": "g.V().hasLabel('person').has('name', 'Tom Hanks')",
-            "description": "从图中开始，并筛选出标签为 'person' 的元素，并筛选出属性 'name' 为 'Tom 
Hanks' 的元素"
-        }
-    ]
-}
-```
-
-## 使用方式
-
-### 生成新的语料库
-
-```bash
-# 使用默认配置
-python generate_corpus.py
-
-# 指定生成数量
-python generate_corpus.py --num-queries 50
-```
-
-### 查看生成的文件
-
-```bash
-# 列出所有生成的文件
-ls -lh output/
-
-# 查看最新生成的文件
-ls -t output/ | head -1
-```
-
-### 清理旧文件
-
-```bash
-# 删除所有生成的文件
-rm output/generated_corpus_*.json
-
-# 只保留最新的 5 个文件
-ls -t output/generated_corpus_*.json | tail -n +6 | xargs rm
-```
-
-## 注意事项
-
-- 每次运行 `generate_corpus.py` 都会生成一个新文件
-- 文件不会自动覆盖，需要手动清理旧文件
-- 建议定期清理不需要的文件以节省空间
diff --git a/text2gremlin/AST_Text2Gremlin/output/SYNTAX_ANALYSIS_SUMMARY.md 
b/text2gremlin/AST_Text2Gremlin/output/SYNTAX_ANALYSIS_SUMMARY.md
deleted file mode 100644
index 1abf4de9..00000000
--- a/text2gremlin/AST_Text2Gremlin/output/SYNTAX_ANALYSIS_SUMMARY.md
+++ /dev/null
@@ -1,178 +0,0 @@
-# Gremlin 语法分析总结
-
-## 📊 核心发现
-
-基于对 **1,493 个查询** 的深度分析，我们使用 ANTLR 解析器对每个查询进行了语法树解析，统计了 **7,353 个步骤** 的分布情况。
-
----
-
-## 🎯 关键数据
-
-### 整体统计
-- **总查询数**: 1,493
-- **总步骤数**: 7,353
-- **不同步骤类型**: 76 种
-- **平均每查询步骤数**: 4.92
-- **谓词使用**: 154 次（3 种类型）
-
-### 集中度分析
-- **前 3 个步骤** 覆盖 **50%** 的使用
-- **前 10 个步骤** 覆盖 **80%** 的使用
-- **前 20 个步骤** 覆盖 **92.65%** 的使用
-- **前 42 个步骤** 覆盖 **99%** 的使用
-
----
-
-## 🏆 Top 10 最常用步骤
-
-| 排名 | 步骤 | 次数 | 占比 | 典型用法 |
-|------|------|------|------|----------|
-| 1 | `hasLabel` | 1,485 | 20.20% | `g.V().hasLabel('movie')` |
-| 2 | `V` | 1,482 | 20.16% | `g.V()` |
-| 3 | `out` | 1,202 | 16.35% | `.out('acted_in')` |
-| 4 | `in` | 475 | 6.46% | `.in('has_genre')` |
-| 5 | `dedup` | 302 | 4.11% | `.dedup()` |
-| 6 | `by` | 259 | 3.52% | `.order().by('name')` |
-| 7 | `as` | 254 | 3.45% | `.as('movie')` |
-| 8 | `has` | 209 | 2.84% | `.has('name', 'Tom')` |
-| 9 | `groupCount` | 182 | 2.48% | `.groupCount()` |
-| 10 | `where` | 147 | 2.00% | `.where(P.neq('m'))` |
-
----
-
-## 📈 步骤分类占比
-
-```
-过滤步骤 ████████████████████████████████ 29.63%
-图遍历   ████████████████████████ 23.47%
-起始步骤 ████████████████████ 20.17%
-辅助步骤 ███████ 7.28%
-排序限制 ████ 4.26%
-聚合统计 ███ 3.59%
-投影转换 ███ 3.33%
-分支条件 ██ 2.50%
-循环     ██ 2.30%
-其他     ███ 3.47%
-```
-
----
-
-## 🔍 深度分析
-
-### 1. 查询起始模式
-- **99.26%** 的查询从 `g.V()` 开始
-- 仅 **0.07%** 从 `g.E()` 开始
-- 说明：**顶点中心的图遍历是主流模式**
-
-### 2. 过滤策略
-- `hasLabel` 几乎是必备步骤（99.46% 的查询使用）
-- `has` 用于属性过滤（14.00% 的查询使用）
-- `dedup` 去重频繁（20.23% 的查询使用）
-- 说明：**类型过滤 + 属性过滤 + 去重是标准三件套**
-
-### 3. 遍历方向偏好
-- `out` : `in` = **2.53 : 1**
-- 出边遍历远多于入边遍历
-- 说明：**查询更关注"从哪里出发"而非"从哪里来"**
-
-### 4. 聚合分析需求
-- `groupCount` 是最常用的聚合操作（182 次）
-- `count`, `sum`, `mean` 等基础统计也有使用
-- 说明：**分组统计是重要的分析需求**
-
-### 5. 复杂查询特征
-- **标记引用**: `as` (254) + `where` (147) 组合用于复杂关联
-- **循环遍历**: `repeat` (76) + `times` (39) 用于多跳查询
-- **分支逻辑**: `union` (115) + `coalesce` (51) 用于多路径探索
-- 说明：**支持复杂的图分析场景**
-
-### 6. 谓词使用模式
-- `neq` (不等于) 占 **69.48%**，主要用于 `where(P.neq('m'))` 排除自身
-- `within` (在集合内) 占 **22.08%**，用于集合成员判断
-- `gt` (大于) 占 **8.44%**，用于数值比较
-- 说明：**排除模式是最常见的过滤需求**
-
----
-
-## 💡 实践建议
-
-### 对于查询优化
-1. **优先优化高频步骤**: `hasLabel`, `V`, `out` 的性能直接影响整体
-2. **索引策略**: 为 `hasLabel` 和 `has` 建立索引
-3. **去重优化**: `dedup` 使用频繁，需要高效的去重算法
-4. **出边优化**: `out` 步骤是性能瓶颈，考虑邻接表优化
-
-### 对于测试覆盖
-1. **核心路径**: 重点测试 `V().hasLabel().out()` 组合
-2. **过滤场景**: 覆盖各种 `has` 和 `where` 的组合
-3. **聚合操作**: 确保 `groupCount` 在各种场景下正确
-4. **谓词测试**: 重点测试 `neq`, `within`, `gt`
-
-### 对于功能开发
-1. **高优先级**: 前 20 个步骤（覆盖 92.65%）
-2. **中优先级**: 21-42 个步骤（覆盖 6.35%）
-3. **低优先级**: 43-76 个步骤（覆盖 1%）
-4. **长尾支持**: 虽然使用少，但要确保功能完整
-
-### 对于文档编写
-1. **入门教程**: 重点讲解前 10 个步骤
-2. **进阶教程**: 覆盖前 30 个步骤的组合使用
-3. **高级特性**: 介绍循环、分支、聚合等复杂功能
-4. **完整参考**: 提供所有 76 个步骤的详细文档
-
----
-
-## 📁 相关文件
-
-- **统计数据**: `output/syntax_distribution_stats.json`
-- **详细报告**: `output/SYNTAX_DISTRIBUTION_REPORT.md`
-- **分析脚本**: `analyze_syntax_distribution.py`
-- **可视化脚本**: `visualize_syntax_distribution.py`
-- **源语料库**: `output/generated_corpus_20251029_190729.json`
-
----
-
-## 🔬 分析方法
-
-本分析使用 **ANTLR 解析器** 对每个 Gremlin 查询进行语法树解析，而非简单的字符串匹配：
-
-1. **词法分析**: 使用 `GremlinLexer` 将查询字符串分解为 token
-2. **语法分析**: 使用 `GremlinParser` 构建抽象语法树（AST）
-3. **语义分析**: 使用 `GremlinTransVisitor` 遍历 AST 提取步骤和谓词
-4. **统计汇总**: 对提取的语法元素进行计数和分类
-
-这种方法的优势：
-- ✅ **准确识别**: 能准确区分步骤名称和参数
-- ✅ **处理嵌套**: 能正确处理嵌套遍历和匿名遍历
-- ✅ **谓词提取**: 能识别谓词类型而不受参数影响
-- ✅ **语法验证**: 只统计语法正确的查询
-
----
-
-## 📊 数据质量
-
-- **解析成功率**: 100% (1,493/1,493)
-- **步骤识别**: 7,353 个步骤全部正确识别
-- **谓词识别**: 154 个谓词全部正确分类
-- **分析时间**: < 5 秒
-
----
-
-## 🎓 结论
-
-通过对 1,493 个 Gremlin 查询的深度分析，我们发现：
-
-1. **查询模式高度集中**: 前 20 个步骤覆盖 92.65% 的使用
-2. **顶点遍历为主**: 99.26% 的查询从 `g.V()` 开始
-3. **过滤是核心**: 过滤步骤占总步骤数的 29.63%
-4. **出边优先**: 出边遍历是入边遍历的 2.5 倍
-5. **分析需求强**: 分组统计和聚合操作使用频繁
-6. **复杂查询支持**: 循环、分支、标记引用等高级特性都有使用
-
-这些发现为 Gremlin 查询引擎的优化、测试用例设计、文档编写提供了数据支持。
-
----
-
-**生成时间**: 2025-10-29  
-**分析工具**: ANTLR + Python  
-**数据来源**: 泛化生成的 Gremlin 查询语料库
diff --git a/text2gremlin/AST_Text2Gremlin/output/SYNTAX_DISTRIBUTION_REPORT.md 
b/text2gremlin/AST_Text2Gremlin/output/SYNTAX_DISTRIBUTION_REPORT.md
deleted file mode 100644
index 465e8582..00000000
--- a/text2gremlin/AST_Text2Gremlin/output/SYNTAX_DISTRIBUTION_REPORT.md
+++ /dev/null
@@ -1,277 +0,0 @@
-# Gremlin 语法词汇分布统计报告
-
-## 📊 总体统计
-
-| 指标 | 数值 |
-|------|------|
-| 总查询数 | 1,493 |
-| 总步骤数 | 7,353 |
-| 不同步骤类型数 | 76 |
-| 总谓词数 | 154 |
-| 不同谓词类型数 | 3 |
-| 平均每个查询的步骤数 | 4.92 |
-
----
-
-## 🔝 Top 20 最常用步骤
-
-| 排名 | 步骤名称 | 出现次数 | 占比 | 累计占比 |
-|------|----------|----------|------|----------|
-| 1 | `hasLabel` | 1,485 | 20.20% | 20.20% |
-| 2 | `V` | 1,482 | 20.16% | 40.36% |
-| 3 | `out` | 1,202 | 16.35% | 56.71% |
-| 4 | `in` | 475 | 6.46% | 63.17% |
-| 5 | `dedup` | 302 | 4.11% | 67.28% |
-| 6 | `by` | 259 | 3.52% | 70.80% |
-| 7 | `as` | 254 | 3.45% | 74.25% |
-| 8 | `has` | 209 | 2.84% | 77.09% |
-| 9 | `groupCount` | 182 | 2.48% | 79.57% |
-| 10 | `where` | 147 | 2.00% | 81.57% |
-| 11 | `order` | 126 | 1.71% | 83.28% |
-| 12 | `limit` | 116 | 1.58% | 84.86% |
-| 13 | `union` | 115 | 1.56% | 86.42% |
-| 14 | `values` | 109 | 1.48% | 87.90% |
-| 15 | `aggregate` | 78 | 1.06% | 88.96% |
-| 16 | `repeat` | 76 | 1.03% | 89.99% |
-| 17 | `path` | 54 | 0.73% | 90.72% |
-| 18 | `coalesce` | 51 | 0.69% | 91.41% |
-| 19 | `valueMap` | 47 | 0.64% | 92.05% |
-| 20 | `select` | 44 | 0.60% | 92.65% |
-
-**分析**: 前 20 个步骤占总步骤数的 **92.65%**，说明查询模式相对集中。
-
----
-
-## 📈 步骤分类统计
-
-### 图遍历起始步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `V` | 1,482 | 从顶点开始遍历 |
-| `E` | 1 | 从边开始遍历 |
-
-### 过滤步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `hasLabel` | 1,485 | 按标签过滤 |
-| `has` | 209 | 按属性过滤 |
-| `hasId` | 5 | 按ID过滤 |
-| `hasKey` | 6 | 按键过滤 |
-| `hasValue` | 2 | 按值过滤 |
-| `where` | 147 | 条件过滤 |
-| `filter` | 1 | 自定义过滤 |
-| `is` | 1 | 值比较过滤 |
-| `dedup` | 302 | 去重 |
-| `simplePath` | 20 | 简单路径过滤 |
-| `cyclicPath` | 1 | 循环路径过滤 |
-
-**小计**: 2,179 次 (29.63%)
-
-### 图遍历步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `out` | 1,202 | 出边遍历 |
-| `in` | 475 | 入边遍历 |
-| `both` | 2 | 双向遍历 |
-| `outE` | 20 | 出边 |
-| `inE` | 23 | 入边 |
-| `bothE` | 1 | 双向边 |
-| `outV` | 1 | 出顶点 |
-| `inV` | 1 | 入顶点 |
-| `otherV` | 1 | 另一端顶点 |
-
-**小计**: 1,726 次 (23.47%)
-
-### 聚合统计步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `groupCount` | 182 | 分组计数 |
-| `count` | 22 | 计数 |
-| `sum` | 21 | 求和 |
-| `mean` | 6 | 平均值 |
-| `max` | 2 | 最大值 |
-| `min` | 1 | 最小值 |
-| `fold` | 29 | 折叠为列表 |
-| `unfold` | 1 | 展开列表 |
-
-**小计**: 264 次 (3.59%)
-
-### 排序和限制步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `order` | 126 | 排序 |
-| `limit` | 116 | 限制数量 |
-| `range` | 38 | 范围选择 |
-| `skip` | 1 | 跳过 |
-| `tail` | 1 | 取尾部 |
-| `sample` | 30 | 随机采样 |
-| `coin` | 1 | 随机过滤 |
-
-**小计**: 313 次 (4.26%)
-
-### 投影和转换步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `values` | 109 | 获取属性值 |
-| `valueMap` | 47 | 获取属性映射 |
-| `elementMap` | 2 | 获取元素映射 |
-| `properties` | 19 | 获取属性对象 |
-| `project` | 19 | 投影 |
-| `select` | 44 | 选择 |
-| `label` | 2 | 获取标签 |
-| `id` | 1 | 获取ID |
-| `constant` | 1 | 常量值 |
-| `identity` | 1 | 恒等变换 |
-
-**小计**: 245 次 (3.33%)
-
-### 分支和条件步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `union` | 115 | 联合多个遍历 |
-| `coalesce` | 51 | 合并（返回第一个非空） |
-| `choose` | 10 | 条件分支 |
-| `optional` | 8 | 可选遍历 |
-
-**小计**: 184 次 (2.50%)
-
-### 循环步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `repeat` | 76 | 重复遍历 |
-| `times` | 39 | 重复次数 |
-| `until` | 22 | 直到条件满足 |
-| `emit` | 32 | 发射中间结果 |
-
-**小计**: 169 次 (2.30%)
-
-### 路径步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `path` | 54 | 获取路径 |
-| `tree` | 18 | 树形结构 |
-
-**小计**: 72 次 (0.98%)
-
-### 副作用步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `aggregate` | 78 | 聚合到集合 |
-| `store` | 2 | 存储 |
-| `sideEffect` | 19 | 副作用 |
-| `group` | 7 | 分组 |
-| `cap` | 2 | 获取副作用值 |
-
-**小计**: 108 次 (1.47%)
-
-### 修改步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `addV` | 10 | 添加顶点 |
-| `property` | 33 | 设置属性 |
-| `drop` | 19 | 删除元素 |
-
-**小计**: 62 次 (0.84%)
-
-### 逻辑步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `and` | 5 | 逻辑与 |
-| `or` | 2 | 逻辑或 |
-| `not` | 3 | 逻辑非 |
-
-**小计**: 10 次 (0.14%)
-
-### 辅助步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `as` | 254 | 标记步骤 |
-| `by` | 259 | 修饰符（用于排序、分组等） |
-| `map` | 6 | 映射变换 |
-| `flatMap` | 15 | 扁平映射 |
-| `barrier` | 1 | 屏障 |
-
-**小计**: 535 次 (7.28%)
-
-### 终端步骤
-| 步骤 | 次数 | 说明 |
-|------|------|------|
-| `iterate` | 1 | 迭代执行 |
-| `explain` | 1 | 解释查询计划 |
-| `profile` | 1 | 性能分析 |
-
-**小计**: 3 次 (0.04%)
-
----
-
-## 🎯 谓词分布
-
-| 排名 | 谓词 | 出现次数 | 占比 | 说明 |
-|------|------|----------|------|------|
-| 1 | `neq` | 107 | 69.48% | 不等于 |
-| 2 | `within` | 34 | 22.08% | 在集合内 |
-| 3 | `gt` | 13 | 8.44% | 大于 |
-
-**总计**: 154 次
-
-**分析**: 
-- `neq` (不等于) 是最常用的谓词，主要用于排除自身或特定值
-- `within` 用于集合成员判断
-- `gt` 用于数值比较
-
----
-
-## 📝 关键发现
-
-### 1. 查询模式特征
-- **几乎所有查询都从 `V()` 开始** (99.26%)，说明主要是顶点遍历查询
-- **标签过滤是标配** (`hasLabel` 出现 1,485 次)，说明图数据有明确的类型划分
-- **出边遍历远多于入边遍历** (`out`: 1,202 vs `in`: 475)，比例约 2.5:1
-
-### 2. 数据质量控制
-- **去重操作频繁** (`dedup` 302 次)，说明查询结果中存在重复数据
-- **路径过滤较少** (`simplePath` 20 次)，大多数查询不关心路径唯一性
-
-### 3. 分析需求
-- **分组统计需求强** (`groupCount` 182 次)
-- **标记和引用常见** (`as` 254 次, `where` 147 次)，说明有复杂的关联查询
-- **聚合操作** (`aggregate` 78 次) 用于收集中间结果
-
-### 4. 查询复杂度
-- **循环遍历** (`repeat` 76 次) 用于多跳查询
-- **分支逻辑** (`union` 115 次, `coalesce` 51 次) 用于多路径探索
-- **条件过滤** (`where` 147 次) 用于复杂条件判断
-
-### 5. 覆盖度分析
-- **高频步骤** (前 20 个) 占 92.65%，说明核心功能集中
-- **长尾步骤** (后 56 个) 仅占 7.35%，但提供了丰富的功能扩展
-- **76 种不同步骤类型** 说明 Gremlin 语法覆盖全面
-
----
-
-## 🎨 使用建议
-
-### 对于测试用例设计
-1. **优先覆盖高频步骤**: 重点测试前 20 个步骤的各种组合
-2. **关注遍历模式**: `V().hasLabel().out()` 是最常见的模式
-3. **测试去重场景**: 确保 `dedup` 在各种位置都能正常工作
-4. **验证谓词**: 重点测试 `neq`, `within`, `gt` 三个谓词
-
-### 对于性能优化
-1. **优化 `hasLabel` 和 `has`**: 这两个过滤步骤使用最频繁
-2. **优化出边遍历**: `out` 步骤占比最高
-3. **优化 `groupCount`**: 聚合操作需要特别关注性能
-
-### 对于文档编写
-1. **重点讲解高频步骤**: 前 20 个步骤应该有详细文档
-2. **提供组合示例**: 展示常见的步骤组合模式
-3. **补充长尾功能**: 虽然使用少，但要确保文档完整
-
----
-
-## 📅 生成信息
-
-- **语料库文件**: `output/generated_corpus_20251029_190729.json`
-- **统计时间**: 2025-10-29
-- **分析方法**: 基于 ANTLR 解析器的语法树分析
-- **统计脚本**: `analyze_syntax_distribution.py`
diff --git 
a/text2gremlin/AST_Text2Gremlin/output/syntax_distribution_stats.json 
b/text2gremlin/AST_Text2Gremlin/output/syntax_distribution_stats.json
deleted file mode 100644
index 621d9617..00000000
--- a/text2gremlin/AST_Text2Gremlin/output/syntax_distribution_stats.json
+++ /dev/null
@@ -1,95 +0,0 @@
-{
-  "metadata": {
-    "total_queries": 1493,
-    "total_steps": 7353,
-    "unique_step_types": 76,
-    "total_predicates": 154,
-    "unique_predicate_types": 3,
-    "total_text_predicates": 0,
-    "unique_text_predicate_types": 0
-  },
-  "steps": {
-    "hasLabel": 1485,
-    "V": 1482,
-    "out": 1202,
-    "in": 475,
-    "dedup": 302,
-    "by": 259,
-    "as": 254,
-    "has": 209,
-    "groupCount": 182,
-    "where": 147,
-    "order": 126,
-    "limit": 116,
-    "union": 115,
-    "values": 109,
-    "aggregate": 78,
-    "repeat": 76,
-    "path": 54,
-    "coalesce": 51,
-    "valueMap": 47,
-    "select": 44,
-    "times": 39,
-    "range": 38,
-    "property": 33,
-    "emit": 32,
-    "sample": 30,
-    "fold": 29,
-    "inE": 23,
-    "count": 22,
-    "until": 22,
-    "sum": 21,
-    "outE": 20,
-    "simplePath": 20,
-    "properties": 19,
-    "project": 19,
-    "sideEffect": 19,
-    "drop": 19,
-    "tree": 18,
-    "flatMap": 15,
-    "choose": 10,
-    "addV": 10,
-    "optional": 8,
-    "group": 7,
-    "hasKey": 6,
-    "mean": 6,
-    "map": 6,
-    "hasId": 5,
-    "and": 5,
-    "not": 3,
-    "hasValue": 2,
-    "both": 2,
-    "elementMap": 2,
-    "label": 2,
-    "max": 2,
-    "or": 2,
-    "store": 2,
-    "cap": 2,
-    "E": 1,
-    "bothE": 1,
-    "inV": 1,
-    "outV": 1,
-    "otherV": 1,
-    "skip": 1,
-    "tail": 1,
-    "coin": 1,
-    "id": 1,
-    "cyclicPath": 1,
-    "min": 1,
-    "filter": 1,
-    "is": 1,
-    "constant": 1,
-    "identity": 1,
-    "barrier": 1,
-    "unfold": 1,
-    "iterate": 1,
-    "explain": 1,
-    "profile": 1
-  },
-  "predicates": {
-    "neq": 107,
-    "within": 34,
-    "gt": 13
-  },
-  "text_predicates": {}
-}
\ No newline at end of file
diff --git a/text2gremlin/AST_Text2Gremlin/requirements.txt b/text2gremlin/AST_Text2Gremlin/requirements.txt
index 204098c6..396b4e32 100644
--- a/text2gremlin/AST_Text2Gremlin/requirements.txt
+++ b/text2gremlin/AST_Text2Gremlin/requirements.txt
@@ -26,7 +26,6 @@ openai==1.96.0
 packaging==25.0
 pandas==2.3.1
 pillow==11.3.0
-pip==25.1.1
 propcache==0.3.2
 psutil==7.0.0
 pydantic==2.11.7
@@ -35,7 +34,6 @@ pyparsing==3.2.3
 python-dateutil==2.9.0.post0
 python-dotenv==1.1.1
 pytz==2025.2
-setuptools==80.9.0
 six==1.17.0
 sniffio==1.3.1
 tenacity==9.1.2
@@ -43,5 +41,4 @@ tqdm==4.67.1
 typing_extensions==4.14.1
 typing-inspection==0.4.1
 tzdata==2025.2
-wheel==0.45.1
-yarl==1.20.1
\ No newline at end of file
+yarl==1.20.1

(incubator-hugegraph-ai) 31/32: Add Apache-2.0 license, fix review comments

Reply via email to