On Tue, May 05, 2026 at 08:59:23AM +0200, Nam Cao wrote:
> The DOT parsing scripts directly parse the raw text and they are quite
> fragile. If the input dot files' formats are slightly changed (for
> instance, by breaking long some lines which is allowed by the DOT language
> defined by graphviz), the scripts would fail.
>
> To make the scripts robust, the parser should be implemented based on the
> dot language specification, not based on how the existing dot files look.
>
> As a first step, use Lark to implement a Parser based on the graphviz dot
> language specification. The resulting parse tree is not used yet, but the
> existing scripts will be converted one by one to use this new parse tree in
> the follow-up commits.
>
> Signed-off-by: Nam Cao <[email protected]>
> ---
> tools/verification/rvgen/rvgen/automata.py | 182 +++++++++++++++++++++
> 1 file changed, 182 insertions(+)
>
> diff --git a/tools/verification/rvgen/rvgen/automata.py
> b/tools/verification/rvgen/rvgen/automata.py
> index b9f8149f7118..4e3d719a0952 100644
> --- a/tools/verification/rvgen/rvgen/automata.py
> +++ b/tools/verification/rvgen/rvgen/automata.py
> @@ -13,6 +13,187 @@ import re
> from typing import Iterator
> from itertools import islice
>
> +import lark
> +
> +class ParseTree:
> + # based on https://graphviz.org/doc/info/lang.html
> + # with the irrelevant stuffs (port and compass) removed
> + grammar = r'''
> + start: "strict"? ("graph" | "digraph") ID? "{" stmt_list "}"
> +
> + stmt_list: (stmt ";"? stmt_list)?
> +
> + stmt: node_stmt
> + | edge_stmt
> + | attr_stmt
> + | ID "=" ID
> + | subgraph
> +
> + attr_stmt: attr_type attr_list
> +
> + attr_type: "graph" -> graph
> + | "node" -> node
> + | "edge" -> edge
> +
> + attr_list: "[" a_list? "]" attr_list?
> +
> + a_list: ID "=" ID (";" | ",")? a_list?
> +
> + edge_stmt: (node_id | subgraph) edgerhs attr_list?
> +
> + edgerhs: edgeop (node_id | subgraph) edgerhs?
> +
> + edgeop: "->" | "--"
> +
> + node_stmt: node_id attr_list?
> +
> + node_id: ID
> +
> + subgraph: ("subgraph" ID?)? "{" stmt_list "}"
> +
> + ID: /[_a-zA-Z][_a-zA-Z0-9]+/
This regex rejects symbol character symbol. Is that intentional?
> + | /-?(\.[0-9]+|[0-9]+(\.[0-9]*))/
> + | /".*?"/
> +
> + %import common.WS
> + %ignore WS
> + '''
> +
> + @staticmethod
> + def parse_edge(tree: lark.Tree) -> tuple[str, str]:
> + # only support a simple node-to-node edge
> + nodes = []
> + for node in tree.iter_subtrees_topdown():
> + if node.data == "node_id":
> + nodes.append(node.children[0].strip('"'))
> +
> + if len(nodes) != 2:
> + raise AutomataError("Only state-to-state transition is
> supported")
> +
> + return tuple(nodes)
> +
> + class ParseNodes(lark.visitors.Visitor):
> + def __init__(self, *args, **kwargs):
> + self.nodes = set()
> + super().__init__(*args, **kwargs)
> +
> + def node_stmt(self, tree):
> + node_id = tree.children[0]
> + node = node_id.children[0].strip('"')
> + self.nodes.add(node)
> +
> + class ParseEdges(lark.visitors.Visitor):
> + def __init__(self, *args, **kwargs):
> + self.edges = set()
> + super().__init__(*args, **kwargs)
> +
> + def edge_stmt(self, tree):
> + edge = ParseTree.parse_edge(tree)
> + self.edges.add(edge)
> +
> + class ParseAttributes(lark.visitors.Interpreter):
> + def __init__(self, *args, **kwargs):
> + '''
> + Stacks of default attributes. [0] is the default
> + attributes for the outermost scope, while [-1] is the
> + default attributes for the current scope.
> + '''
> + self.default_node_attrs = [{}]
> + self.default_edge_attrs = [{}]
> +
> + self.node_attrs = {}
> + self.edge_attrs = {}
> +
> + super().__init__(*args, **kwargs)
> +
> + @staticmethod
> + def __get_attrs(stmt: lark.Tree) -> dict[str, str]:
> + attrs = {}
> +
> + for node in stmt.iter_subtrees():
> + if node.data == "a_list":
> + attrs[node.children[0]] = node.children[1].strip('"')
> +
> + return attrs
> +
> +
> + def subgraph(self, tree):
> + # We are entering a new scope, inherit the default
> + # attributes of the outer scope
> +
> self.default_node_attrs.append(self.default_node_attrs[-1].copy())
> +
> self.default_edge_attrs.append(self.default_edge_attrs[-1].copy())
> +
> + children = self.visit_children(tree)
> +
> + # Exiting the scope
> + del self.default_node_attrs[-1]
> + del self.default_edge_attrs[-1]
> +
> + return children
> +
> + def node_stmt(self, tree):
> + node_id = tree.children[0]
> + node = node_id.children[0].strip('"')
> +
> + attrs = self.default_node_attrs[-1].copy()
> + attrs |= self.__get_attrs(tree)
> +
> + if attrs:
> + if node in self.node_attrs:
> + self.node_attrs[node] = attrs | self.node_attrs[node]
> + else:
> + self.node_attrs[node] = attrs
> +
> + return self.visit_children(tree)
> +
> + def edge_stmt(self, tree):
> + edge = ParseTree.parse_edge(tree)
> +
> + attrs = self.default_edge_attrs[-1].copy()
> + attrs |= self.__get_attrs(tree)
> +
> + if attrs:
> + if edge in self.edge_attrs:
> + self.edge_attrs[edge] = attrs | self.edge_attrs[edge]
> + else:
> + self.edge_attrs[edge] = attrs
> +
> + return self.visit_children(tree)
> +
> + def attr_stmt(self, tree):
> + attr_type = tree.children[0].data
> + attrs = self.__get_attrs(tree)
> +
> + if attr_type == "node":
> + self.default_node_attrs[-1] |= attrs
> + elif attr_type == "edge":
> + self.default_edge_attrs[-1] |= attrs
> + else:
> + # graph attributes are irrelevant
> + pass
> +
> + self.visit_children(tree)
> +
> + def __init__(self, dot_file):
> + parser = lark.Lark(self.grammar, parser='lalr')
> + node_parser = self.ParseNodes()
> + edge_parser = self.ParseEdges()
> + attributes_parser = self.ParseAttributes()
> +
> + try:
> + with open(dot_file, "r") as dot_file:
> + tree = parser.parse(dot_file.read())
> + attributes_parser.visit(tree)
> + node_parser.visit(tree)
> + edge_parser.visit(tree)
> + except OSError as exc:
> + raise AutomataError(exc.strerror) from exc
> +
> + self.nodes = node_parser.nodes
> + self.edges = edge_parser.edges
> + self.node_attrs = attributes_parser.node_attrs
> + self.edge_attrs = attributes_parser.edge_attrs
> +
> class _ConstraintKey:
> """Base class for constraint keys."""
>
> @@ -66,6 +247,7 @@ class Automata:
> self.__dot_path = file_path
> self.name = model_name or self.__get_model_name()
> self.__dot_lines = self.__open_dot()
> + self.__parse_tree = ParseTree(file_path)
> self.states, self.initial_state, self.final_states =
> self.__get_state_variables()
> self.env_types = {}
> self.env_stored = set()
> --
> 2.47.3
>