On Tue, May 05, 2026 at 08:59:23AM +0200, Nam Cao wrote:
> The DOT parsing scripts directly parse the raw text and they are quite
> fragile. If the input dot files' formats are slightly changed (for
> instance, by breaking long some lines which is allowed by the DOT language
> defined by graphviz), the scripts would fail.
> 
> To make the scripts robust, the parser should be implemented based on the
> dot language specification, not based on how the existing dot files look.
> 
> As a first step, use Lark to implement a Parser based on the graphviz dot
> language specification. The resulting parse tree is not used yet, but the
> existing scripts will be converted one by one to use this new parse tree in
> the follow-up commits.
> 
> Signed-off-by: Nam Cao <[email protected]>
> ---
>  tools/verification/rvgen/rvgen/automata.py | 182 +++++++++++++++++++++
>  1 file changed, 182 insertions(+)
> 
> diff --git a/tools/verification/rvgen/rvgen/automata.py 
> b/tools/verification/rvgen/rvgen/automata.py
> index b9f8149f7118..4e3d719a0952 100644
> --- a/tools/verification/rvgen/rvgen/automata.py
> +++ b/tools/verification/rvgen/rvgen/automata.py
> @@ -13,6 +13,187 @@ import re
>  from typing import Iterator
>  from itertools import islice
>  
> +import lark
> +
> +class ParseTree:
> +    # based on https://graphviz.org/doc/info/lang.html
> +    # with the irrelevant stuffs (port and compass) removed
> +    grammar = r'''
> +    start: "strict"? ("graph" | "digraph") ID? "{" stmt_list "}"
> +
> +    stmt_list: (stmt ";"? stmt_list)?
> +
> +    stmt: node_stmt
> +        | edge_stmt
> +        | attr_stmt
> +        | ID "=" ID
> +        | subgraph
> +
> +    attr_stmt: attr_type attr_list
> +
> +    attr_type: "graph" -> graph
> +            | "node"  -> node
> +            | "edge"  -> edge
> +
> +    attr_list: "[" a_list? "]" attr_list?
> +
> +    a_list: ID "=" ID (";" | ",")? a_list?
> +
> +    edge_stmt: (node_id | subgraph) edgerhs attr_list?
> +
> +    edgerhs: edgeop (node_id | subgraph) edgerhs?
> +
> +    edgeop: "->" | "--"
> +
> +    node_stmt: node_id attr_list?
> +
> +    node_id: ID
> +
> +    subgraph: ("subgraph" ID?)? "{" stmt_list "}"
> +
> +    ID: /[_a-zA-Z][_a-zA-Z0-9]+/

This regex rejects symbol character symbol. Is that intentional?

> +      | /-?(\.[0-9]+|[0-9]+(\.[0-9]*))/
> +      | /".*?"/
> +
> +    %import common.WS
> +    %ignore WS
> +    '''
> +
> +    @staticmethod
> +    def parse_edge(tree: lark.Tree) -> tuple[str, str]:
> +        # only support a simple node-to-node edge
> +        nodes = []
> +        for node in tree.iter_subtrees_topdown():
> +            if node.data == "node_id":
> +                nodes.append(node.children[0].strip('"'))
> +
> +        if len(nodes) != 2:
> +            raise AutomataError("Only state-to-state transition is 
> supported")
> +
> +        return tuple(nodes)
> +
> +    class ParseNodes(lark.visitors.Visitor):
> +        def __init__(self, *args, **kwargs):
> +            self.nodes = set()
> +            super().__init__(*args, **kwargs)
> +
> +        def node_stmt(self, tree):
> +            node_id = tree.children[0]
> +            node = node_id.children[0].strip('"')
> +            self.nodes.add(node)
> +
> +    class ParseEdges(lark.visitors.Visitor):
> +        def __init__(self, *args, **kwargs):
> +            self.edges = set()
> +            super().__init__(*args, **kwargs)
> +
> +        def edge_stmt(self, tree):
> +            edge = ParseTree.parse_edge(tree)
> +            self.edges.add(edge)
> +
> +    class ParseAttributes(lark.visitors.Interpreter):
> +        def __init__(self, *args, **kwargs):
> +            '''
> +            Stacks of default attributes. [0] is the default
> +            attributes for the outermost scope, while [-1] is the
> +            default attributes for the current scope.
> +            '''
> +            self.default_node_attrs = [{}]
> +            self.default_edge_attrs = [{}]
> +
> +            self.node_attrs = {}
> +            self.edge_attrs = {}
> +
> +            super().__init__(*args, **kwargs)
> +
> +        @staticmethod
> +        def __get_attrs(stmt: lark.Tree) -> dict[str, str]:
> +            attrs = {}
> +
> +            for node in stmt.iter_subtrees():
> +                if node.data == "a_list":
> +                    attrs[node.children[0]] = node.children[1].strip('"')
> +
> +            return attrs
> +
> +
> +        def subgraph(self, tree):
> +            # We are entering a new scope, inherit the default
> +            # attributes of the outer scope
> +            
> self.default_node_attrs.append(self.default_node_attrs[-1].copy())
> +            
> self.default_edge_attrs.append(self.default_edge_attrs[-1].copy())
> +
> +            children = self.visit_children(tree)
> +
> +            # Exiting the scope
> +            del self.default_node_attrs[-1]
> +            del self.default_edge_attrs[-1]
> +
> +            return children
> +
> +        def node_stmt(self, tree):
> +            node_id = tree.children[0]
> +            node = node_id.children[0].strip('"')
> +
> +            attrs = self.default_node_attrs[-1].copy()
> +            attrs |= self.__get_attrs(tree)
> +
> +            if attrs:
> +                if node in self.node_attrs:
> +                    self.node_attrs[node] = attrs | self.node_attrs[node]
> +                else:
> +                    self.node_attrs[node] = attrs
> +
> +            return self.visit_children(tree)
> +
> +        def edge_stmt(self, tree):
> +            edge = ParseTree.parse_edge(tree)
> +
> +            attrs = self.default_edge_attrs[-1].copy()
> +            attrs |= self.__get_attrs(tree)
> +
> +            if attrs:
> +                if edge in self.edge_attrs:
> +                    self.edge_attrs[edge] = attrs | self.edge_attrs[edge]
> +                else:
> +                    self.edge_attrs[edge] = attrs
> +
> +            return self.visit_children(tree)
> +
> +        def attr_stmt(self, tree):
> +            attr_type = tree.children[0].data
> +            attrs = self.__get_attrs(tree)
> +
> +            if attr_type == "node":
> +                self.default_node_attrs[-1] |= attrs
> +            elif attr_type == "edge":
> +                self.default_edge_attrs[-1] |= attrs
> +            else:
> +                # graph attributes are irrelevant
> +                pass
> +
> +            self.visit_children(tree)
> +
> +    def __init__(self, dot_file):
> +        parser = lark.Lark(self.grammar, parser='lalr')
> +        node_parser = self.ParseNodes()
> +        edge_parser = self.ParseEdges()
> +        attributes_parser = self.ParseAttributes()
> +
> +        try:
> +            with open(dot_file, "r") as dot_file:
> +                tree = parser.parse(dot_file.read())
> +                attributes_parser.visit(tree)
> +                node_parser.visit(tree)
> +                edge_parser.visit(tree)
> +        except OSError as exc:
> +            raise AutomataError(exc.strerror) from exc
> +
> +        self.nodes = node_parser.nodes
> +        self.edges = edge_parser.edges
> +        self.node_attrs = attributes_parser.node_attrs
> +        self.edge_attrs = attributes_parser.edge_attrs
> +
>  class _ConstraintKey:
>      """Base class for constraint keys."""
>  
> @@ -66,6 +247,7 @@ class Automata:
>          self.__dot_path = file_path
>          self.name = model_name or self.__get_model_name()
>          self.__dot_lines = self.__open_dot()
> +        self.__parse_tree = ParseTree(file_path)
>          self.states, self.initial_state, self.final_states = 
> self.__get_state_variables()
>          self.env_types = {}
>          self.env_stored = set()
> -- 
> 2.47.3
> 


Reply via email to