-
Notifications
You must be signed in to change notification settings - Fork 56
Open
Description
`grammar.json` is needed to map from `node.type` to `node.kind_id`, assuming that `node.type` is more stable across different versions of a parser.
The extra files should be stored in the filesystem to save memory.
# TODO better? get name-id mappings from parser binary?
# First attempt: derive a name -> id table from the rule order in the
# grammar's src/grammar.json. The grammar checkout is located through the
# TREE_SITTER_HTML_SRC environment variable (KeyError if it is unset).
import json
import os

with open(
    os.environ["TREE_SITTER_HTML_SRC"] + "/src/grammar.json",
    "r",
    encoding="utf-8",
) as f:
    tree_sitter_html_grammar = json.load(f)

# A types.SimpleNamespace (node_kind.document) was considered and rejected:
# names can be ugly names like '"', which are not valid attribute names,
# so a plain dict keyed by the rule name is used instead.
node_kind = {
    name: kind_id
    for kind_id, name in enumerate(tree_sitter_html_grammar["rules"])
}
print("node_kind document", node_kind["document"])
TODO: is there a better way? Could the name-to-id mappings be read from the parser binary?
This should probably be fixed upstream in tree-sitter.
Edit: `tree_sitter_html_grammar["rules"]` is wrong.
I was looking for `ts_symbol_identifiers` and `ts_symbol_names` in `src/parser.c`:
enum ts_symbol_identifiers {
anon_sym_LT_BANG = 1,
aux_sym_doctype_token1 = 2,
anon_sym_GT = 3,
static const char * const ts_symbol_names[] = {
[ts_builtin_sym_end] = "end",
[anon_sym_LT_BANG] = "<!",
[aux_sym_doctype_token1] = "doctype_token1",
[anon_sym_GT] = ">",
Parsing `src/parser.c` takes a bit more work than `json.load`:
...
parse_parser_c.py
import ast
import collections
import json
import os
import sys

import tree_sitter_languages
# Read the generated parser source as raw bytes (the tree-sitter parser is
# fed bytes, so no text decoding is done here). The grammar checkout is
# located through the TREE_SITTER_HTML_SRC environment variable.
with open(os.environ["TREE_SITTER_HTML_SRC"] + "/src/parser.c", "rb") as f:
    parser_c_src = f.read()

# Parse parser.c itself with the tree-sitter C grammar so the symbol tables
# can be pulled out of its syntax tree instead of being regex-matched.
tree_sitter_c = tree_sitter_languages.get_parser("c")
parser_c_tree = tree_sitter_c.parse(parser_c_src)
def walk_tree(tree):
    """Yield every node below ``tree`` in depth-first pre-order.

    ``tree`` is any object whose ``walk()`` method returns a cursor exposing
    ``node``, ``goto_first_child()``, ``goto_next_sibling()`` and
    ``goto_parent()`` (each ``goto_*`` returns a truthy value when the
    cursor actually moved). The traversal is iterative, so deep trees do
    not hit the recursion limit.
    """
    cursor = tree.walk()
    while True:
        yield cursor.node

        # Prefer descending; otherwise step sideways to the next sibling.
        if cursor.goto_first_child() or cursor.goto_next_sibling():
            continue

        # Dead end: climb until an ancestor has an unvisited sibling.
        # Failing to climb means the cursor is back at its starting node
        # and the whole subtree has been visited.
        while True:
            if not cursor.goto_parent():
                return
            if cursor.goto_next_sibling():
                break
if False:
    # debug: print AST
    # Disabled by default. Flip the condition to dump one line per node of
    # parser.c (numeric kind id, node type, truncated source text) and then
    # exit without building the symbol tables.
    max_len = 30
    for node_idx, node in enumerate(walk_tree(parser_c_tree.root_node)):
        # json.dumps gives a quoted, escaped, single-line rendering.
        node_text = json.dumps(node.text.decode("utf8"))
        if len(node_text) > max_len:
            node_text = node_text[:max_len] + "..."
        print(f"node {node.kind_id:2d} = {node.type:25s} : {node_text:30s}")
        # if node_idx > 100: break
    sys.exit()
# Single pass over the syntax tree of parser.c that extracts two tables:
#   enum_ts_symbol_identifiers: C enumerator name -> numeric symbol id
#   char_ts_symbol_names:       C enumerator name -> display name string
# The loop is a small state machine: a flag is raised when the header of
# the enum / the array declarator is seen, and lowered again at the next
# "}" token.
in_enum_ts_symbol_identifiers = False
in_char_ts_symbol_names = False
enum_name = None
# Key waiting for its value (enumerator name, or the name inside "[...]").
current_identifier = None
enum_ts_symbol_identifiers = {}
char_ts_symbol_names = {}

for node in walk_tree(parser_c_tree.root_node):
    # Node text is only decoded where it is actually stored; decoding it
    # up front for every node (including large enclosing nodes) is wasted
    # work on a file the size of parser.c.
    if node.type == "type_identifier" and node.text == b"ts_symbol_identifiers":
        in_enum_ts_symbol_identifiers = True
        continue
    if node.type == "pointer_declarator" and node.text == b"* const ts_symbol_names[]":
        in_char_ts_symbol_names = True
        continue

    if in_enum_ts_symbol_identifiers:
        # Inside `enum ts_symbol_identifiers { name = number, ... }`.
        if node.type == "identifier":
            current_identifier = node.text.decode("utf8")
        elif node.type == "number_literal":
            enum_ts_symbol_identifiers[current_identifier] = int(
                node.text.decode("utf8")
            )
            current_identifier = None
        elif node.type == "}":
            # End of the enum body; keep walking to reach the names array.
            current_identifier = None
            in_enum_ts_symbol_identifiers = False
        continue

    if in_char_ts_symbol_names:
        # Inside `ts_symbol_names[] = { [name] = "string", ... }`.
        if node.type == "subscript_designator":
            # Strip the surrounding "[" and "]".
            current_identifier = node.text.decode("utf8")[1:-1]
        elif node.type == "string_literal":
            # NOTE(review): the C string literal is evaluated with Python
            # literal rules; this presumes the generated strings only use
            # escapes that both languages share -- confirm.
            char_ts_symbol_names[current_identifier] = ast.literal_eval(
                node.text.decode("utf8")
            )
            current_identifier = None
        elif node.type == "}":
            # End of the initializer: both tables are complete, stop early.
            current_identifier = None
            in_char_ts_symbol_names = False
            break
        continue

#print("enum_ts_symbol_identifiers =", json.dumps(enum_ts_symbol_identifiers, indent=2))
#print("char_ts_symbol_names =", json.dumps(char_ts_symbol_names, indent=2))
# Build the user-facing lookup tables.
#
# full_node_kind: exact C enumerator name -> id (always unambiguous).
# node_kind:      display name -> id, except where a display name is shared
#                 by several symbols; those symbols are stored under their
#                 full enumerator name instead, which forces the user to
#                 pick the exact one.
#
# Names can collide when a grammar uses the same name for different tokens.
# Example: <!doctype html>
# Both the full tag and the tag_name have the token name "doctype":
#   sym_doctype = 26,  // full doctype tag
#   sym__doctype = 4,  // tag_name of doctype tag
full_node_kind = enum_ts_symbol_identifiers

# Count each display name once up front instead of rescanning every name
# for every symbol.
name_counts = collections.Counter(char_ts_symbol_names.values())

node_kind = {}
for full_name, kind_id in enum_ts_symbol_identifiers.items():
    name = char_ts_symbol_names[full_name]
    # Duplicate display name: fall back to the full enumerator name.
    key = full_name if name_counts[name] > 1 else name
    node_kind[key] = kind_id

# Allow reverse lookup from id to name: node_name[kind_id] -> key used in
# node_kind. The list is indexed by the actual id, so it stays correct even
# if the ids are not contiguous; ids with no entry (including 0) are None.
node_name = [None] * (max(node_kind.values(), default=0) + 1)
for key, kind_id in node_kind.items():
    node_name[kind_id] = key

#print("full_node_kind =", json.dumps(full_node_kind, indent=2))
print("node_kind =", json.dumps(node_kind, indent=2))
#print("node_kind document =", node_kind["document"])
Alternative: parse a source file that contains all possible node types, and build the mapping from the observed `node.type` and `node.kind_id` values.
keywords: tree-sitter use numeric node types in scripting languages python javascript
Metadata
Metadata
Assignees
Labels
No labels