expose extra files of each language: grammar.json, node-types.json, highlights.scm, ...

`grammar.json` is needed to map from `node.type` to `node.kind_id`,
assuming that `node.type` is more stable than `node.kind_id` across different versions of a parser

the extra files should be stored in the filesystem to save memory

```py
# TODO better? get name-id mappings from parser binary?

import json
import os

# NOTE: this maps each rule name to its position in grammar.json.
# That position is not guaranteed to equal node.kind_id: the numeric
# symbol ids are defined by `enum ts_symbol_identifiers` in src/parser.c.
# grammar.json is UTF-8, so do not rely on the platform default encoding.
grammar_path = os.environ["TREE_SITTER_HTML_SRC"] + "/src/grammar.json"
with open(grammar_path, "r", encoding="utf-8") as f:
    tree_sitter_html_grammar = json.load(f)

# rejected: attribute access via SimpleNamespace does not work,
# because names can be ugly names like '"'
# import types
# node_kind = types.SimpleNamespace(**{
#     name: id for id, name in enumerate(tree_sitter_html_grammar["rules"])
# })
# print("node_kind.document", node_kind.document)

# rule name -> index of the rule in grammar.json
node_kind = {
    name: rule_idx
    for rule_idx, name in enumerate(tree_sitter_html_grammar["rules"])
}

print("node_kind document", node_kind["document"])
```

> TODO better? get name-id mappings from parser binary?

probably this should be fixed upstream in tree-sitter

edit: `tree_sitter_html_grammar["rules"]` is wrong: the position of a rule in `grammar.json` is not its `node.kind_id`.
I was looking for `ts_symbol_identifiers` and `ts_symbol_names` in `src/parser.c`

```
enum ts_symbol_identifiers {
  anon_sym_LT_BANG = 1,
  aux_sym_doctype_token1 = 2,
  anon_sym_GT = 3,
```

```
static const char * const ts_symbol_names[] = {
  [ts_builtin_sym_end] = "end",
  [anon_sym_LT_BANG] = "<!",
  [aux_sym_doctype_token1] = "doctype_token1",
  [anon_sym_GT] = ">",
```

parsing `src/parser.c` is a bit more than `json.load`...

<details>
<summary>
parse_parser_c.py
</summary>

```py
import ast
import json
import os
import sys
from collections import Counter

import tree_sitter_languages

# Read the generated parser source as raw bytes;
# the bytes are handed to the C parser unchanged.
parser_c_path = os.environ["TREE_SITTER_HTML_SRC"] + "/src/parser.c"
with open(parser_c_path, "rb") as parser_c_file:
    parser_c_src = parser_c_file.read()

# Parse parser.c with the C parser provided by tree_sitter_languages.
tree_sitter_c = tree_sitter_languages.get_parser("c")
parser_c_tree = tree_sitter_c.parse(parser_c_src)

def walk_tree(tree):
    """Yield every node below ``tree`` in depth-first pre-order.

    ``tree`` is any object whose ``walk()`` returns a cursor with a ``node``
    attribute and the methods ``goto_first_child``, ``goto_next_sibling``
    and ``goto_parent`` (this script passes ``parser_c_tree.root_node``).
    The traversal is iterative, so deep trees do not hit the recursion limit.
    """
    cursor = tree.walk()
    while True:
        yield cursor.node

        # prefer going down, then sideways
        if cursor.goto_first_child() or cursor.goto_next_sibling():
            continue

        # leaf without a next sibling: climb until an ancestor has one
        while True:
            climbed = cursor.goto_parent()
            moved = cursor.goto_next_sibling()
            if not climbed:
                # the cursor was already at its starting node: all done
                return
            if moved:
                break

if False:

    # debug: dump the syntax tree of parser.c, one line per node, then exit

    node_idx = 0
    max_len = 30

    for node in walk_tree(parser_c_tree.root_node):

        # JSON-quote the node source so that newlines and quotes
        # stay on one line, then cut it off after max_len characters
        node_text = json.dumps(node.text.decode("utf8"))
        node_text = (
            node_text if len(node_text) <= max_len
            else node_text[:max_len] + "..."
        )

        #pfx = "# " if is_compound else "  "
        pfx = ""
        print(pfx + f"node {node.kind_id:2d} = {node.type:25s} : {node_text:30s}")

        node_idx += 1
        #if node_idx > 100: break

    sys.exit()

# State of the single pass over the syntax tree of parser.c.
# The pass collects two tables:
#   enum ts_symbol_identifiers { anon_sym_GT = 3, ... }
#   ts_symbol_names[] = { [anon_sym_GT] = ">", ... }
in_enum_ts_symbol_identifiers = False  # inside the enum body
in_char_ts_symbol_names = False  # inside the ts_symbol_names initializer
enum_name = None  # never read in this script
current_identifier = None  # enumerator or designator waiting for its value
enum_ts_symbol_identifiers = dict()  # C identifier -> numeric symbol id
char_ts_symbol_names = dict()  # C identifier -> symbol name string


for node in walk_tree(parser_c_tree.root_node):

    # Decode node.text only where the text is needed. Decoding it for every
    # node would also decode the root node, whose text is the whole file.
    node_type = node.type

    # the name of the enum starts the first table
    if node_type == "type_identifier" and node.text == b"ts_symbol_identifiers":
        in_enum_ts_symbol_identifiers = True
        continue

    # the declarator of the array starts the second table
    # (exact text match, so this depends on the whitespace in parser.c)
    if node_type == "pointer_declarator" and node.text == b"* const ts_symbol_names[]":
        in_char_ts_symbol_names = True
        continue

    if in_enum_ts_symbol_identifiers:

        if node_type == "identifier":
            # enumerator name, its value follows as number_literal
            current_identifier = node.text.decode("utf8")
        elif node_type == "number_literal":
            enum_ts_symbol_identifiers[current_identifier] = (
                int(node.text.decode("utf8"))
            )
            current_identifier = None
        elif node_type == "}":
            # end of the enum body
            current_identifier = None
            in_enum_ts_symbol_identifiers = False

        continue

    if in_char_ts_symbol_names:

        if node_type == "subscript_designator":
            # "[anon_sym_GT]" -> "anon_sym_GT"
            current_identifier = node.text.decode("utf8")[1:-1]
        elif node_type == "string_literal":
            # NOTE(review): the C string literal is evaluated with the rules
            # for Python literals. This assumes that parser.c only uses
            # escape sequences that mean the same in both languages.
            char_ts_symbol_names[current_identifier] = (
                ast.literal_eval(node.text.decode("utf8"))
            )
            current_identifier = None
        elif node_type == "}":
            # end of the initializer: both tables are complete
            current_identifier = None
            in_char_ts_symbol_names = False
            break

        continue


#print("enum_ts_symbol_identifiers =", json.dumps(enum_ts_symbol_identifiers, indent=2))
#print("char_ts_symbol_names =", json.dumps(char_ts_symbol_names, indent=2))

# Symbol names are not unique: a grammar can use the same name
# for different tokens.
# example: <!doctype html>
# both the full tag and the tag_name have the token name "doctype"
#   sym_doctype = 26, // full doctype tag
#   sym__doctype = 4, // tag_name of doctype tag
# Symbols with such a duplicate name are stored in node_kind under their
# full C identifier. This forces the user to pick the exact symbol,
# as listed in full_node_kind.

# full C identifier -> id
full_node_kind = enum_ts_symbol_identifiers

# Count every name once, instead of rescanning all names for each symbol.
name_counts = Counter(char_ts_symbol_names.values())

# symbol name (or full C identifier for duplicate names) -> id
node_kind = dict()
for full_name, kind_id in enum_ts_symbol_identifiers.items():
    name = char_ts_symbol_names[full_name]
    is_duplicate = name_counts[name] > 1
    node_kind[full_name if is_duplicate else name] = kind_id

# allow reverse lookup from id to name
# Index the list by the actual id, so that the lookup does not depend on
# the ids being consecutive or on the order of the enum.
# Ids without an entry in node_kind stay None (for example id 0,
# when the enum starts at 1).
node_name = [None] * (max(node_kind.values(), default=0) + 1)
for name, kind_id in node_kind.items():
    node_name[kind_id] = name

#print("full_node_kind =", json.dumps(full_node_kind, indent=2))
print("node_kind =", json.dumps(node_kind, indent=2))
#print("node_kind document =", node_kind["document"])
```

</details>

alternative: parse a source that contains all possible node types
and build the mapping from the `node.type` and `node.kind_id` values

keywords: tree-sitter use numeric node types in scripting languages python javascript


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

expose extra files of each language: grammar.json, node-types.json, highlights.scm, ... #59

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

expose extra files of each language: grammar.json, node-types.json, highlights.scm, ... #59

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions