diff --git a/CHANGELOG.md b/CHANGELOG.md index c661874c..3fb15ba1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Added +- **Nix support** (flake-aware): `.nix` files are parsed via the `nix` tree-sitter grammar shipped with `tree-sitter-language-pack`. Top-level and nested attrset bindings become `Function` nodes with flattened dotted names (e.g. `packages.default`, `devShells.default`). In `flake.nix`, `inputs.<name>.url = "..."` strings emit `IMPORTS_FROM` edges to the URL; `import <path>` and `callPackage <path>` applications in any `.nix` file emit `IMPORTS_FROM` edges (relative paths are resolved against the caller's directory). Adds 7 tests (`TestNixParsing`) and fixtures `tests/fixtures/sample.nix`, `tests/fixtures/sample_module.nix`. - **GDScript support** (Godot): `.gd` files are parsed via the `gdscript` tree-sitter grammar shipped with `tree-sitter-language-pack`. Extracts inner classes (`class Name:`), the file-level `class_name` identity, functions (including `static func`), `extends` parent class as an IMPORTS_FROM edge, direct calls (`call`) and method calls (`attribute_call`). Adds 10 tests and `tests/fixtures/sample.gd`. 
## [2.3.2] - 2026-04-14 diff --git a/CLAUDE.md b/CLAUDE.md index d3983fe5..b9f4dcdf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -15,7 +15,7 @@ When using code-review-graph MCP tools, follow these rules: ## Architecture - **Core Package**: `code_review_graph/` (Python 3.10+) - - `parser.py` — Tree-sitter multi-language AST parser (19 languages including Vue SFC, Solidity, Dart, R, Perl, Lua + Jupyter/Databricks notebooks) + - `parser.py` — Tree-sitter multi-language AST parser (20 languages including Vue SFC, Solidity, Dart, R, Perl, Lua, Nix + Jupyter/Databricks notebooks) - `graph.py` — SQLite-backed graph store (nodes, edges, BFS impact analysis) - `tools.py` — 22 MCP tool implementations - `main.py` — FastMCP server entry point (stdio transport), registers 22 tools + 5 prompts diff --git a/README.md b/README.md index 07249031..f995f127 100644 --- a/README.md +++ b/README.md @@ -104,13 +104,13 @@ Large monorepos are where token waste is most painful. The graph cuts through th Next.js monorepo: 27,732 files funnelled through code-review-graph down to ~15 files — 49x fewer tokens

-### 23 languages + Jupyter notebooks +### 24 languages + Jupyter notebooks

- 19 languages organized by category: Web, Backend, Systems, Mobile, Scripting, plus Jupyter/Databricks notebook support + 24 languages organized by category: Web, Backend, Systems, Mobile, Scripting, Config (Nix), plus Jupyter/Databricks notebook support

-Full Tree-sitter grammar support for functions, classes, imports, call sites, inheritance, and test detection in every language. Includes Zig, PowerShell, Julia, and Svelte SFC support. Plus Jupyter/Databricks notebook parsing (`.ipynb`) with multi-language cell support (Python, R, SQL), and Perl XS files (`.xs`). +Full Tree-sitter grammar support for functions, classes, imports, call sites, inheritance, and test detection in every language. Includes Zig, PowerShell, Julia, Svelte SFC, and flake-aware Nix support. Plus Jupyter/Databricks notebook parsing (`.ipynb`) with multi-language cell support (Python, R, SQL), and Perl XS files (`.xs`). --- @@ -192,7 +192,7 @@ The blast-radius analysis never misses an actually impacted file (perfect recall | Feature | Details | |---------|---------| | **Incremental updates** | Re-parses only changed files. Subsequent updates complete in under 2 seconds. | -| **23 languages + notebooks** | Python, TypeScript/TSX, JavaScript, Vue, Svelte, Go, Rust, Java, Scala, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++, Dart, R, Perl, Lua, Zig, PowerShell, Julia, Jupyter/Databricks (.ipynb) | +| **24 languages + notebooks** | Python, TypeScript/TSX, JavaScript, Vue, Svelte, Go, Rust, Java, Scala, C#, Ruby, Kotlin, Swift, PHP, Solidity, C/C++, Dart, R, Perl, Lua, Zig, PowerShell, Julia, Nix, Jupyter/Databricks (.ipynb) | | **Blast-radius analysis** | Shows exactly which functions, classes, and files are affected by any change | | **Auto-update hooks** | Graph updates on every file edit and git commit without manual intervention | | **Semantic search** | Optional vector embeddings via sentence-transformers, Google Gemini, MiniMax, or any OpenAI-compatible endpoint (real OpenAI, Azure, new-api, LiteLLM, vLLM, LocalAI) | diff --git a/code_review_graph/parser.py b/code_review_graph/parser.py index f681263a..b9cce558 100644 --- a/code_review_graph/parser.py +++ b/code_review_graph/parser.py @@ -125,6 +125,7 @@ class EdgeInfo: ".res": "rescript", 
".resi": "rescript", ".gd": "gdscript", + ".nix": "nix", } # Tree-sitter node type mappings per language @@ -169,6 +170,9 @@ class EdgeInfo: # identifier is literally "defmodule". Dispatched via # _extract_elixir_constructs to avoid matching every ``call`` here. "elixir": [], + # Nix: attrset bindings aren't "classes"; dispatched via + # _extract_nix_constructs. + "nix": [], "zig": ["container_declaration"], "powershell": ["class_statement"], "julia": ["struct_definition", "abstract_definition"], @@ -216,6 +220,9 @@ class EdgeInfo: # Elixir: def/defp/defmacro are all ``call`` nodes whose first # identifier matches. Dispatched via _extract_elixir_constructs. "elixir": [], + # Nix: `attrpath = expr;` bindings become Function nodes — + # handled in _extract_nix_constructs. + "nix": [], "zig": ["fn_proto", "fn_decl"], "powershell": ["function_statement"], "julia": [ @@ -256,6 +263,10 @@ class EdgeInfo: # Elixir: alias/import/require/use are all ``call`` nodes — # handled in _extract_elixir_constructs. "elixir": [], + # Nix: `import ./x.nix`, `callPackage ./y.nix {}`, and flake + # `inputs.*.url` strings become IMPORTS_FROM edges — + # handled in _extract_nix_constructs. + "nix": [], # Zig: @import("...") is a builtin_call_expr — handled # generically via call types below. "zig": [], @@ -297,6 +308,9 @@ class EdgeInfo: # _extract_elixir_constructs which filters out def/defmodule/alias/etc. # before treating what's left as a real call. "elixir": [], + # Nix: function application is ubiquitous; only import/callPackage + # produce edges, in _extract_nix_constructs. 
+ "nix": [], "zig": ["call_expression", "builtin_call_expr"], "powershell": ["command_expression"], "julia": ["call_expression"], @@ -1797,6 +1811,20 @@ def _extract_from_tree( ): continue + # --- Nix-specific constructs --- + # Nix bindings (``attrpath = expr;``) are the graph's addressable + # things; dispatch via _extract_nix_constructs to flatten dotted + # attrpaths into Function nodes and to emit IMPORTS_FROM edges for + # flake ``inputs.*.url`` strings and ``import``/``callPackage`` + # applications. See: #366 follow-up (flake-aware Nix support). + if language == "nix" and node_type == "binding": + if self._extract_nix_constructs( + child, source, language, file_path, nodes, edges, + enclosing_class, enclosing_func, + import_map, defined_names, _depth, + ): + continue + # --- Dart call detection (see #87) --- # tree-sitter-dart does not wrap calls in a single # ``call_expression`` node; instead the pattern is @@ -2127,6 +2155,292 @@ def _extract_elixir_constructs( ) return True + @staticmethod + def _is_nix_flake_file(file_path: str) -> bool: + """Return True for files whose basename is ``flake.nix``.""" + return Path(file_path).name == "flake.nix" + + def _nix_attrpath_parts(self, attrpath_node) -> list[str]: + """Flatten a Nix ``attrpath`` node into a list of identifier parts. + + ``packages.default`` → ``["packages", "default"]``; + ``inputs.nixpkgs.url`` → ``["inputs", "nixpkgs", "url"]``. Dotted + attrpaths have ``identifier`` children separated by ``.`` tokens. + """ + parts: list[str] = [] + for child in attrpath_node.children: + if child.type == "identifier": + parts.append(child.text.decode("utf-8", errors="replace")) + return parts + + def _extract_nix_flake_input_urls( + self, attrset_node, + ) -> list[tuple[str, int]]: + """Walk a Nix ``attrset_expression`` looking for ``*.url = "..."`` + bindings whose RHS is a literal string. Returns ``(url, line)`` + tuples. 
Used when the enclosing attrpath is ``inputs`` so that both + the nested form + + inputs = { nixpkgs.url = "..."; flake-utils.url = "..."; }; + + and the mixed form (an inner input with its own nested attrset) + surface the URL strings as IMPORTS_FROM targets. + """ + results: list[tuple[str, int]] = [] + + def visit(n) -> None: + if n is None: + return + if n.type == "binding": + inner_path = None + inner_rhs = None + for sub in n.children: + if sub.type == "attrpath": + inner_path = sub + elif sub.type not in ("=", ";") and inner_path is not None: + if inner_rhs is None: + inner_rhs = sub + if inner_path is not None and inner_rhs is not None: + parts = self._nix_attrpath_parts(inner_path) + if ( + parts + and parts[-1] == "url" + and inner_rhs.type == "string_expression" + ): + for c in inner_rhs.children: + if c.type == "string_fragment": + url = c.text.decode("utf-8", errors="replace") + results.append((url, n.start_point[0] + 1)) + break + return # leaf binding — no children to recurse into + # Non-url binding: still recurse so a deeper url survives + if inner_rhs.type == "attrset_expression": + visit(inner_rhs) + return + for c in n.children: + visit(c) + + visit(attrset_node) + return results + + def _extract_nix_import_targets(self, rhs_node) -> list[tuple[str, int]]: + """Walk an expression looking for ``import <path>`` and + ``callPackage <path>`` applications. Returns a list of + ``(target_path, line)`` tuples for each match. + + Recurses through ``apply_expression`` (so ``import ./x.nix { ... }`` + and ``pkgs.callPackage ./y.nix { }`` are both caught) and descends + into bodies of ``let_expression`` / ``parenthesized_expression`` / + ``function_expression`` / ``attrset_expression`` / ``list_expression`` + so a ``let pkgs = import nixpkgs; in { ... }`` body is scanned too. + """ + results: list[tuple[str, int]] = [] + + def head_call_name(apply) -> Optional[str]: + """Drill down the left-most side of nested apply_expressions to + the callee identifier. 
``import ./x`` → ``"import"``; + ``pkgs.callPackage ./y { }`` → ``"callPackage"`` (last dotted + segment of the select_expression).""" + cur = apply + while cur is not None and cur.type == "apply_expression": + cur = cur.children[0] if cur.children else None + if cur is None: + return None + if cur.type == "variable_expression": + for c in cur.children: + if c.type == "identifier": + return c.text.decode("utf-8", errors="replace") + if cur.type == "select_expression": + # Last identifier in the attrpath portion. + last: Optional[str] = None + for c in cur.children: + if c.type == "attrpath": + for ac in c.children: + if ac.type == "identifier": + last = ac.text.decode("utf-8", errors="replace") + elif c.type == "identifier": + last = c.text.decode("utf-8", errors="replace") + return last + if cur.type == "identifier": + return cur.text.decode("utf-8", errors="replace") + return None + + def first_path_arg(apply) -> Optional[str]: + """For nested apply_expressions like ``import ./x.nix { }``, walk + down collecting arguments; return the first ``path_expression`` + we find.""" + # Descend left spine collecting right-hand args in outer→inner order + stack: list = [] + cur = apply + while cur is not None and cur.type == "apply_expression": + if len(cur.children) >= 2: + stack.append(cur.children[1]) + cur = cur.children[0] if cur.children else None + # Args closest to the callee come last in stack; try them in + # that order (innermost first) so ``import ./x { }`` picks + # ``./x`` not ``{ }``. + for arg in reversed(stack): + if arg.type == "path_expression": + return arg.text.decode("utf-8", errors="replace").strip() + return None + + def visit(n) -> None: + if n is None: + return + if n.type == "apply_expression": + name = head_call_name(n) + if name in ("import", "callPackage"): + path = first_path_arg(n) + if path: + results.append((path, n.start_point[0] + 1)) + # Still recurse into children so nested imports inside + # argument attrsets/lets are caught. 
+ for c in n.children: + visit(c) + + visit(rhs_node) + return results + + def _extract_nix_constructs( + self, + node, + source: bytes, + language: str, + file_path: str, + nodes: list[NodeInfo], + edges: list[EdgeInfo], + enclosing_class: Optional[str], + enclosing_func: Optional[str], + import_map: Optional[dict[str, str]], + defined_names: Optional[set[str]], + _depth: int, + ) -> bool: + """Handle a Nix ``binding`` node (``attrpath = expr;``). + + - Flattens dotted attrpaths into a single dotted node name + (``packages.default``). + - In ``flake.nix``, ``inputs.<name>.url = "..."`` bindings emit an + ``IMPORTS_FROM`` edge with target = the URL string, and no node. + - All other bindings become ``Function`` nodes (matching the + Bash/Elixir convention for "the graph's addressable things") with a + CONTAINS edge from the File. + - The RHS is scanned for ``import <path>`` / ``callPackage <path> ...`` + applications; each emits an ``IMPORTS_FROM`` edge (relative paths + are resolved against the caller's directory when possible). + - Recurses into the RHS so nested bindings (e.g. inside + ``let ... in { ... }`` or ``outputs = { ... }: { ... }``) are + discovered and flattened as their own top-level nodes. + + Returns True (Nix has no other node-type dispatches in the walker). + """ + attrpath_node = None + rhs_node = None + for sub in node.children: + if sub.type == "attrpath": + attrpath_node = sub + elif sub.type not in ("=", ";") and attrpath_node is not None: + # First non-attrpath, non-punctuation child is the RHS. + if rhs_node is None: + rhs_node = sub + if attrpath_node is None or rhs_node is None: + return False + + parts = self._nix_attrpath_parts(attrpath_node) + if not parts: + return False + name = ".".join(parts) + line = node.start_point[0] + 1 + + # --- Flake input URL: inputs.<name>.url = "..." ------------------ + # Flat form: ``inputs.nixpkgs.url = "github:...";`` — emit one edge, + # skip node creation (this is metadata, not a graph "thing"). 
+ if ( + self._is_nix_flake_file(file_path) + and len(parts) >= 2 + and parts[0] == "inputs" + and parts[-1] == "url" + and rhs_node.type == "string_expression" + ): + url: Optional[str] = None + for c in rhs_node.children: + if c.type == "string_fragment": + url = c.text.decode("utf-8", errors="replace") + break + if url: + edges.append(EdgeInfo( + kind="IMPORTS_FROM", + source=file_path, + target=url, + file_path=file_path, + line=line, + )) + return True + + # Nested form: ``inputs = { nixpkgs.url = "..."; ... };`` — emit an + # edge per inner url string. Still fall through so the ``inputs`` + # binding itself becomes a Function node and the default recursion + # continues (the recursion won't re-emit these urls as separate + # Function nodes because the flat form above short-circuits). + if ( + self._is_nix_flake_file(file_path) + and parts == ["inputs"] + and rhs_node.type == "attrset_expression" + ): + for url, uline in self._extract_nix_flake_input_urls(rhs_node): + edges.append(EdgeInfo( + kind="IMPORTS_FROM", + source=file_path, + target=url, + file_path=file_path, + line=uline, + )) + + # --- Regular binding → Function node ----------------------------- + qualified = self._qualify(name, file_path, enclosing_class) + nodes.append(NodeInfo( + kind="Function", + name=name, + file_path=file_path, + line_start=line, + line_end=node.end_point[0] + 1, + language=language, + parent_name=enclosing_class, + )) + container = ( + self._qualify(enclosing_class, file_path, None) + if enclosing_class else file_path + ) + edges.append(EdgeInfo( + kind="CONTAINS", + source=container, + target=qualified, + file_path=file_path, + line=line, + )) + + # --- IMPORTS_FROM edges for import / callPackage inside the RHS -- + for target, tline in self._extract_nix_import_targets(rhs_node): + resolved = self._resolve_module_to_file(target, file_path, "nix") + edges.append(EdgeInfo( + kind="IMPORTS_FROM", + source=file_path, + target=resolved if resolved else target, + 
file_path=file_path, + line=tline, + )) + + # Recurse into the RHS so nested bindings become their own nodes + # (e.g. ``outputs = ...: { packages.default = ...; }`` surfaces + # ``packages.default`` as a top-level-named Function node too). + self._extract_from_tree( + rhs_node, source, language, file_path, nodes, edges, + enclosing_class=enclosing_class, + enclosing_func=enclosing_func, + import_map=import_map, defined_names=defined_names, + _depth=_depth + 1, + ) + return True + def _extract_bash_source_command( self, node, @@ -3683,6 +3997,18 @@ def _do_resolve_module( pass return None + if language == "nix": + # ``import ./x.nix`` / ``callPackage ./x.nix { }`` — relative to + # the caller's directory. Non-relative targets (URLs, bare + # identifiers like ``nixpkgs``) are left unresolved. + try: + target = (caller_dir / module).resolve() + if target.is_file(): + return str(target) + except (OSError, ValueError): + pass + return None + if language == "python": rel_path = module.replace(".", "/") candidates = [rel_path + ".py", rel_path + "/__init__.py"] diff --git a/diagrams/generate_diagrams.py b/diagrams/generate_diagrams.py index 45eca114..347dc7fc 100644 --- a/diagrams/generate_diagrams.py +++ b/diagrams/generate_diagrams.py @@ -641,7 +641,7 @@ def d8(): # ════════════════════════════════════════════ def d9(): els = [] - els.append(TC(550, 15, "19 Languages + Notebook Support", 34)) + els.append(TC(550, 15, "20 Languages + Notebook Support", 34)) # Group languages by ecosystem groups = [ @@ -650,7 +650,8 @@ def d9(): ("Systems", ["C", "C++", "C#"], ORG, ORG_BG), ("Mobile", ["Kotlin", "Swift", "Dart"], PRP, PRP_BG), ("Scripting", ["Ruby", "PHP", "Perl", "Lua", "R"], YLW, YLW_BG), - ("Other", ["Solidity", "Jupyter/.ipynb"], GRY, GRY_BG), + ("Config", ["Nix"], GRY, GRY_BG), + ("Other", ["Solidity", "Jupyter/.ipynb"], GRY, GRY_BG), ] gw = 155 # group width diff --git a/tests/fixtures/sample.nix b/tests/fixtures/sample.nix new file mode 100644 index 
00000000..e909110d --- /dev/null +++ b/tests/fixtures/sample.nix @@ -0,0 +1,17 @@ +{ + description = "Sample flake fixture for code-review-graph tests"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils, ... }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { inherit system; }; + in { + packages.default = pkgs.callPackage ./default.nix { }; + devShells.default = import ./shell.nix { inherit pkgs; }; + }); +} diff --git a/tests/fixtures/sample_module.nix b/tests/fixtures/sample_module.nix new file mode 100644 index 00000000..99db0a77 --- /dev/null +++ b/tests/fixtures/sample_module.nix @@ -0,0 +1,12 @@ +{ lib, pkgs, ... }: + +let + helper = import ./foo.nix { inherit lib; }; +in { + environment.systemPackages = [ pkgs.hello ]; + + services.myservice = { + enable = true; + greeting = helper.greeting; + }; +} diff --git a/tests/test_multilang.py b/tests/test_multilang.py index 9d45f434..07d6b375 100644 --- a/tests/test_multilang.py +++ b/tests/test_multilang.py @@ -1916,3 +1916,82 @@ def test_resolver_is_idempotent(self, tmp_path): # Second run should find nothing new — all already resolved. assert second["calls_resolved"] == 0 assert second["imports_resolved"] == 0 + + +class TestNixParsing: + """Flake-aware Nix parser — see the Nix language-support epic.""" + + def setup_method(self): + self.parser = CodeParser() + # Parse the flake-shaped fixture as if its basename were ``flake.nix`` + # so the ``inputs.*.url`` branch of _extract_nix_constructs fires. + flake_bytes = (FIXTURES / "sample.nix").read_bytes() + self.flake_path = FIXTURES / "flake.nix" + self.flake_nodes, self.flake_edges = self.parser.parse_bytes( + self.flake_path, flake_bytes, + ) + # The non-flake fixture retains its actual path; it's used to verify + # the flake-input branch does *not* fire on non-flake files. 
+ module_path = FIXTURES / "sample_module.nix" + self.module_nodes, self.module_edges = self.parser.parse_file(module_path) + + def test_detects_language(self): + assert self.parser.detect_language(Path("flake.nix")) == "nix" + assert self.parser.detect_language(Path("modules/foo.nix")) == "nix" + + def test_nodes_have_nix_language(self): + for n in self.flake_nodes: + assert n.language == "nix" + for n in self.module_nodes: + assert n.language == "nix" + + def test_top_level_bindings_become_functions(self): + funcs = {n.name for n in self.flake_nodes if n.kind == "Function"} + # Top-level bindings from sample.nix (flake-shaped). + assert "description" in funcs + assert "inputs" in funcs + assert "outputs" in funcs + # Nested bindings flattened to dotted names. + assert "packages.default" in funcs + assert "devShells.default" in funcs + + def test_flake_inputs_produce_import_edges(self): + targets = { + e.target for e in self.flake_edges if e.kind == "IMPORTS_FROM" + } + assert "github:NixOS/nixpkgs/nixos-unstable" in targets + assert "github:numtide/flake-utils" in targets + + def test_import_and_callpackage_produce_import_edges(self): + targets = { + e.target for e in self.flake_edges if e.kind == "IMPORTS_FROM" + } + # callPackage ./default.nix and import ./shell.nix. Relative paths + # are resolved against the caller's directory when possible; since + # neither file exists alongside the fixture, the raw relative + # path is preserved. + assert "./default.nix" in targets + assert "./shell.nix" in targets + + def test_non_flake_file_has_no_input_edges(self): + # ``sample_module.nix`` is not named ``flake.nix``, so the + # inputs.*.url branch must not fire — no github:-prefixed targets. + targets = [ + e.target for e in self.module_edges if e.kind == "IMPORTS_FROM" + ] + assert not any(t.startswith("github:") for t in targets) + # The import ./foo.nix inside the `let` body still produces an edge. 
+ assert any("foo.nix" in t for t in targets) + + def test_contains_edges_wire_file_to_top_level_bindings(self): + file_path = str(self.flake_path) + contains_targets = { + e.target for e in self.flake_edges + if e.kind == "CONTAINS" and e.source == file_path + } + # Each top-level binding should be CONTAINS-linked from the file. + for name in ("description", "inputs", "outputs"): + qualified = f"{file_path}::{name}" + assert qualified in contains_targets, ( + f"missing CONTAINS edge for {qualified}" + ) diff --git a/uv.lock b/uv.lock index 62a32add..c40535b3 100644 --- a/uv.lock +++ b/uv.lock @@ -411,6 +411,7 @@ requires-dist = [ { name = "pyyaml", marker = "extra == 'eval'", specifier = ">=6.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.3.0,<1" }, { name = "sentence-transformers", marker = "extra == 'embeddings'", specifier = ">=3.0.0,<4" }, + { name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.0.0,<3" }, { name = "tomli", marker = "python_full_version < '3.11' and extra == 'dev'", specifier = ">=2.0" }, { name = "tree-sitter", specifier = ">=0.23.0,<1" }, { name = "tree-sitter-language-pack", specifier = ">=0.3.0,<1" },