diff --git a/code_review_graph/cli.py b/code_review_graph/cli.py
index 18e9d527..aa7892b3 100644
--- a/code_review_graph/cli.py
+++ b/code_review_graph/cli.py
@@ -312,6 +312,40 @@ def _cli_post_process(store: GraphStore) -> None:
         print(f"Communities: {pp['communities_detected']}")


+def _add_data_dir_argument(parser: argparse.ArgumentParser) -> None:
+    """Add the ``--data-dir`` option shared by the graph sub-commands."""
+    parser.add_argument(
+        "--data-dir",
+        default=None,
+        help="External directory to store graph database (useful for network shares)",
+    )
+
+
+def _handle_data_dir_option(args, repo_root: Path) -> None:
+    """Record a ``--data-dir`` override for *repo_root* in the registry.
+
+    Does nothing when the sub-command has no ``--data-dir`` option or the
+    option was not given. Otherwise the directory is created if needed and
+    stored in the registry entry for *repo_root*. Exits with status 1 when
+    the directory cannot be created or the registry cannot be updated.
+    """
+    data_dir = getattr(args, "data_dir", None)
+    if not data_dir:
+        return
+
+    try:
+        from .registry import Registry
+        data_dir_path = Path(data_dir).expanduser().resolve()
+        data_dir_path.mkdir(parents=True, exist_ok=True)
+        Registry().set_data_dir(str(repo_root), str(data_dir_path))
+    except Exception as exc:
+        # CLI boundary: report and abort instead of silently writing the
+        # graph somewhere the user did not ask for.
+        logging.error("Failed to set data directory: %s", exc)
+        sys.exit(1)
+    logging.info("Graph database will be stored at: %s", data_dir_path)
+
+
 def main() -> None:
     """Main CLI entry point."""
     ap = argparse.ArgumentParser(
@@ -414,6 +448,7 @@ def main() -> None:
         action="store_true",
         help="Skip all post-processing (raw parse only)",
     )
+    _add_data_dir_argument(build_cmd)

     # update
     update_cmd = sub.add_parser("update", help="Incremental update (only changed files)")
@@ -429,6 +464,7 @@ def main() -> None:
         action="store_true",
         help="Skip all post-processing (raw parse only)",
     )
+    _add_data_dir_argument(update_cmd)

     # postprocess
     pp_cmd = sub.add_parser(
@@ -439,14 +475,17 @@ def main() -> None:
     pp_cmd.add_argument("--no-flows", action="store_true", help="Skip flow detection")
     pp_cmd.add_argument("--no-communities", action="store_true", help="Skip community detection")
     pp_cmd.add_argument("--no-fts", action="store_true", help="Skip FTS rebuild")
+    _add_data_dir_argument(pp_cmd)

     # watch
     watch_cmd = sub.add_parser("watch", help="Watch for changes and auto-update")
     watch_cmd.add_argument("--repo", default=None, help="Repository root (auto-detected)")
+    _add_data_dir_argument(watch_cmd)

     # status
     status_cmd = sub.add_parser("status", help="Show graph statistics")
     status_cmd.add_argument("--repo", default=None, help="Repository root (auto-detected)")
+    _add_data_dir_argument(status_cmd)

     # visualize
     vis_cmd = sub.add_parser("visualize", help="Generate interactive HTML graph visualization")
@@ -468,6 +507,7 @@ def main() -> None:
         default="html",
         help="Export format (default: html)",
     )
+    _add_data_dir_argument(vis_cmd)

     # wiki
     wiki_cmd = sub.add_parser("wiki", help="Generate markdown wiki from community structure")
@@ -477,6 +517,7 @@ def main() -> None:
         action="store_true",
         help="Regenerate all pages even if content unchanged",
     )
+    _add_data_dir_argument(wiki_cmd)

     # register
     register_cmd = sub.add_parser(
@@ -766,6 +807,7 @@ def main() -> None:

     if args.command == "postprocess":
         repo_root = Path(args.repo) if args.repo else find_project_root()
+        _handle_data_dir_option(args, repo_root)
         db_path = get_db_path(repo_root)
         store = GraphStore(db_path)
         try:
@@ -802,6 +844,9 @@ def main() -> None:
     else:
         repo_root = Path(args.repo) if args.repo else find_project_root()

+    # Safe for every sub-command: a no-op when --data-dir is absent.
+    _handle_data_dir_option(args, repo_root)
+
     db_path = get_db_path(repo_root)
     store = GraphStore(db_path)

diff --git a/code_review_graph/incremental.py b/code_review_graph/incremental.py
index a68b2e11..68e29063 100644
--- a/code_review_graph/incremental.py
+++ b/code_review_graph/incremental.py
@@ -174,9 +174,38 @@ def find_project_root(
     return start or Path.cwd()


+def _write_data_dir_gitignore(data_dir: Path) -> None:
+    """Write .gitignore file in data directory if it doesn't exist.
+
+    The gitignore contains a single '*' to prevent accidental commits.
+    """
+    inner_gitignore = data_dir / ".gitignore"
+    if not inner_gitignore.exists():
+        try:
+            # `encoding="utf-8"` is REQUIRED — the em-dash in the header is
+            # U+2014 which falls outside cp1252. On Windows, calling
+            # write_text without an encoding silently uses the system default
+            # codepage, producing a file that subsequently fails to decode as
+            # UTF-8 (see issue #239).
+            inner_gitignore.write_text(
+                "# Auto-generated by code-review-graph — do not commit database files.\n"
+                "# The graph.db contains absolute paths and code structure metadata.\n"
+                "*\n",
+                encoding="utf-8",
+            )
+        except OSError:
+            # Data dir might be read-only (rare); that's OK, it's a best-effort guard.
+            pass
+
+
 def get_data_dir(repo_root: Path) -> Path:
     """Return the directory where this project's graph data lives.

+    Resolution priority:
+    1. Registry entry for this repo (set via ``--data-dir``)
+    2. ``CRG_DATA_DIR`` environment variable (global override)
+    3. Default: ``<repo>/.code-review-graph/``
+
     By default, ``<repo>/.code-review-graph``. If the ``CRG_DATA_DIR``
     environment variable is set, it is used verbatim instead — letting you
     keep graphs outside the working tree (useful
@@ -186,6 +215,32 @@ def get_data_dir(repo_root: Path) -> Path:
     ``.gitignore`` (with ``*``) is written so any accidentally-nested
     files never get committed. Both are idempotent.
     """
+    # 1. Per-repo registry entry (set via --data-dir).
+    registry_data_dir = None
+    try:
+        from .registry import Registry
+        registry_data_dir = Registry().get_data_dir_for_repo(str(repo_root))
+    except Exception as exc:
+        # A missing or unreadable registry must not break graph access.
+        logger.debug("Registry lookup failed for %s: %s", repo_root, exc)
+
+    if registry_data_dir:
+        try:
+            data_dir = Path(registry_data_dir).expanduser().resolve()
+            data_dir.mkdir(parents=True, exist_ok=True)
+        except (OSError, RuntimeError, TypeError, ValueError) as exc:
+            # Warn loudly: falling back means the graph ends up somewhere
+            # other than the directory the user registered.
+            logger.warning(
+                "Registered data dir %s is unusable (%s); falling back",
+                registry_data_dir,
+                exc,
+            )
+        else:
+            _write_data_dir_gitignore(data_dir)
+            return data_dir
+
+    # 2. CRG_DATA_DIR environment variable, else 3. the in-repo default.
     env_override = os.environ.get("CRG_DATA_DIR", "").strip()
     if env_override:
         data_dir = Path(env_override).expanduser().resolve()
@@ -193,24 +248,7 @@ def get_data_dir(repo_root: Path) -> Path:
         data_dir = repo_root / ".code-review-graph"

     data_dir.mkdir(parents=True, exist_ok=True)
-
-    inner_gitignore = data_dir / ".gitignore"
-    if not inner_gitignore.exists():
-        try:
-            # `encoding="utf-8"` is REQUIRED — the em-dash in the header is
-            # U+2014 which falls outside cp1252. On Windows, calling
-            # write_text without an encoding silently uses the system default
-            # codepage, producing a file that subsequently fails to decode as
-            # UTF-8 (see issue #239).
-            inner_gitignore.write_text(
-                "# Auto-generated by code-review-graph — do not commit database files.\n"
-                "# The graph.db contains absolute paths and code structure metadata.\n"
-                "*\n",
-                encoding="utf-8",
-            )
-        except OSError:
-            # Data dir might be read-only (rare); that's OK, it's a best-effort guard.
-            pass
+    _write_data_dir_gitignore(data_dir)

     return data_dir

diff --git a/code_review_graph/registry.py b/code_review_graph/registry.py
index eae397fa..40e6bc0d 100644
--- a/code_review_graph/registry.py
+++ b/code_review_graph/registry.py
@@ -54,7 +54,12 @@ def _save(self) -> None:
             json.dumps(data, indent=2) + "\n", encoding="utf-8"
         )

-    def register(self, path: str, alias: str | None = None) -> dict[str, str]:
+    def register(
+        self,
+        path: str,
+        alias: str | None = None,
+        data_dir: str | None = None,
+    ) -> dict[str, str]:
         """Register a repository path.

         Validates that the path contains a ``.git`` or ``.code-review-graph``
@@ -63,6 +68,7 @@ def register(self, path: str, alias: str | None = None) -> dict[str, str]:
         Args:
             path: Absolute or relative path to the repository root.
             alias: Optional short alias for the repository.
+            data_dir: Optional external directory for graph database.

         Returns:
             The registered entry dict.
@@ -84,15 +90,21 @@ def register(self, path: str, alias: str | None = None) -> dict[str, str]:
         str_path = str(resolved)
         for entry in self._repos:
             if entry["path"] == str_path:
-                # Update alias if provided
+                # Update alias and/or data_dir if provided; skip the disk
+                # write when the caller supplied neither.
                 if alias:
                     entry["alias"] = alias
-                    self._save()
+                if data_dir:
+                    entry["data_dir"] = str(Path(data_dir).expanduser().resolve())
+                if alias or data_dir:
+                    self._save()
                 return entry

         new_entry: dict[str, str] = {"path": str_path}
         if alias:
             new_entry["alias"] = alias
+        if data_dir:
+            new_entry["data_dir"] = str(Path(data_dir).expanduser().resolve())
         self._repos.append(new_entry)
         self._save()
         return new_entry
@@ -159,6 +171,51 @@ def find_by_path(self, path: str) -> dict[str, str] | None:
                 return dict(entry)
         return None

+    def set_data_dir(self, path: str, data_dir: str) -> dict[str, str]:
+        """Set the external data directory for a repository.
+
+        Unlike :meth:`register`, this performs no validation of *path*; an
+        entry is created for it when none exists yet.
+
+        Args:
+            path: Repository path (absolute or relative).
+            data_dir: External directory path to store graph database.
+                A leading ``~`` is expanded.
+
+        Returns:
+            A copy of the updated or created registry entry.
+        """
+        resolved = str(Path(path).resolve())
+        data_resolved = str(Path(data_dir).expanduser().resolve())
+
+        with self._lock:
+            entry = next(
+                (e for e in self._repos if e["path"] == resolved), None
+            )
+            if entry is None:
+                entry = {"path": resolved}
+                self._repos.append(entry)
+            entry["data_dir"] = data_resolved
+            self._save()
+            # Hand out a copy so callers cannot mutate registry state.
+            return dict(entry)
+
+    def get_data_dir_for_repo(self, path: str) -> str | None:
+        """Get the stored data directory for a repository.
+
+        Args:
+            path: Repository path (absolute or relative).
+
+        Returns:
+            The stored data_dir path, or None if not set.
+        """
+        resolved = str(Path(path).resolve())
+        with self._lock:
+            for entry in self._repos:
+                if entry["path"] == resolved:
+                    return entry.get("data_dir")
+        return None
+

 class ConnectionPool:
     """LRU connection pool for SQLite graph databases.
diff --git a/code_review_graph/tools/registry_tools.py b/code_review_graph/tools/registry_tools.py
index 5e2dd212..698cf23e 100644
--- a/code_review_graph/tools/registry_tools.py
+++ b/code_review_graph/tools/registry_tools.py
@@ -7,6 +7,7 @@
 from typing import Any

 from ..graph import GraphStore
+from ..incremental import get_db_path
 from ..search import hybrid_search

 logger = logging.getLogger(__name__)
@@ -83,7 +84,7 @@ def cross_repo_search_func(

     for repo_entry in repos:
         repo_path = Path(repo_entry["path"])
-        db_path = repo_path / ".code-review-graph" / "graph.db"
+        db_path = get_db_path(repo_path)
         if not db_path.exists():
             continue

diff --git a/tests/test_incremental.py b/tests/test_incremental.py
index c78a021d..f550dd41 100644
--- a/tests/test_incremental.py
+++ b/tests/test_incremental.py
@@ -337,6 +337,109 @@ def test_find_project_root_env_override_missing_dir_falls_through(
         assert result != tmp_path / "does-not-exist-123"


+class TestDataDirRegistry:
+    """Tests for registry-based data_dir resolution."""
+
+    @staticmethod
+    def _isolated_registry(tmp_path, monkeypatch):
+        """Return a Registry stored under *tmp_path*.
+
+        ``get_data_dir`` calls ``Registry()`` without arguments, so the
+        class is patched to default to the same throwaway file. This keeps
+        the tests from reading or writing the default registry file.
+        """
+        import functools
+
+        from code_review_graph import registry as registry_module
+
+        factory = functools.partial(
+            registry_module.Registry, path=tmp_path / "registry.json"
+        )
+        monkeypatch.setattr(registry_module, "Registry", factory)
+        return factory()
+
+    def test_registry_data_dir_overrides_default(self, tmp_path, monkeypatch):
+        """Registry data_dir should override default .code-review-graph."""
+        from code_review_graph.incremental import get_data_dir
+
+        repo = tmp_path / "project"
+        repo.mkdir()
+        external = tmp_path / "external"
+
+        monkeypatch.delenv("CRG_DATA_DIR", raising=False)
+        registry = self._isolated_registry(tmp_path, monkeypatch)
+        registry.set_data_dir(str(repo), str(external))
+
+        result = get_data_dir(repo)
+        assert result == external.resolve()
+        assert result.is_dir()
+        assert not (repo / ".code-review-graph").exists()
+
+    def test_registry_data_dir_overrides_env_var(self, tmp_path, monkeypatch):
+        """Registry data_dir should override CRG_DATA_DIR."""
+        from code_review_graph.incremental import get_data_dir
+
+        repo = tmp_path / "project"
+        repo.mkdir()
+        registry_dir = tmp_path / "registry-data"
+        env_dir = tmp_path / "env-data"
+
+        monkeypatch.setenv("CRG_DATA_DIR", str(env_dir))
+        registry = self._isolated_registry(tmp_path, monkeypatch)
+        registry.set_data_dir(str(repo), str(registry_dir))
+
+        result = get_data_dir(repo)
+        # Registry should win over env var
+        assert result == registry_dir.resolve()
+        assert not env_dir.exists()
+
+    def test_registry_fallback_to_env_var(self, tmp_path, monkeypatch):
+        """Fall back to CRG_DATA_DIR when registry has no entry."""
+        from code_review_graph.incremental import get_data_dir
+
+        repo = tmp_path / "project"
+        repo.mkdir()
+        env_dir = tmp_path / "env-data"
+
+        monkeypatch.setenv("CRG_DATA_DIR", str(env_dir))
+        self._isolated_registry(tmp_path, monkeypatch)  # no entry for repo
+
+        result = get_data_dir(repo)
+        assert result == env_dir.resolve()
+        assert result.is_dir()
+
+    def test_registry_fallback_to_default(self, tmp_path, monkeypatch):
+        """Fall back to default when neither registry nor env var is set."""
+        from code_review_graph.incremental import get_data_dir
+
+        repo = tmp_path / "project"
+        repo.mkdir()
+
+        monkeypatch.delenv("CRG_DATA_DIR", raising=False)
+        self._isolated_registry(tmp_path, monkeypatch)  # no entry for repo
+
+        result = get_data_dir(repo)
+        assert result == repo / ".code-review-graph"
+        assert result.is_dir()
+
+    def test_data_dir_auto_creates_directory(self, tmp_path, monkeypatch):
+        """get_data_dir should auto-create the data directory."""
+        from code_review_graph.incremental import get_data_dir
+
+        repo = tmp_path / "project"
+        repo.mkdir()
+        data_dir = tmp_path / "nonexistent" / "nested" / "path"
+
+        monkeypatch.delenv("CRG_DATA_DIR", raising=False)
+        registry = self._isolated_registry(tmp_path, monkeypatch)
+        registry.set_data_dir(str(repo), str(data_dir))
+
+        result = get_data_dir(repo)
+        assert result.exists()
+        assert result.is_dir()
+        assert result == data_dir.resolve()
+
+
 class TestIsBinary:
     def test_text_file_is_not_binary(self, tmp_path):
         f = tmp_path / "text.py"
diff --git a/tests/test_registry.py b/tests/test_registry.py
index 69f676c4..1589aa3c 100644
--- a/tests/test_registry.py
+++ b/tests/test_registry.py
@@ -238,3 +238,111 @@ def test_cross_repo_search_no_repos(self):
             import shutil

             shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+class TestSetDataDir:
+    """Tests for set_data_dir and get_data_dir_for_repo methods."""
+
+    def setup_method(self):
+        """Set up isolated test registry."""
+        self.tmp_dir = tempfile.mkdtemp()
+        self.registry_path = Path(self.tmp_dir) / "registry.json"
+        self.registry = Registry(path=self.registry_path)
+
+    def teardown_method(self):
+        """Clean up temporary directory."""
+        import shutil
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+    def test_set_data_dir_new_repo(self):
+        """set_data_dir should create new registry entry if repo not registered."""
+        repo = Path(self.tmp_dir) / "project"
+        repo.mkdir()
+        data_dir = Path(self.tmp_dir) / "data"
+
+        entry = self.registry.set_data_dir(str(repo), str(data_dir))
+
+        assert entry["path"] == str(repo.resolve())
+        assert entry["data_dir"] == str(data_dir.resolve())
+
+        # Verify it can be retrieved
+        retrieved = self.registry.get_data_dir_for_repo(str(repo))
+        assert retrieved == str(data_dir.resolve())
+
+        # Verify entry is in list
+        repos = self.registry.list_repos()
+        assert len(repos) == 1
+        assert repos[0]["path"] == str(repo.resolve())
+
+    def test_set_data_dir_existing_repo(self):
+        """set_data_dir should update data_dir for already registered repo."""
+        repo = Path(self.tmp_dir) / "project"
+        repo.mkdir()
+        data_dir1 = Path(self.tmp_dir) / "data1"
+        data_dir2 = Path(self.tmp_dir) / "data2"
+
+        # Initial registration
+        entry1 = self.registry.set_data_dir(str(repo), str(data_dir1))
+        assert entry1["data_dir"] == str(data_dir1.resolve())
+
+        # Update with new data_dir
+        entry2 = self.registry.set_data_dir(str(repo), str(data_dir2))
+        assert entry2["data_dir"] == str(data_dir2.resolve())
+
+        # Verify only one entry exists
+        repos = self.registry.list_repos()
+        assert len(repos) == 1
+
+    def test_get_data_dir_for_repo_unknown(self):
+        """get_data_dir_for_repo should return None for unknown repo."""
+        unknown_repo = Path(self.tmp_dir) / "unknown"
+
+        result = self.registry.get_data_dir_for_repo(str(unknown_repo))
+        assert result is None
+
+    def test_set_data_dir_with_alias(self):
+        """register() with data_dir should store both."""
+        repo = Path(self.tmp_dir) / "project"
+        repo.mkdir()
+        (repo / ".git").mkdir()
+        data_dir = Path(self.tmp_dir) / "data"
+        alias = "my-project"
+
+        entry = self.registry.register(str(repo), alias=alias, data_dir=str(data_dir))
+
+        assert entry["path"] == str(repo.resolve())
+        assert entry["alias"] == alias
+        assert entry["data_dir"] == str(data_dir.resolve())
+
+    def test_backward_compatibility(self):
+        """Old registry entries without data_dir should work."""
+        repo = Path(self.tmp_dir) / "project"
+        repo.mkdir()
+
+        # Create entry without data_dir (old format)
+        self.registry._repos.append({
+            "path": str(repo.resolve()),
+            "alias": "old-project"
+        })
+        self.registry._save()
+
+        # Should not crash
+        result = self.registry.get_data_dir_for_repo(str(repo))
+        assert result is None
+
+        # Should be able to add data_dir
+        data_dir = Path(self.tmp_dir) / "data"
+        entry = self.registry.set_data_dir(str(repo), str(data_dir))
+        assert entry["data_dir"] == str(data_dir.resolve())
+
+    def test_set_data_dir_returns_copy(self):
+        """Mutating the returned entry must not alter the registry."""
+        repo = Path(self.tmp_dir) / "project"
+        repo.mkdir()
+        data_dir = Path(self.tmp_dir) / "data"
+
+        entry = self.registry.set_data_dir(str(repo), str(data_dir))
+        entry["data_dir"] = "tampered"
+
+        stored = self.registry.get_data_dir_for_repo(str(repo))
+        assert stored == str(data_dir.resolve())