"""CLI front-end for ``dvc purge``.

The command is registered by listing ``purge`` among the command modules in
``dvc/cli/parser.py``; the actual work is done by ``Repo.purge``, which is
bound on ``Repo`` via ``from dvc.repo.purge import purge`` in
``dvc/repo/__init__.py``.
"""

import os

from dvc.cli import formatter
from dvc.cli.command import CmdBase
from dvc.cli.utils import append_doc_link
from dvc.log import logger
from dvc.ui import ui

logger = logger.getChild(__name__)


class CmdPurge(CmdBase):
    """Warn, confirm with the user, then delegate to ``Repo.purge``."""

    def run(self):
        """Describe what will be purged, ask for confirmation and purge.

        Returns:
            ``0`` once ``Repo.purge`` has been called, ``1`` if the user
            declined the confirmation prompt.  Exceptions raised by
            ``Repo.purge`` are not caught here.
        """
        args = self.args

        msg = "This will permanently remove local DVC-tracked outputs "
        if args.targets:
            msg += "for the following targets:\n - " + "\n - ".join(
                os.path.abspath(t) for t in args.targets
            )
        else:
            msg += "for the entire workspace."

        if args.recursive:
            msg += "\nRecursive purge is enabled."

        if args.dry_run:
            msg += "\n(dry-run: showing what would be removed, no changes)."

        logger.warning(msg)

        # A dry run changes nothing, so it never needs a prompt.  This is
        # deliberately kept separate from ``args.force``: overwriting
        # ``args.force`` for dry runs would also be forwarded to
        # ``Repo.purge`` and switch off its safety checks, so the preview
        # would list outputs that a real run refuses to delete.
        skip_prompt = args.force or args.dry_run
        if not skip_prompt and not ui.confirm("Are you sure you want to proceed?"):
            return 1

        # Call repo API with the flags exactly as the user gave them.
        self.repo.purge(
            targets=args.targets,
            recursive=args.recursive,
            force=args.force,
            dry_run=args.dry_run,
        )
        return 0


def add_parser(subparsers, parent_parser):
    """Register the ``purge`` sub-command and its options on *subparsers*.

    Args:
        subparsers: the sub-parser collection the command is added to.
        parent_parser: parser passed through as ``parents`` of the new
            sub-parser.
    """
    PURGE_HELP = "Remove tracked outputs and their cache."
    PURGE_DESCRIPTION = (
        "Removes cache objects and workspace copies of DVC-tracked outputs.\n"
        "Metadata remains intact, and non-DVC files are untouched."
    )
    purge_parser = subparsers.add_parser(
        "purge",
        parents=[parent_parser],
        description=append_doc_link(PURGE_DESCRIPTION, "purge"),
        help=PURGE_HELP,
        formatter_class=formatter.RawDescriptionHelpFormatter,
    )

    purge_parser.add_argument(
        "targets",
        nargs="*",
        help="Optional list of files/directories to purge (default: entire repo).",
    )
    purge_parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        default=False,
        help="Recursively purge directories.",
    )
    purge_parser.add_argument(
        "--dry-run",
        dest="dry_run",
        action="store_true",
        default=False,
        help="Only print what would be removed without actually removing.",
    )
    purge_parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        help="Force purge, bypassing safety checks and prompts.",
    )

    # ``func`` is the command class that gets instantiated with the parsed args.
    purge_parser.set_defaults(func=CmdPurge)
"""Repo-level implementation of ``purge`` (initial version)."""

from typing import TYPE_CHECKING, Optional

from dvc.exceptions import DvcException
from dvc.log import logger

from . import locked

if TYPE_CHECKING:
    from dvc.output import Output
    from dvc.repo import Repo

logger = logger.getChild(__name__)


class PurgeError(DvcException):
    """Raised when purge fails due to safety or internal errors."""


def _flatten_stages_or_outs(items) -> list["Output"]:
    """Normalize collect() results into a flat list of Output objects.

    Nested lists are flattened recursively.  Items with an ``outs`` attribute
    are treated as stages and contribute all their outputs; items with a
    ``use_cache`` attribute are treated as outputs themselves.  Anything else
    is skipped with a debug message.
    """
    outs = []
    for item in items:
        if isinstance(item, list):
            outs.extend(_flatten_stages_or_outs(item))
        elif hasattr(item, "outs"):  # Stage
            outs.extend(item.outs)
        elif hasattr(item, "use_cache"):  # Already an Output
            outs.append(item)
        else:
            # skip strings or unknown types
            logger.debug("Skipping non-stage item in collect(): %r", item)
    return outs


@locked
def purge(
    self: "Repo",
    targets: Optional[list[str]] = None,
    recursive: bool = False,
    force: bool = False,
    dry_run: bool = False,
):
    """Remove DVC-tracked outputs and their cache entries.

    - Collects outs for ``targets`` (or for every stage in the index when no
      targets are given).
    - Refuses to continue if a cached output has changed, unless ``force``.
    - Removes both the workspace copy and the cache object of each output.
    - Stage files / metadata are not touched by this function.

    Args:
        targets: paths to restrict the purge to; falsy means the whole repo.
        recursive: forwarded to ``collect`` together with ``targets``.
        force: skip the dirty-output safety check.
        dry_run: only log what would be removed.

    Returns:
        Always ``0``.  Individual removal failures are logged and summarised
        but do not abort the remaining removals.

    Raises:
        PurgeError: a target's stage file does not exist, or some output has
            uncommitted changes and ``force`` is not set.
    """
    from dvc.repo.collect import collect
    from dvc.stage.exceptions import StageFileDoesNotExistError

    try:
        if targets:
            items = collect(self, targets=targets, recursive=recursive)
        else:
            items = list(self.index.stages)  # full repo
    except StageFileDoesNotExistError as e:
        raise PurgeError(str(e)) from e

    outs = _flatten_stages_or_outs(items)
    if not outs:
        logger.info("No DVC-tracked outputs found to purge.")
        return 0

    # Safety check: cached outs must not be dirty.  With ``force`` the answer
    # would be thrown away, so ``changed()`` is not called at all.
    if not force:
        dirty = [o for o in outs if o.use_cache and o.changed()]
        if dirty:
            raise PurgeError(
                "Some tracked outputs have uncommitted changes. "
                "Use `--force` to purge anyway."
            )

    if dry_run:
        for out in outs:
            logger.info("[dry-run] Would remove %s", out)
        # Finish with a dry-run summary rather than falling through to the
        # removal summary, which would report that nothing was purged.
        logger.info(
            "[dry-run] %d outputs would be removed; nothing was changed.",
            len(outs),
        )
        return 0

    removed = 0
    failed = 0
    for out in outs:
        try:
            # remove workspace file
            if out.exists:
                out.remove(ignore_remove=False)

            # remove cache entry
            # NOTE(review): only the single object addressed by
            # ``hash_info.value`` is removed; whether that covers the contents
            # of directory outputs depends on the cache layout -- confirm.
            if out.use_cache and out.hash_info:
                cache_path = out.cache.oid_to_path(out.hash_info.value)
                if out.cache.fs.exists(cache_path):
                    out.cache.fs.remove(cache_path, recursive=True)
        except Exception as e:  # noqa: BLE001
            # Best effort: keep going with the remaining outputs.
            failed += 1
            logger.error("Failed to remove %s: %s", out, e)
        else:
            removed += 1

    if removed:
        logger.info("Removed %d outputs (workspace + cache).", removed)
    if failed:
        logger.warning(
            "Failed to remove %d of %d outputs; see the errors above.",
            failed,
            len(outs),
        )

    return 0
/ "foo.dvc").exists() + + +def test_purge_cli_removes_file_and_cache(tmp_dir, dvc): + (stage,) = tmp_dir.dvc_gen("bar", "bar") + assert (tmp_dir / "bar").exists() + assert Path(stage.outs[0].cache_path).exists() + + assert main(["purge", "--force"]) == 0 + + assert not (tmp_dir / "bar").exists() + assert not Path(stage.outs[0].cache_path).exists() + assert (tmp_dir / "bar.dvc").exists() + + +def test_purge_targets_only(tmp_dir, dvc): + (stage_dir,) = tmp_dir.dvc_gen({"dir": {"a.txt": "A", "b.txt": "B"}}) + assert (tmp_dir / "dir" / "a.txt").exists() + assert (tmp_dir / "dir" / "b.txt").exists() + + # purge the whole dir, not just a subfile + dvc.purge(targets=[str(tmp_dir / "dir")], force=True) + + assert not (tmp_dir / "dir").exists() + assert (tmp_dir / "dir.dvc").exists() + + +def test_purge_recursive(tmp_dir, dvc): + tmp_dir.dvc_gen({"nested": {"sub": {"file.txt": "content"}}}) + assert (tmp_dir / "nested" / "sub" / "file.txt").exists() + + dvc.purge(targets=["nested"], recursive=True, force=True) + assert not (tmp_dir / "nested" / "sub" / "file.txt").exists() + + +def test_purge_dry_run_does_not_delete(tmp_dir, dvc): + (stage,) = tmp_dir.dvc_gen("baz", "baz") + cache_path = Path(stage.outs[0].cache_path) + + dvc.purge(dry_run=True, force=True) + + assert (tmp_dir / "baz").exists() + assert cache_path.exists() + + +def test_purge_dirty_file_requires_force(tmp_dir, dvc): + (stage,) = tmp_dir.dvc_gen("foo", "foo") + (tmp_dir / "foo").write_text("modified") + + with pytest.raises(PurgeError): + dvc.purge() + + # but with --force it succeeds + dvc.purge(force=True) + assert not (tmp_dir / "foo").exists() diff --git a/tests/unit/command/test_purge.py b/tests/unit/command/test_purge.py new file mode 100644 index 0000000000..a09b7109ae --- /dev/null +++ b/tests/unit/command/test_purge.py @@ -0,0 +1,62 @@ +import pytest + +from dvc.cli import parse_args +from dvc.commands.purge import CmdPurge +from dvc.repo.purge import PurgeError + + +def 
test_purge_args_and_call(dvc, scm, mocker): + cli_args = parse_args( + [ + "purge", + "foo", + "bar", + "--recursive", + "--dry-run", + "--force", + ] + ) + assert cli_args.func == CmdPurge + + cmd = cli_args.func(cli_args) + mocker.patch("dvc.ui.ui.confirm", return_value=True) + m = mocker.patch("dvc.repo.Repo.purge", return_value=0) + + assert cmd.run() == 0 + + m.assert_called_once_with( + targets=["foo", "bar"], + recursive=True, + force=True, + dry_run=True, + ) + + +def test_purge_defaults(mocker): + cli_args = parse_args(["purge"]) + cmd = cli_args.func(cli_args) + + mocker.patch("dvc.ui.ui.confirm", return_value=True) + m = mocker.patch("dvc.repo.Repo.purge", return_value=0) + + assert cmd.run() == 0 + + m.assert_called_once_with( + targets=[], + recursive=False, + force=False, + dry_run=False, + ) + + +def test_purge_safety_error(mocker): + cli_args = parse_args(["purge"]) + cmd = cli_args.func(cli_args) + + mocker.patch("dvc.ui.ui.confirm", return_value=True) + m = mocker.patch("dvc.repo.Repo.purge", side_effect=PurgeError("dirty outs")) + + with pytest.raises(PurgeError): + cmd.run() + + m.assert_called_once() From 70b195f51551e5a89d977e46f5d340064d9707b9 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Thu, 2 Oct 2025 22:58:52 +0100 Subject: [PATCH 2/5] feat: check data exists in remote --- dvc/repo/purge.py | 45 +++++++++++++++++-- tests/func/test_purge.py | 95 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 129 insertions(+), 11 deletions(-) diff --git a/dvc/repo/purge.py b/dvc/repo/purge.py index 701ab80411..1b1f116564 100644 --- a/dvc/repo/purge.py +++ b/dvc/repo/purge.py @@ -41,10 +41,11 @@ def purge( dry_run: bool = False, ): """ - Purge removes DVC-tracked outputs and their cache. + Purge removes local copies of DVC-tracked outputs and their cache. - Collects outs from .dvc files and dvc.yaml. - Ensures safety (no dirty outs unless --force). + - Ensures outputs are backed up to remote (unless --force). 
- Removes both workspace copies and cache objects. - Metadata remains intact. """ @@ -55,7 +56,7 @@ def purge( if targets: items = collect(self, targets=targets, recursive=recursive) else: - items = list(self.index.stages) # full repo + items = list(self.index.stages) except StageFileDoesNotExistError as e: raise PurgeError(str(e)) from e @@ -65,7 +66,7 @@ def purge( logger.info("No DVC-tracked outputs found to purge.") return 0 - # Safety check: make sure outs aren’t dirty + # --- SAFETY CHECK 1: dirty outs dirty = [o for o in outs if o.use_cache and o.changed()] if dirty and not force: raise PurgeError( @@ -73,6 +74,44 @@ def purge( "Use `--force` to purge anyway." ) + # --- SAFETY CHECK 2: remote + remote presence + not_in_remote = [] + try: + remote_odb = self.cloud.get_remote_odb(None) # default remote + except Exception: + remote_odb = None + + if not remote_odb: + if not force: + raise PurgeError( + "No default remote configured. " + "Cannot safely purge outputs without verifying remote backup.\n" + "Use `--force` to purge anyway." + ) + logger.warning( + "No default remote configured. Proceeding with purge due to --force. " + "Outputs may be permanently lost." + ) + else: + # remote exists, check objects + for out in outs: + if out.use_cache and out.hash_info and out.hash_info.value: + if not remote_odb.exists(out.hash_info.value): + not_in_remote.append(str(out)) + + if not_in_remote: + if not force: + raise PurgeError( + "Some outputs are not present in the remote cache and would be " + "permanently lost if purged:\n - " + + "\n - ".join(not_in_remote) + + "\nUse `--force` to purge anyway." 
+ ) + logger.warning( + "Some outputs are not present in the remote cache and may be " + "permanently lost:\n - " + "\n - ".join(not_in_remote) + ) + removed = 0 for out in outs: if dry_run: diff --git a/tests/func/test_purge.py b/tests/func/test_purge.py index f9f4c4bda4..c0e4e52661 100644 --- a/tests/func/test_purge.py +++ b/tests/func/test_purge.py @@ -6,11 +6,30 @@ from dvc.repo.purge import PurgeError -def test_purge_api_removes_file_and_cache(tmp_dir, dvc): +def test_purge_no_remote_configured_errors(tmp_dir, dvc): + tmp_dir.dvc_gen("foo", "foo") + with pytest.raises(PurgeError): + dvc.purge() + + +def test_purge_no_remote_configured_with_force_warns(tmp_dir, dvc, caplog): + tmp_dir.dvc_gen("foo", "foo") + caplog.clear() + dvc.purge(force=True) + assert ( + "No default remote configured. Proceeding with purge due to --force" + in caplog.text + ) + + +def test_purge_api_removes_file_and_cache(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) (stage,) = tmp_dir.dvc_gen("foo", "foo") assert (tmp_dir / "foo").exists() assert Path(stage.outs[0].cache_path).exists() + dvc.push("foo") # ensure remote has backup + dvc.purge() # workspace file gone, cache gone, metadata remains @@ -19,11 +38,13 @@ def test_purge_api_removes_file_and_cache(tmp_dir, dvc): assert (tmp_dir / "foo.dvc").exists() -def test_purge_cli_removes_file_and_cache(tmp_dir, dvc): +def test_purge_cli_removes_file_and_cache(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) (stage,) = tmp_dir.dvc_gen("bar", "bar") assert (tmp_dir / "bar").exists() assert Path(stage.outs[0].cache_path).exists() + # force will skip check that remote has backup assert main(["purge", "--force"]) == 0 assert not (tmp_dir / "bar").exists() @@ -31,19 +52,20 @@ def test_purge_cli_removes_file_and_cache(tmp_dir, dvc): assert (tmp_dir / "bar.dvc").exists() -def test_purge_targets_only(tmp_dir, dvc): +def test_purge_targets_only(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) 
(stage_dir,) = tmp_dir.dvc_gen({"dir": {"a.txt": "A", "b.txt": "B"}}) assert (tmp_dir / "dir" / "a.txt").exists() assert (tmp_dir / "dir" / "b.txt").exists() - # purge the whole dir, not just a subfile dvc.purge(targets=[str(tmp_dir / "dir")], force=True) assert not (tmp_dir / "dir").exists() assert (tmp_dir / "dir.dvc").exists() -def test_purge_recursive(tmp_dir, dvc): +def test_purge_recursive(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) tmp_dir.dvc_gen({"nested": {"sub": {"file.txt": "content"}}}) assert (tmp_dir / "nested" / "sub" / "file.txt").exists() @@ -51,7 +73,37 @@ def test_purge_recursive(tmp_dir, dvc): assert not (tmp_dir / "nested" / "sub" / "file.txt").exists() -def test_purge_dry_run_does_not_delete(tmp_dir, dvc): +def test_purge_individual_targets(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + + # Generate two *separate* tracked files + (stage_a,) = tmp_dir.dvc_gen("a.txt", "A") + (stage_b,) = tmp_dir.dvc_gen("b.txt", "B") + + assert (tmp_dir / "a.txt").exists() + assert (tmp_dir / "b.txt").exists() + assert Path(stage_a.outs[0].cache_path).exists() + assert Path(stage_b.outs[0].cache_path).exists() + + # Push both so purge passes remote safety + dvc.push() + + # Purge only a.txt + dvc.purge(targets=[str(tmp_dir / "a.txt")]) + + # a.txt and its cache should be gone, but metadata intact + assert not (tmp_dir / "a.txt").exists() + assert not Path(stage_a.outs[0].cache_path).exists() + assert (tmp_dir / "a.txt.dvc").exists() + + # b.txt and its cache should still exist + assert (tmp_dir / "b.txt").exists() + assert Path(stage_b.outs[0].cache_path).exists() + assert (tmp_dir / "b.txt.dvc").exists() + + +def test_purge_dry_run_does_not_delete(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) (stage,) = tmp_dir.dvc_gen("baz", "baz") cache_path = Path(stage.outs[0].cache_path) @@ -61,13 +113,40 @@ def test_purge_dry_run_does_not_delete(tmp_dir, dvc): assert cache_path.exists() -def 
test_purge_dirty_file_requires_force(tmp_dir, dvc): +def test_purge_dirty_file_requires_force(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) (stage,) = tmp_dir.dvc_gen("foo", "foo") (tmp_dir / "foo").write_text("modified") with pytest.raises(PurgeError): dvc.purge() - # but with --force it succeeds dvc.purge(force=True) assert not (tmp_dir / "foo").exists() + + +def test_purge_missing_remote_object_requires_force(tmp_dir, dvc, make_remote): + make_remote("backup", default=True) + (stage,) = tmp_dir.dvc_gen("foo", "foo") + dvc.push("foo") + + remote = dvc.cloud.get_remote_odb("backup") + remote.fs.remove(remote.path, recursive=True) # wipe remote + + with pytest.raises(PurgeError): + dvc.purge() + + +def test_purge_missing_remote_object_with_force_warns( + tmp_dir, dvc, make_remote, caplog +): + make_remote("backup", default=True) + (stage,) = tmp_dir.dvc_gen("foo", "foo") + dvc.push("foo") + + remote = dvc.cloud.get_remote_odb("backup") + remote.fs.remove(remote.path, recursive=True) # wipe remote + + caplog.clear() + dvc.purge(force=True) + assert "Some outputs are not present in the remote cache" in caplog.text From 3028d132c8bcbf2940e0f315817112c2c3571d67 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Fri, 3 Oct 2025 10:04:32 +0100 Subject: [PATCH 3/5] fix: improve dry-run behavior --- dvc/commands/purge.py | 10 +++++----- dvc/repo/purge.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/dvc/commands/purge.py b/dvc/commands/purge.py index e016b5c4b0..1d8e826942 100644 --- a/dvc/commands/purge.py +++ b/dvc/commands/purge.py @@ -11,10 +11,6 @@ class CmdPurge(CmdBase): def run(self): - # Dry run should not prompt - if self.args.dry_run: - self.args.force = True - msg = "This will permanently remove local DVC-tracked outputs " if self.args.targets: msg += "for the following targets:\n - " + "\n - ".join( @@ -31,7 +27,11 @@ def run(self): logger.warning(msg) - if not self.args.force and not ui.confirm("Are you sure you want 
to proceed?"): + if ( + not self.args.force + and not self.args.dry_run + and not ui.confirm("Are you sure you want to proceed?") + ): return 1 # Call repo API diff --git a/dvc/repo/purge.py b/dvc/repo/purge.py index 1b1f116564..78246d7e09 100644 --- a/dvc/repo/purge.py +++ b/dvc/repo/purge.py @@ -72,6 +72,7 @@ def purge( raise PurgeError( "Some tracked outputs have uncommitted changes. " "Use `--force` to purge anyway." + "\n - " + "\n - ".join(str(o) for o in dirty) ) # --- SAFETY CHECK 2: remote + remote presence From 6c17293e01088994d032f1d18c94e91d75af49df Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Fri, 3 Oct 2025 10:22:02 +0100 Subject: [PATCH 4/5] feat: add `-y` option --- dvc/commands/purge.py | 13 ++++++++++++- tests/unit/command/test_purge.py | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/dvc/commands/purge.py b/dvc/commands/purge.py index 1d8e826942..bec9a35645 100644 --- a/dvc/commands/purge.py +++ b/dvc/commands/purge.py @@ -11,7 +11,10 @@ class CmdPurge(CmdBase): def run(self): - msg = "This will permanently remove local DVC-tracked outputs " + if not self.args.dry_run: + msg = "This will permanently remove local DVC-tracked outputs " + else: + msg = "This will show what local DVC-tracked outputs would be removed " if self.args.targets: msg += "for the following targets:\n - " + "\n - ".join( [os.path.abspath(t) for t in self.args.targets] @@ -30,6 +33,7 @@ def run(self): if ( not self.args.force and not self.args.dry_run + and not self.args.yes and not ui.confirm("Are you sure you want to proceed?") ): return 1 @@ -84,5 +88,12 @@ def add_parser(subparsers, parent_parser): default=False, help="Force purge, bypassing safety checks and prompts.", ) + purge_parser.add_argument( + "-y", + "--yes", + action="store_true", + default=False, + help="Do not prompt for confirmation (respects saftey checks).", + ) purge_parser.set_defaults(func=CmdPurge) diff --git a/tests/unit/command/test_purge.py 
b/tests/unit/command/test_purge.py index a09b7109ae..c7b51c5220 100644 --- a/tests/unit/command/test_purge.py +++ b/tests/unit/command/test_purge.py @@ -60,3 +60,17 @@ def test_purge_safety_error(mocker): cmd.run() m.assert_called_once() + + +def test_purge_yes_skips_confirm(mocker): + cli_args = parse_args(["purge", "-y"]) + cmd = cli_args.func(cli_args) + + confirm = mocker.patch("dvc.ui.ui.confirm", return_value=True) + m = mocker.patch("dvc.repo.Repo.purge", return_value=0) + + assert cmd.run() == 0 + + # -y should skip confirmation + confirm.assert_not_called() + m.assert_called_once() From 8a218df4ff3f8a14620fb12d8598e29af4d4643b Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Fri, 3 Oct 2025 10:37:28 +0100 Subject: [PATCH 5/5] refactor: apply pre-commit lint checks --- dvc/commands/purge.py | 2 +- dvc/repo/purge.py | 149 ++++++++++++++++++++++----------------- tests/func/test_purge.py | 8 +-- 3 files changed, 89 insertions(+), 70 deletions(-) diff --git a/dvc/commands/purge.py b/dvc/commands/purge.py index bec9a35645..381caf3dde 100644 --- a/dvc/commands/purge.py +++ b/dvc/commands/purge.py @@ -93,7 +93,7 @@ def add_parser(subparsers, parent_parser): "--yes", action="store_true", default=False, - help="Do not prompt for confirmation (respects saftey checks).", + help="Do not prompt for confirmation (respects safety checks).", ) purge_parser.set_defaults(func=CmdPurge) diff --git a/dvc/repo/purge.py b/dvc/repo/purge.py index 78246d7e09..9370d2dfd5 100644 --- a/dvc/repo/purge.py +++ b/dvc/repo/purge.py @@ -1,5 +1,6 @@ from typing import TYPE_CHECKING, Optional +from dvc.config import NoRemoteError, RemoteNotFoundError from dvc.exceptions import DvcException from dvc.log import logger @@ -27,60 +28,29 @@ def _flatten_stages_or_outs(items) -> list["Output"]: elif hasattr(item, "use_cache"): # Already an Output outs.append(item) else: - # skip strings or unknown types logger.debug("Skipping non-stage item in collect(): %r", item) return outs -@locked -def 
def _check_dirty(outs, force: bool) -> None:
    """Refuse to purge cached outputs that report uncommitted changes.

    Args:
        outs: outputs about to be purged.
        force: when true the check is skipped entirely.

    Raises:
        PurgeError: at least one output with ``use_cache`` set reports
            ``changed()`` and ``force`` is not set.
    """
    if force:
        # The result would be discarded, so don't call ``changed()`` at all.
        return

    dirty = [o for o in outs if o.use_cache and o.changed()]
    if dirty:
        raise PurgeError(
            "Some tracked outputs have uncommitted changes. "
            "Use `--force` to purge anyway.\n - "
            + "\n - ".join(str(o) for o in dirty)
        )


def _get_remote_odb(repo: "Repo"):
    """Return the default remote's object DB, or ``None`` if it is unavailable.

    ``get_remote_odb(None)`` asks for the default remote; the two
    "no such remote" configuration errors are translated to ``None`` so the
    caller can decide how to react.  Any other exception propagates.
    """
    try:
        return repo.cloud.get_remote_odb(None)
    except (RemoteNotFoundError, NoRemoteError):
        return None


def _check_remote_backup(repo: "Repo", outs, force: bool) -> None:
    """Make sure the outputs can be recovered from the default remote.

    Without ``force`` a missing remote or a missing remote object is an error;
    with ``force`` the same situations are only logged as warnings.

    Raises:
        PurgeError: no default remote is configured, or some cached output's
            hash is not present in the remote, and ``force`` is not set.
    """
    remote_odb = _get_remote_odb(repo)

    if not remote_odb:
        if not force:
            raise PurgeError(
                "No default remote configured. "
                "Cannot safely purge outputs without verifying remote backup.\n"
                "Use `--force` to purge anyway."
            )
        logger.warning(
            "No default remote configured. Proceeding with purge due to --force. "
            "Outputs may be permanently lost."
        )
        return

    # Remote exists: look every cached output's hash up in it.
    # NOTE(review): only the object addressed by ``hash_info.value`` is
    # checked; whether that is sufficient for directory outputs depends on
    # how their contents are stored -- confirm.
    not_in_remote = [
        str(o)
        for o in outs
        if o.use_cache
        and o.hash_info
        and o.hash_info.value
        and not remote_odb.exists(o.hash_info.value)
    ]
    if not not_in_remote:
        return

    listing = "\n - ".join(not_in_remote)
    if not force:
        raise PurgeError(
            "Some outputs are not present in the remote cache and would be "
            "permanently lost if purged:\n - "
            + listing
            + "\nUse `--force` to purge anyway."
        )
    logger.warning(
        "Some outputs are not present in the remote cache and may be "
        "permanently lost:\n - %s",
        listing,
    )


def _remove_outs(outs, dry_run: bool) -> int:
    """Remove workspace copies and cache objects of *outs*.

    In dry-run mode nothing is removed; each output is only logged.  Removal
    is best effort: a failure for one output is logged with its traceback and
    the remaining outputs are still processed.

    Returns:
        Number of outputs removed without error (always ``0`` for a dry run).
    """
    if dry_run:
        for out in outs:
            logger.info("[dry-run] Would remove %s", out)
        return 0

    removed = 0
    for out in outs:
        try:
            # remove workspace file
            if out.exists:
                out.remove(ignore_remove=False)

            # remove cache entry
            if out.use_cache and out.hash_info:
                cache_path = out.cache.oid_to_path(out.hash_info.value)
                if out.cache.fs.exists(cache_path):
                    out.cache.fs.remove(cache_path, recursive=True)
        except Exception:
            logger.exception("Failed to remove %s", out)
        else:
            removed += 1
    return removed


@locked
def purge(
    self: "Repo",
    targets: Optional[list[str]] = None,
    recursive: bool = False,
    force: bool = False,
    dry_run: bool = False,
) -> int:
    """
    Purge removes local copies of DVC-tracked outputs and their cache.

    - Collects outs for ``targets`` (or for every stage in the index when no
      targets are given).
    - Ensures safety (no dirty outs unless --force).
    - Ensures outputs are backed up to remote (unless --force).
    - Removes both workspace copies and cache objects.
    - Stage files / metadata are not touched by this function.

    The safety checks also run for a dry run, so a preview fails in the same
    situations in which the real purge would refuse to proceed.

    Returns:
        Always ``0``.  Removal failures are logged and summarised but do not
        change the return value.

    Raises:
        PurgeError: a target's stage file does not exist, or one of the
            safety checks failed and ``force`` is not set.
    """
    from dvc.repo.collect import collect
    from dvc.stage.exceptions import StageFileDoesNotExistError

    try:
        items = (
            collect(self, targets=targets, recursive=recursive)
            if targets
            else list(self.index.stages)
        )
    except StageFileDoesNotExistError as e:
        raise PurgeError(str(e)) from e

    outs = _flatten_stages_or_outs(items)
    if not outs:
        logger.info("No DVC-tracked outputs found to purge.")
        return 0

    # Run safety checks
    _check_dirty(outs, force)
    _check_remote_backup(self, outs, force)

    # Remove outs (or just list them for a dry run)
    removed = _remove_outs(outs, dry_run)

    if dry_run:
        # Finish with a dry-run summary rather than the removal summary,
        # which would report that nothing was purged.
        logger.info(
            "[dry-run] %d outputs would be removed; nothing was changed.",
            len(outs),
        )
        return 0

    failed = len(outs) - removed
    if removed:
        logger.info("Removed %d outputs (workspace + cache).", removed)
    if failed:
        logger.warning(
            "Failed to remove %d of %d outputs; see the errors above.",
            failed,
            len(outs),
        )
    return 0
dvc.cloud.get_remote_odb("backup") @@ -141,7 +141,7 @@ def test_purge_missing_remote_object_with_force_warns( tmp_dir, dvc, make_remote, caplog ): make_remote("backup", default=True) - (stage,) = tmp_dir.dvc_gen("foo", "foo") + tmp_dir.dvc_gen("foo", "foo") dvc.push("foo") remote = dvc.cloud.get_remote_odb("backup")