Skip to content

Commit d829d77

Browse files
authored
Merge pull request #199 from man-group/add-cleanup-script
Adding a cleanup entrypoint
2 parents e82f538 + b0651d4 commit d829d77

File tree

10 files changed

+128
-12
lines changed

10 files changed

+128
-12
lines changed

.circleci/config.yml

+4-4
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ jobs:
213213
PYTHON_VERSION: "3_6"
214214
CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_6
215215
CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_6
216-
VERSION: 0.7.1
216+
VERSION: 0.7.2
217217
PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
218218
YARN_STATIC_DIR: notebooker/web/static/
219219
IMAGE_NAME: mangroup/notebooker
@@ -229,7 +229,7 @@ jobs:
229229
environment:
230230
CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_7
231231
CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_7
232-
VERSION: 0.7.1
232+
VERSION: 0.7.2
233233
PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
234234
YARN_STATIC_DIR: notebooker/web/static/
235235
IMAGE_NAME: mangroup/notebooker
@@ -243,7 +243,7 @@ jobs:
243243
environment:
244244
CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_8
245245
CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_8
246-
VERSION: 0.7.1
246+
VERSION: 0.7.2
247247
PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
248248
YARN_STATIC_DIR: notebooker/web/static/
249249
IMAGE_NAME: mangroup/notebooker
@@ -257,7 +257,7 @@ jobs:
257257
environment:
258258
CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_11
259259
CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_11
260-
VERSION: 0.7.1
260+
VERSION: 0.7.2
261261
PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
262262
YARN_STATIC_DIR: notebooker/web/static/
263263
IMAGE_NAME: mangroup/notebooker

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
0.7.2 (2025-01-17)
2+
------------------
3+
4+
* feature: added a cleanup script to delete reports older than a given number of days, optionally filterable by report name.
5+
16
0.7.1 (2025-01-02)
27
------------------
38

docs/conf.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
author = "Man Group Quant Tech"
2424

2525
# The full version, including alpha/beta/rc tags
26-
release = "0.7.1"
26+
release = "0.7.2"
2727

2828

2929
# -- General configuration ---------------------------------------------------

notebooker/_entrypoints.py

+13
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import uuid
3+
from typing import Optional
34

45
import click
56

@@ -10,6 +11,7 @@
1011
from notebooker.serialization import SERIALIZER_TO_CLI_OPTIONS
1112
from notebooker.settings import BaseConfig, WebappConfig
1213
from notebooker.snapshot import snap_latest_successful_notebooks
14+
from notebooker.utils.cleanup import delete_old_reports
1315
from notebooker.web.app import main
1416

1517

@@ -267,6 +269,17 @@ def execute_notebook(
267269
)
268270

269271

272+
@base_notebooker.command()
@click.option(
    "--days",
    "--days-cutoff",
    "-d",
    # A negative age is meaningless and would push the cutoff into the future,
    # so reject it at the CLI boundary.
    type=click.IntRange(min=0),
    required=True,
    help="Delete reports older than this many days",
)
@click.option(
    "--report-name",
    required=False,
    help="Only delete reports with this report name. If omitted, old reports of every name are deleted.",
)
@click.option("--dry-run", is_flag=True, default=False, help="Show what would be deleted without actually deleting")
@pass_config
def cleanup_old_reports(config: BaseConfig, days: int, report_name: Optional[str], dry_run: bool):
    """Delete reports older than a given number of days, optionally restricted to one report name.

    Thin CLI wrapper: all arguments are forwarded unchanged to
    ``notebooker.utils.cleanup.delete_old_reports``. With ``--dry-run`` nothing
    is deleted; the reports which would have been removed are only logged.
    """
    delete_old_reports(config, days_cutoff=days, report_name=report_name, dry_run=dry_run)
281+
282+
270283
@base_notebooker.command()
271284
@click.option(
272285
"--report-name", required=True, help="The name of the template to retrieve, relative to the template directory."

notebooker/serialization/mongo.py

+20-5
Original file line numberDiff line numberDiff line change
@@ -521,18 +521,33 @@ def get_latest_successful_job_ids_for_name_all_params(self, report_name: str) ->
521521
def n_all_results_for_report_name(self, report_name: str) -> int:
522522
return self._get_result_count({"report_name": report_name})
523523

524-
def delete_result(self, job_id: AnyStr) -> Dict[str, Any]:
524+
def delete_result(self, job_id: AnyStr, dry_run: bool = False) -> Dict[str, Any]:
    """
    Mark a result as deleted and remove its files from the GridFS result store.

    Args:
        job_id: The job whose result should be removed.
        dry_run: If True, neither the status update nor any GridFS deletion is
            performed; the return value still lists what was found.

    Returns:
        A dict with the raw result document under "deleted_result_document" and,
        under "gridfs_filenames", only those candidate filenames for which at
        least one GridFS entry was found.
    """
    result = self._get_raw_check_result(job_id)
    job_status = JobStatus.from_string(result["status"])
    candidate_filenames = load_files_from_gridfs(self.result_data_store, result, do_read=False)
    # Jobs which ended in one of these states get an extra error-info file as a candidate.
    if job_status in (JobStatus.ERROR, JobStatus.TIMEOUT, JobStatus.CANCELLED):
        candidate_filenames.append(_error_info_filename(job_id))

    if not dry_run:
        self.update_check_status(job_id, JobStatus.DELETED)

    found_filenames = []
    for candidate in candidate_filenames:
        logger.debug(f"Deleting {candidate}")
        n_matches = 0
        for grid_out in self.result_data_store.find({"filename": candidate}):
            n_matches += 1
            if dry_run:
                # Only record that the file exists; leave it in place.
                continue
            self.result_data_store.delete(grid_out._id)
        if n_matches:
            found_filenames.append(candidate)

    return {"deleted_result_document": result, "gridfs_filenames": found_filenames}
543+
544+
def get_job_ids_older_than(self, cutoff: datetime.datetime, report_name: Optional[str] = None) -> List[str]:
    """
    Return the job ids of results whose job_start_time is at or before ``cutoff``.

    Args:
        cutoff: Results with ``job_start_time <= cutoff`` are selected.
        report_name: If given (and non-empty), restrict the search to this report name.

    Returns:
        The matching job ids, in the order the library returns them.
    """
    # NOTE(review): presumably this excludes results already marked as deleted - confirm
    # against _add_deleted_status_to_filter.
    mongo_filter = _add_deleted_status_to_filter({"job_start_time": {"$lte": cutoff}})
    if report_name:
        mongo_filter["report_name"] = report_name

    # Only job_id is needed, so project everything else away.
    projection = {"_id": 0, "job_id": 1}
    job_ids = []
    for document in self.library.find(mongo_filter, projection):
        job_ids.append(document["job_id"])
    return job_ids
536551

537552

538553
def _pdf_filename(job_id: str) -> str:

notebooker/utils/cleanup.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import datetime
2+
from typing import Optional
3+
4+
from tqdm import tqdm
5+
import logging
6+
7+
from notebooker.serialization.serialization import get_serializer_from_cls
8+
from notebooker.settings import BaseConfig
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
def delete_old_reports(config: BaseConfig, days_cutoff: int, report_name: Optional[str], dry_run: bool = True) -> None:
    """
    Delete notebooker reports older than specified days.

    Each report is deleted independently: a failure on one report is logged
    (with traceback) and counted, and processing continues with the next one.
    The final summary reports how many deletions actually succeeded.

    Args:
        config: The configuration which will point to the serializer class and config.
        days_cutoff: Delete reports older than this many days
        report_name: Optionally specify which report_name we should be removing old reports for.
        dry_run: If True, only show what would be deleted without actually deleting
    """
    serializer = get_serializer_from_cls(config.SERIALIZER_CLS, **config.SERIALIZER_CONFIG)
    # NOTE(review): the cutoff is a naive local-time datetime - confirm this matches how
    # job_start_time is stored.
    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days_cutoff)

    # Find reports to delete
    to_delete = serializer.get_job_ids_older_than(cutoff_date, report_name=report_name)
    num_reports = len(to_delete)
    if num_reports == 0:
        logger.info(f"No reports found older than {days_cutoff} days")
        return

    logger.info(f"Found {num_reports} reports older than {days_cutoff} days")

    # Delete reports
    logger.info("Starting deletion process...")
    action = "Would have deleted" if dry_run else "Deleted"
    num_failed = 0
    for job_id in tqdm(to_delete, desc="Deleting reports"):
        # Keep the try body to the deletion itself so that a logging problem
        # is never mistaken for a failed delete.
        try:
            removed = serializer.delete_result(job_id, dry_run=dry_run)
        except Exception:
            # Best-effort batch: record the failure with its traceback and carry on.
            num_failed += 1
            logger.exception(f"Failed to delete report {job_id}")
            continue
        # The title is only used for logging; tolerate documents which lack it.
        title = removed["deleted_result_document"].get("report_title")
        logger.info(f"{action}: Title={title}, GridFS files={removed['gridfs_filenames']}")

    num_removed = num_reports - num_failed
    logger.info(f"{'Would have' if dry_run else 'Successfully'} removed {num_removed} reports")
    if num_failed:
        logger.warning(f"Failed to remove {num_failed} of {num_reports} reports")

notebooker/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.1"
1+
__version__ = "0.7.2"

notebooker/web/static/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "notebooker",
3-
"version": "0.7.1",
3+
"version": "0.7.2",
44
"description": "Notebooker - Turn notebooks into reports",
55
"dependencies": {
66
"bootstrap-table": "1.20.2",

tests/integration/test_mongo.py

+1
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def test_delete(bson_library, webapp_config):
7777
raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}, "other_stuff": "Yep"},
7878
)
7979
)
80+
assert serializer.get_job_ids_older_than(datetime.datetime(2020, 1, 1), report_name=report_name) == [job_id]
8081
assert bson_library.find_one({"job_id": job_id}) is not None
8182
result = serializer.get_check_result(job_id)
8283
assert result is not None

tests/unit/serialization/test_mongoose.py renamed to tests/unit/serialization/test_mongo.py

+32
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from unittest.mock import Mock, MagicMock
2+
13
from mock import patch
24

35
from notebooker.serialization.mongo import JobStatus, MongoResultSerializer
@@ -42,3 +44,33 @@ def test__get_all_job_ids(conn, db, gridfs):
4244
{"$project": {"report_name": 1, "job_id": 1}},
4345
]
4446
)
47+
48+
49+
@patch("notebooker.serialization.mongo.gridfs")
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_database")
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_connection")
def test_delete_result_dry_run(mock_conn, mock_db, mock_gridfs):
    """A dry run must report what it found without touching the library or the GridFS store."""
    # Arrange: a finished job whose raw document is served by a stubbed lookup,
    # and a GridFS store which finds one entry for every filename queried.
    raw_document = {
        "job_id": "test_job",
        "status": JobStatus.DONE.value,
        "raw_html_resources": {"outputs": ["file1.html"]},
        "generate_pdf_output": True,
    }
    serializer = MongoResultSerializer()
    serializer._get_raw_check_result = Mock(return_value=raw_document)
    fake_store = MagicMock()
    fake_store.find.return_value = [Mock(_id="id1")]
    serializer.result_data_store = fake_store

    # Act
    outcome = serializer.delete_result("test_job", dry_run=True)

    # Assert: nothing was modified...
    serializer.library.find_one_and_update.assert_not_called()
    fake_store.delete.assert_not_called()

    # ...but the outcome still describes what would have been deleted.
    assert outcome["deleted_result_document"] == raw_document
    assert len(outcome["gridfs_filenames"]) > 0

0 commit comments

Comments
 (0)