Draft
Changes from all commits
27 commits
1dc9d7e
Rework memoizer vs exception tests to use fixed config
benclifford Oct 7, 2025
83259dc
Make explicit checkpoint calls on memoizer, not DFK
benclifford Nov 10, 2025
92b1159
Turn memoizer make_hash into a function for later reusability
benclifford Aug 22, 2024
a1511df
Make memoizer instance pluggable
benclifford Dec 3, 2025
08372cc
Move wipe_task into task completion helpers, remove AppFuture callback
benclifford Oct 17, 2025
3299108
Define interface between DFK and memoizer
benclifford Jul 19, 2024
12f0574
Move checkpointing earlier in task completion
benclifford Sep 29, 2025
6e399ec
Pass result or exception into update_memo
benclifford Sep 29, 2025
1208e69
Collapse two nested ifs into a single multiway if
benclifford Oct 15, 2025
068ad9b
Make dependency failure handling more consistent
benclifford Oct 15, 2025
ce0f7ec
Test correct increase in dep_fail count
benclifford Oct 15, 2025
bb3a1f8
Replace an always-true `if` with an assert
benclifford Oct 15, 2025
9564cf7
Pull more completion code into _complete_task and rename
benclifford Oct 15, 2025
6032f98
Remove spurious double set of task time_returned
benclifford Oct 15, 2025
c52f098
Remove spurious double set of task time_returned
benclifford Oct 15, 2025
c21269f
Move a lot of checkpointing code into Memoizer, from DFK
benclifford Oct 10, 2025
44914af
Tell memoizer the task result earlier, to fix race condition #3762
benclifford Sep 3, 2025
dbdcdbb
Inline a single-use test config, from its own file
benclifford Sep 29, 2025
566a4ed
Remove commented dead code
benclifford Sep 29, 2025
8134045
Remove special-case checkpoint case from garbage collector test
benclifford Sep 29, 2025
9f83b25
inline a test config that is only used in one test and is very specif…
benclifford Sep 29, 2025
b6aa1c4
Remove unused memoizer->dfk reference
benclifford Sep 29, 2025
dab205a
Remove a spurious comment on checkpointing shutdown
benclifford Sep 29, 2025
beae1eb
Remove redundant argument from update_memo
benclifford Sep 3, 2025
d808ea4
checkpoint exceptions
benclifford Jul 30, 2024
be0049c
after discussion with Micha Pfeiffer about performance of checkpointi…
benclifford Dec 5, 2025
e005dc5
out-of-memory checkpointing
benclifford Aug 22, 2024
15 changes: 12 additions & 3 deletions parsl/benchmark/perf.py
@@ -1,6 +1,7 @@
import argparse
import concurrent.futures
import importlib
import math
import time
from typing import Any, Dict, Literal

@@ -38,7 +39,7 @@ def load_dfk_from_config(filename: str) -> DataFlowKernel:
        raise RuntimeError("Config module does not define config or fresh_config")


@parsl.python_app
@parsl.python_app(cache=True)
def app(extra_payload: Any, parsl_resource_specification: Dict = {}) -> int:
    return 7

@@ -49,7 +50,7 @@ def performance(*, resources: dict, target_t: float, args_extra_size: int, itera

    iteration = 1

    args_extra_payload = "x" * args_extra_size
    # args_extra_payload = "x" * args_extra_size

    if isinstance(iterate_mode, list):
        n = iterate_mode[0]
@@ -65,7 +66,15 @@ def performance(*, resources: dict, target_t: float, args_extra_size: int, itera

    fs = []
    print("Submitting tasks / invoking apps")
    for _ in range(n):
    pmax = int(math.sqrt(n))
    print(f"pmax = {pmax}")
    for index in range(n):
        # this means there is a different argument for each iteration,
        # which will make checkpointing/memo behave differently
        # so this could be switchable in parsl-perf dev branch
        # args_extra_payload = index # always a new one (except for run repeats)

        args_extra_payload = index % pmax
        fs.append(app(args_extra_payload, parsl_resource_specification=resources))

    submitted_t = time.time()
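With pmax = int(math.sqrt(n)), the modulo payload in the hunk above gives each of the pmax distinct argument values roughly sqrt(n) repeats, so most submissions can be served from the memo table. A standalone sketch of that arithmetic (not part of the diff):

import math

n = 100
pmax = int(math.sqrt(n))  # 10 distinct payload values

# distinct memo keys produced by the index % pmax payload cycling
payloads = [index % pmax for index in range(n)]
distinct = len(set(payloads))

print(distinct)      # 10 -> only 10 memo/checkpoint entries are created
print(n - distinct)  # 90 -> submissions that can hit an existing entry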
36 changes: 24 additions & 12 deletions parsl/dataflow/memoization.py
@@ -449,8 +453,12 @@ def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[A
                            data = pickle.load(f)
                            # Copy and hash only the input attributes
                            memo_fu: Future = Future()
                            assert data['exception'] is None
                            memo_fu.set_result(data['result'])

                            if data['exception'] is None:
                                memo_fu.set_result(data['result'])
                            else:
                                assert data['result'] is None
                                memo_fu.set_exception(data['exception'])
                            memo_lookup_table[data['hash']] = memo_fu

                        except EOFError:
Expand Down Expand Up @@ -524,7 +528,6 @@ def checkpoint_queue(self) -> None:

        .. note::
            Checkpointing only works if memoization is enabled

        """
        with self._checkpoint_lock:
            self._checkpoint_these_tasks(self.checkpointable_tasks)
@@ -556,17 +559,18 @@ def _checkpoint_these_tasks(self, checkpoint_queue: List[CheckpointCommand]) ->

        with open(checkpoint_tasks, 'ab') as f:
            for cc in checkpoint_queue:
                if cc.exception is None:
                    hashsum = cc.task_record['hashsum']
                    if not hashsum:
                        continue
                    t = {'hash': hashsum, 'exception': None, 'result': cc.result}

                # We are using pickle here since pickle dumps to a file in 'ab'
                # mode behave like an incremental log.
                if cc.exception is None and self.filter_result_for_checkpoint(cc.result):
                    t = {'hash': cc.task_record['hashsum'], 'exception': None, 'result': cc.result}
                    pickle.dump(t, f)
                    count += 1
                    logger.debug("Task %s checkpointed result", cc.task_record['id'])
                elif cc.exception is not None and self.filter_exception_for_checkpoint(cc.exception):
                    t = {'hash': cc.task_record['hashsum'], 'exception': cc.exception, 'result': None}
                    pickle.dump(t, f)
                    count += 1
                    logger.debug("Task {cc.task_record['id']} checkpointed")
                    logger.debug("Task %s checkpointed exception", cc.task_record['id'])
                else:
                    logger.debug("Task %s not checkpointed", cc.task_record['id'])

        self.checkpointed_tasks += count

@@ -577,3 +581,11 @@ def _checkpoint_these_tasks(self, checkpoint_queue: List[CheckpointCommand]) ->
logger.debug("No tasks checkpointed in this pass.")
else:
logger.info("Done checkpointing {} tasks".format(count))

def filter_result_for_checkpoint(self, result: Any) -> bool:
"""Overridable method to decide if an task that ended with a successful result should be checkpointed"""
return True

def filter_exception_for_checkpoint(self, exception: BaseException) -> bool:
"""Overridable method to decide if an entry that ended with an exception should be checkpointed"""
return False
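The two hooks above give subclasses a policy point without touching the pickle plumbing. A minimal sketch of how they might be overridden (not part of the diff; the size threshold and exception type are hypothetical choices):

import sys

from parsl.dataflow.memoization import BasicMemoizer


class SelectiveMemoizer(BasicMemoizer):
    def filter_result_for_checkpoint(self, result):
        # Hypothetical policy: skip checkpointing very large results.
        return sys.getsizeof(result) < 1_000_000

    def filter_exception_for_checkpoint(self, exception):
        # Opt in to checkpointing only failures assumed deterministic;
        # the default declines all exceptions.
        return isinstance(exception, ValueError)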
157 changes: 157 additions & 0 deletions parsl/dataflow/memosql.py
@@ -0,0 +1,157 @@
import logging
import pickle
import sqlite3
import threading
from concurrent.futures import Future
from pathlib import Path
from typing import Any, Optional

from parsl.dataflow.memoization import Memoizer, make_hash
from parsl.dataflow.taskrecord import TaskRecord

logger = logging.getLogger(__name__)


class SQLiteMemoizer(Memoizer):
"""Memoize out of memory into an sqlite3 database.
"""

    def __init__(self, *, checkpoint_dir: Optional[str] = None):
        self.checkpoint_dir = checkpoint_dir
        self._db_lock = threading.Lock()

    def start(self, *, run_dir: str) -> None:
        """TODO: run_dir is the per-workflow run dir, but we need a broader checkpoint context... one level up
        by default... get_all_checkpoints uses "runinfo/" as a relative path for that by default so replicating
        that choice would do here. likewise I think for monitoring."""

        self.run_dir = run_dir

        dir = self.checkpoint_dir if self.checkpoint_dir is not None else self.run_dir

        self.db_path = Path(dir) / "checkpoint.sqlite3"
        logger.debug("starting with db_path %r", self.db_path)

        # check_same_thread should be safe if this assertion
        # passes.
        assert sqlite3.threadsafety == 3, "sqlite3 threadsafety violation"
        self._connection = sqlite3.connect(self.db_path, check_same_thread=False, autocommit=True)
        self._cursor = self._connection.cursor()

        with self._db_lock:
            self._cursor.execute("CREATE TABLE IF NOT EXISTS checkpoints(key PRIMARY KEY, result)")

        # self.connection.commit()
        # connection.close()
        logger.debug("checkpoint table created")

    def close(self):
        """TODO: probably going to need some kind of shutdown now, to close the sqlite3 connection."""
        pass

    def check_memo(self, task: TaskRecord) -> Optional[Future]:
        """TODO: document this: check_memo is required to set the task hashsum,
        if that's how we're going to key checkpoints in update_memo. (That's not
        a requirement though: other equalities are available.)"""
        logger.debug("check memo start")
        task_id = task['id']

        if not task['memoize']:
            task['hashsum'] = None
            logger.debug("Task %s will not be memoized", task_id)
            return None

        hashsum = make_hash(task)
        logger.debug("Task {} has memoization hash {}".format(task_id, hashsum))
        task['hashsum'] = hashsum

        logger.debug("checking memo")
        # connection = sqlite3.connect(self.db_path)
        # cursor = connection.cursor()
        with self._db_lock:
            self._cursor.execute("SELECT result FROM checkpoints WHERE key = ?", (hashsum, ))
            r = self._cursor.fetchone()

        logger.debug("checked memo")
        if r is None:
            # connection.close()
            return None
        else:
            data = pickle.loads(r[0])
            # connection.close()

            memo_fu: Future = Future()

            if data['exception'] is None:
                memo_fu.set_result(data['result'])
            else:
                assert data['result'] is None
                memo_fu.set_exception(data['exception'])

            return memo_fu

    def update_memo_result(self, task: TaskRecord, result: Any) -> None:
        logger.debug("updating memo")

        if not task['memoize'] or 'hashsum' not in task:
            logger.debug("preconditions for memo not satisfied")
            return

        if not isinstance(task['hashsum'], str):
            logger.error(f"Attempting to update app cache entry but hashsum is not a string key: {task['hashsum']}")
            return

        hashsum = task['hashsum']

        # this comes from the original concatenation-based checkpoint code:
        # assert app_fu.done(), "assumption: update_memo is called after future has a result"
        t = {'hash': hashsum, 'exception': None, 'result': result}
        # else:
        #     t = {'hash': hashsum, 'exception': app_fu.exception(), 'result': None}

        value = pickle.dumps(t)

        # connection = sqlite3.connect(self.db_path)
        # cursor = connection.cursor()

        with self._db_lock:
            self._cursor.execute("INSERT OR IGNORE INTO checkpoints VALUES(?, ?)", (hashsum, value))

        # connection.commit()
        # connection.close()
        logger.debug("updated memo")

    def update_memo_exception(self, task: TaskRecord, exception: BaseException) -> None:
        logger.debug("updating memo")

        if not task['memoize'] or 'hashsum' not in task:
            logger.debug("preconditions for memo not satisfied")
            return

        if not isinstance(task['hashsum'], str):
            logger.error(f"Attempting to update app cache entry but hashsum is not a string key: {task['hashsum']}")
            return

        hashsum = task['hashsum']

        # this comes from the original concatenation-based checkpoint code:
        # assert app_fu.done(), "assumption: update_memo is called after future has a result"
        # t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
        # else:
        t = {'hash': hashsum, 'exception': exception, 'result': None}

        value = pickle.dumps(t)

        # connection = sqlite3.connect(self.db_path)
        # cursor = connection.cursor()

        logger.debug("running sql")

        with self._db_lock:
            self._cursor.execute("INSERT INTO checkpoints VALUES(?, ?)", (hashsum, value))

        # logger.debug("commit sql")
        # connection.commit()
        # logger.debug("close sql")
        # connection.close()
        logger.debug("updating memo - finished")
6 changes: 3 additions & 3 deletions parsl/tests/configs/htex_local_alternate.py
@@ -22,7 +22,7 @@
from parsl.data_provider.ftp import FTPInTaskStaging
from parsl.data_provider.http import HTTPInTaskStaging
from parsl.data_provider.zip import ZipFileStaging
from parsl.dataflow.memoization import BasicMemoizer
from parsl.dataflow.memosql import SQLiteMemoizer
from parsl.executors import HighThroughputExecutor
from parsl.launchers import SingleNodeLauncher

@@ -57,14 +57,14 @@ def fresh_config():
            )
        ],
        strategy='simple',
        memoizer=BasicMemoizer(memoize=True, checkpoint_mode='task_exit'),
        retries=2,
        monitoring=MonitoringHub(
            monitoring_debug=False,
            resource_monitoring_interval=1,
        ),
        usage_tracking=3,
        project_name="parsl htex_local_alternate test configuration"
        project_name="parsl htex_local_alternate test configuration",
        memoizer=SQLiteMemoizer()
    )


47 changes: 47 additions & 0 deletions parsl/tests/test_checkpointing/test_python_checkpoint_2_sqlite.py
@@ -0,0 +1,47 @@
import contextlib
import os

import pytest

import parsl
from parsl import python_app
from parsl.config import Config
from parsl.dataflow.memosql import SQLiteMemoizer


def parsl_configured(run_dir, memoizer):
    return parsl.load(Config(
        run_dir=str(run_dir),
        memoizer=memoizer
    ))


@python_app(cache=True)
def uuid_app():
    import uuid
    return uuid.uuid4()


@pytest.mark.local
def test_loading_checkpoint(tmpd_cwd):
"""Load memoization table from previous checkpoint
"""
with parsl_configured(tmpd_cwd, SQLiteMemoizer(checkpoint_dir=tmpd_cwd)):
result = uuid_app().result()

with parsl_configured(tmpd_cwd, SQLiteMemoizer(checkpoint_dir=tmpd_cwd)):
relaunched = uuid_app().result()

assert result == relaunched, "Expected following call to uuid_app to return cached uuid"


@python_app(cache=True)
def exception_app():
raise RuntimeError("this is exception app")


@pytest.mark.local
def test_checkpointing_exception(tmpd_cwd):
    with pytest.raises(RuntimeError):
        with parsl_configured(tmpd_cwd, SQLiteMemoizer(checkpoint_dir=tmpd_cwd)):
            _ = exception_app().result()
@@ -0,0 +1,49 @@
import contextlib
import os

import pytest

import parsl
from parsl import python_app
from parsl.config import Config
from parsl.dataflow.memoization import BasicMemoizer
from parsl.executors.threads import ThreadPoolExecutor


class CheckpointExceptionsMemoizer(BasicMemoizer):
    def filter_exception_for_checkpoint(self, ex):
        # TODO: this used to be the case, but in moving to results-only mode,
        # the task record is lost. Maybe it's useful to pass it in? What
        # are the use cases for this deciding function?
        # task record is available from app_fu.task_record
        # assert app_fu.task_record is not None

        # override the default always-False, to be always-True
        return True


def fresh_config(run_dir, memoizer):
    return Config(
        memoizer=memoizer,
        run_dir=str(run_dir)
    )


@python_app(cache=True)
def uuid_app():
    import uuid
    raise RuntimeError(str(uuid.uuid4()))


@pytest.mark.local
def test_loading_checkpoint(tmpd_cwd):
"""Load memoization table from previous checkpoint
"""
with parsl.load(fresh_config(tmpd_cwd, CheckpointExceptionsMemoizer(checkpoint_mode="task_exit"))):
checkpoint_files = [os.path.join(parsl.dfk().run_dir, "checkpoint")]
result = uuid_app().exception()

with parsl.load(fresh_config(tmpd_cwd, CheckpointExceptionsMemoizer(checkpoint_files=checkpoint_files))):
relaunched = uuid_app().exception()

assert result.args == relaunched.args, "Expected following call to uuid_app to return cached uuid in exception"
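Taken together, the diffs above exercise a small pluggable surface: start, close, check_memo, update_memo_result and update_memo_exception. A minimal in-memory sketch of that interface (method names are taken from the diffs; the DictMemoizer class itself is hypothetical and ignores any further methods the Memoizer base may require):

from concurrent.futures import Future
from typing import Any, Dict, Optional

from parsl.dataflow.memoization import Memoizer, make_hash
from parsl.dataflow.taskrecord import TaskRecord


class DictMemoizer(Memoizer):
    """Hypothetical memoizer keeping entries in a plain dict (no persistence)."""

    def start(self, *, run_dir: str) -> None:
        self._table: Dict[str, Future] = {}

    def close(self) -> None:
        pass

    def check_memo(self, task: TaskRecord) -> Optional[Future]:
        if not task['memoize']:
            task['hashsum'] = None
            return None
        # As noted in memosql.py above, check_memo is responsible for
        # setting the hashsum that later update_memo_* calls key on.
        task['hashsum'] = make_hash(task)
        return self._table.get(task['hashsum'])

    def update_memo_result(self, task: TaskRecord, result: Any) -> None:
        if task['memoize'] and task.get('hashsum'):
            fu: Future = Future()
            fu.set_result(result)
            self._table[task['hashsum']] = fu

    def update_memo_exception(self, task: TaskRecord, exception: BaseException) -> None:
        if task['memoize'] and task.get('hashsum'):
            fu: Future = Future()
            fu.set_exception(exception)
            self._table[task['hashsum']] = fu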