Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/stamp/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def _run_cli(args: argparse.Namespace) -> None:
output_dir=config.preprocessing.output_dir,
wsi_dir=config.preprocessing.wsi_dir,
wsi_list=config.preprocessing.wsi_list,
mpp_list=config.preprocessing.mpp_list,
cache_dir=config.preprocessing.cache_dir,
tile_size_um=config.preprocessing.tile_size_um,
tile_size_px=config.preprocessing.tile_size_px,
Expand Down
10 changes: 8 additions & 2 deletions src/stamp/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,15 @@ preprocessing:
# Used to distinguish features extracted with different code versions.
#generate_hash: True

# A list (.txt, .xlsx or .csv) specifying filenames to preprocess
# A list (.txt, .xlsx or .csv) specifying filenames (without file extension) to preprocess
# instead of processing all data in the directory.
#slide_list: "list.csv"
wsi_list: "path/to/wsi_list.txt"

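# Optional per-slide MPP override list (.txt, .xlsx or .csv).
# Each row: path/to/filename.suffix, mpp_x (no header row).
# The path is matched verbatim against the slide's full path as a string,
# so it must be written exactly as STAMP sees it (including wsi_dir).
# If provided, this value is used as the slide's MPP (typically as fallback
# if MPP extraction from slide metadata fails).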
mpp_list: "path/to/mpp_list.txt"

crossval:
output_dir: "/path/to/save/files/to"
Expand Down
59 changes: 57 additions & 2 deletions src/stamp/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def extract_(
wsi_dir: Path,
output_dir: Path,
wsi_list: Path | None,
mpp_list: Path | None,
cache_dir: Path | None,
cache_tiles_ext: ImageExtension,
extractor: ExtractorName | Extractor,
Expand Down Expand Up @@ -265,6 +266,16 @@ def extract_(
perm = rng.permutation(len(slide_paths))
slide_paths = [slide_paths[i] for i in perm]

# If mpp_list is given, load it
if mpp_list is not None:
mpp_lookup = _load_mpp_overrides_(mpp_list)
_logger.info(
f"Loaded {len(mpp_lookup)} MPP overrides from {mpp_list}. Don't forget QC."
)
else:
mpp_lookup = {}
_logger.info("No MPP override file provided.")

for slide_path in (progress := tqdm(slide_paths)):
progress.set_description(str(slide_path.relative_to(wsi_dir)))
_logger.debug(f"processing {slide_path}")
Expand All @@ -280,6 +291,18 @@ def extract_(

feature_output_path.parent.mkdir(parents=True, exist_ok=True)

# Determine per-slide fallback MPP
# Default to the global value (or None)
current_slide_fallback_mpp = default_slide_mpp

# If the filename exists in the given file-list, assign the override value as fallback MPP
if str(slide_path) in mpp_lookup:
current_slide_fallback_mpp = mpp_lookup[str(slide_path)]

_logger.info(
f"MPP-value from mpp_list: {current_slide_fallback_mpp} for slide {slide_path.name}"
)

try:
ds = _TileDataset(
slide_path=slide_path,
Expand All @@ -292,7 +315,7 @@ def extract_(
max_workers=max_workers,
brightness_cutoff=brightness_cutoff,
canny_cutoff=canny_cutoff,
default_slide_mpp=default_slide_mpp,
default_slide_mpp=current_slide_fallback_mpp,
)
# Parallelism is implemented in the dataset iterator already, so one worker is enough!
dl = DataLoader(ds, batch_size=64, num_workers=1, drop_last=False)
Expand Down Expand Up @@ -354,7 +377,7 @@ def extract_(
size=(512, 512),
coords_um=coords,
tile_size_um=tile_size_um,
default_slide_mpp=default_slide_mpp,
default_slide_mpp=current_slide_fallback_mpp,
).convert("RGB").save(thumbnail_path)


Expand Down Expand Up @@ -411,3 +434,35 @@ def _get_slide_paths(wsi_list: Path) -> set[str]:
else:
raise ValueError(f"Unsupported file type: {suf}")
return slide_paths


# helper-function to load mpp overrides from file
def _load_mpp_overrides_(path: Path) -> dict[str, SlideMPP]:
    """Read a per-slide MPP override table and return it as a lookup dict.

    The format is chosen by the (case-insensitive) file suffix:

    * ``.txt`` -- one ``filename, mpp`` pair per line; blank lines and lines
      starting with ``#`` are ignored.
    * ``.csv`` -- comma separated, no header row, ``#`` starts a comment.
    * ``.xlsx`` / ``.xls`` -- read with ``pandas.read_excel``, no header row.

    Only the first two fields/columns of a row are used. Every format goes
    through the same validation: a row is skipped when its filename is
    missing or empty, or when its MPP is not a finite, positive number. The
    number of skipped rows is reported in a single warning. If a filename
    occurs more than once, the last row wins.

    Args:
        path: Location of the override file.

    Returns:
        Mapping from the filename exactly as written in the file (surrounding
        whitespace stripped) to its MPP value. Empty if the file holds no
        valid rows.

    Raises:
        ValueError: If the suffix is unsupported, or a non-empty CSV/Excel
            table has fewer than two columns.
    """
    # Function-scope imports so this helper does not rely on which stdlib
    # modules the rest of the file happens to import.
    import logging
    import math

    suffix = path.suffix.lower()

    rows: list[tuple]
    if suffix == ".txt":
        rows = []
        for raw_line in path.read_text(encoding="utf-8").splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split(",")
            # A line without a comma has no MPP field; None makes it fail the
            # shared validation below instead of raising an unpacking error.
            rows.append((fields[0], fields[1] if len(fields) > 1 else None))
    elif suffix in {".csv", ".xlsx", ".xls"}:
        if suffix == ".csv":
            try:
                df = pd.read_csv(path, sep=",", header=None, comment="#")
            except pd.errors.EmptyDataError:
                # Empty or comment-only file: same result as an empty .txt.
                return {}
        else:
            df = pd.read_excel(path, header=None)

        if df.empty:
            return {}
        if df.shape[1] < 2:
            raise ValueError(
                f"mpp_list {path} must have at least two columns (filename, mpp)"
            )
        # Takes the first two columns (no header) as plain (filename, mpp) tuples.
        rows = list(df.iloc[:, :2].itertuples(index=False, name=None))
    else:
        raise ValueError(f"Unsupported mpp_list format: {path.suffix}")

    overrides: dict[str, SlideMPP] = {}
    skipped = 0
    for raw_name, raw_mpp in rows:
        # pd.isna catches None/NaN cells; str() on those would otherwise
        # produce bogus keys such as "nan".
        name = "" if pd.isna(raw_name) else str(raw_name).strip()
        try:
            mpp = float(raw_mpp)
        except (TypeError, ValueError):
            mpp = math.nan
        # An MPP must be a real, positive physical size.
        if not name or not math.isfinite(mpp) or mpp <= 0:
            skipped += 1
            continue
        overrides[name] = SlideMPP(mpp)

    if skipped:
        logging.getLogger(__name__).warning(
            "Skipped %d invalid row(s) in mpp_list %s", skipped, path
        )
    return overrides
7 changes: 7 additions & 0 deletions src/stamp/preprocessing/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ class PreprocessingConfig(BaseModel, arbitrary_types_allowed=True):
wsi_list: Path | None = Field(
default=None, description="Txt, Excel or CSV to read data filename from"
)
mpp_list: Path | None = Field(
default=None,
description=(
"Optional per-slide MPP list. "
"Only used if MPP extraction from slide metadata fails, as a last resort."
),
)
cache_dir: Path | None = None
cache_tiles_ext: ImageExtension = "jpg"
tile_size_um: Microns = Microns(256.0)
Expand Down
Loading