KatherLab · drgmo · Jan 12, 2026 · Jan 13, 2026
diff --git a/src/stamp/__main__.py b/src/stamp/__main__.py
@@ -82,6 +82,7 @@ def _run_cli(args: argparse.Namespace) -> None:
                 output_dir=config.preprocessing.output_dir,
                 wsi_dir=config.preprocessing.wsi_dir,
                 wsi_list=config.preprocessing.wsi_list,
+                mpp_list=config.preprocessing.mpp_list,
                 cache_dir=config.preprocessing.cache_dir,
                 tile_size_um=config.preprocessing.tile_size_um,
                 tile_size_px=config.preprocessing.tile_size_px,

diff --git a/src/stamp/config.yaml b/src/stamp/config.yaml
@@ -38,9 +38,15 @@ preprocessing:
   # Used to distinguish features extracted with different code versions.
   #generate_hash: True
 
-  # A list (.txt, .xlsx or .csv) specifying filenames to preprocess 
+  # A list (.txt, .xlsx or .csv) specifying filenames (without file extension) to preprocess
   # instead of processing all data in the directory.
-  #slide_list: "list.csv"
+  wsi_list: "path/to/wsi_list.txt"
+
+  # Optional per-slide MPP override list (.txt, .xlsx or .csv), without a header row.
+  # Each row: path/to/filename.suffix, mpp_x. The path is compared as an exact string
+  # with the slide's path. If provided, this value is used as the slide's fallback MPP
+  # (i.e. if MPP extraction from slide metadata fails). Remove this key if not needed.
+  mpp_list: "path/to/mpp_list.txt"
 
 crossval:
   output_dir: "/path/to/save/files/to"

diff --git a/src/stamp/preprocessing/__init__.py b/src/stamp/preprocessing/__init__.py
@@ -119,6 +119,7 @@ def extract_(
     wsi_dir: Path,
     output_dir: Path,
     wsi_list: Path | None,
+    mpp_list: Path | None,
     cache_dir: Path | None,
     cache_tiles_ext: ImageExtension,
     extractor: ExtractorName | Extractor,
@@ -265,6 +266,16 @@ def extract_(
     perm = rng.permutation(len(slide_paths))
     slide_paths = [slide_paths[i] for i in perm]
 
+    # If mpp_list is given, load it
+    if mpp_list is not None:
+        mpp_lookup = _load_mpp_overrides_(mpp_list)
+        _logger.info(
+            f"Loaded {len(mpp_lookup)} MPP overrides from {mpp_list}. Don't forget QC."
+        )
+    else:
+        mpp_lookup = {}
+        _logger.info("No MPP override file provided.")
+
     for slide_path in (progress := tqdm(slide_paths)):
         progress.set_description(str(slide_path.relative_to(wsi_dir)))
         _logger.debug(f"processing {slide_path}")
@@ -280,6 +291,18 @@ def extract_(
 
         feature_output_path.parent.mkdir(parents=True, exist_ok=True)
 
+        # Determine per-slide fallback MPP
+        # Default to the global value (or None)
+        current_slide_fallback_mpp = default_slide_mpp
+
+        # If the slide's full path (as a string) is a key in mpp_lookup, use its value as fallback MPP
+        if str(slide_path) in mpp_lookup:
+            current_slide_fallback_mpp = mpp_lookup[str(slide_path)]
+
+            _logger.info(
+                f"MPP-value from mpp_list: {current_slide_fallback_mpp} for slide {slide_path.name}"
+            )
+
         try:
             ds = _TileDataset(
                 slide_path=slide_path,
@@ -292,7 +315,7 @@ def extract_(
                 max_workers=max_workers,
                 brightness_cutoff=brightness_cutoff,
                 canny_cutoff=canny_cutoff,
-                default_slide_mpp=default_slide_mpp,
+                default_slide_mpp=current_slide_fallback_mpp,
             )
             # Parallelism is implemented in the dataset iterator already, so one worker is enough!
             dl = DataLoader(ds, batch_size=64, num_workers=1, drop_last=False)
@@ -354,7 +377,7 @@ def extract_(
             size=(512, 512),
             coords_um=coords,
             tile_size_um=tile_size_um,
-            default_slide_mpp=default_slide_mpp,
+            default_slide_mpp=current_slide_fallback_mpp,
         ).convert("RGB").save(thumbnail_path)
 
 
@@ -411,3 +434,75 @@ def _get_slide_paths(wsi_list: Path) -> set[str]:
     else:
         raise ValueError(f"Unsupported file type: {suf}")
     return slide_paths
+
+
+def _load_mpp_overrides_(path: Path) -> dict[str, SlideMPP]:
+    """Load per-slide MPP overrides from a headerless ``name, mpp`` list.
+
+    For tables (.csv / Excel) only the first two columns are read. Keys are the
+    whitespace-stripped first-column values, returned without path normalisation.
+
+    Args:
+        path: A ``.csv``, ``.xlsx``/``.xls`` or ``.txt`` file. In ``.txt`` files,
+            blank lines and lines starting with ``#`` are skipped.
+
+    Returns:
+        Mapping from first-column value to its MPP.
+
+    Raises:
+        ValueError: If the suffix is unsupported, a ``.txt`` line is not a valid
+            ``name, mpp`` pair, or a non-empty table has fewer than two columns.
+    """
+    suf = path.suffix.lower()
+
+    if suf == ".txt":
+        out: dict[str, SlideMPP] = {}
+        lines = path.read_text(encoding="utf-8").splitlines()
+        for line_no, raw in enumerate(lines, start=1):
+            line = raw.strip()
+            if not line or line.startswith("#"):
+                continue
+            fn, _, mpp_s = line.partition(",")
+            try:
+                # A missing separator leaves mpp_s empty, which float() rejects too.
+                mpp = float(mpp_s)
+            except ValueError as err:
+                # Same exception type as before, but pointing at the offending line.
+                raise ValueError(
+                    f"{path}:{line_no}: expected '<filename>, <mpp>', got {line!r}"
+                ) from err
+            out[fn.strip()] = SlideMPP(mpp)
+        return out
+
+    if suf == ".csv":
+        try:
+            df = pd.read_csv(path, sep=",", header=None, comment="#")
+        except pd.errors.EmptyDataError:
+            # Nothing but blank/comment lines: same result as an empty .txt.
+            return {}
+    elif suf in {".xlsx", ".xls"}:
+        df = pd.read_excel(path, header=None)
+    else:
+        raise ValueError(f"Unsupported mpp_list format: {path.suffix}")
+
+    if df.empty:
+        return {}
+    if df.shape[1] < 2:
+        raise ValueError(
+            f"mpp_list {path} needs two columns (filename, MPP), found {df.shape[1]}"
+        )
+
+    names = df.iloc[:, 0]
+    mpps = pd.to_numeric(df.iloc[:, 1], errors="coerce")
+    # Mask missing names before astype(str), which can turn NaN into "nan".
+    valid = names.notna() & mpps.notna()
+    names = names[valid].astype(str).str.strip()
+    keep = names.ne("")
+    names, mpps = names[keep], mpps[valid][keep]
+
+    if dropped := len(df) - len(names):
+        _logger.warning(
+            f"Ignored {dropped} row(s) of {path} lacking a filename or numeric MPP."
+        )
+
+    return {fn: SlideMPP(float(mpp)) for fn, mpp in zip(names, mpps)}
diff --git a/src/stamp/preprocessing/config.py b/src/stamp/preprocessing/config.py
@@ -39,6 +39,13 @@ class PreprocessingConfig(BaseModel, arbitrary_types_allowed=True):
     wsi_list: Path | None = Field(
         default=None, description="Txt, Excel or CSV to read data filename from"
     )
+    mpp_list: Path | None = Field(
+        default=None,
+        description=(
+            "Optional per-slide MPP list. "
+            "Only used if MPP extraction from slide metadata fails, as a last resort."
+        ),
+    )
     cache_dir: Path | None = None
     cache_tiles_ext: ImageExtension = "jpg"
     tile_size_um: Microns = Microns(256.0)