diff --git a/src/stamp/__main__.py b/src/stamp/__main__.py
index 4ab8416..14b313e 100755
--- a/src/stamp/__main__.py
+++ b/src/stamp/__main__.py
@@ -82,6 +82,7 @@ def _run_cli(args: argparse.Namespace) -> None:
             output_dir=config.preprocessing.output_dir,
             wsi_dir=config.preprocessing.wsi_dir,
             wsi_list=config.preprocessing.wsi_list,
+            mpp_list=config.preprocessing.mpp_list,
             cache_dir=config.preprocessing.cache_dir,
             tile_size_um=config.preprocessing.tile_size_um,
             tile_size_px=config.preprocessing.tile_size_px,
diff --git a/src/stamp/config.yaml b/src/stamp/config.yaml
index 796140a..03c151d 100644
--- a/src/stamp/config.yaml
+++ b/src/stamp/config.yaml
@@ -38,9 +38,15 @@ preprocessing:
   # Used to distinguish features extracted with different code versions.
   #generate_hash: True

-  # A list (.txt, .xlsx or .csv) specifying filenames to preprocess
+  # A list (.txt, .xlsx or .csv) specifying filenames (without file extension) to preprocess
   # instead of processing all data in the directory.
-  #slide_list: "list.csv"
+  #wsi_list: "path/to/wsi_list.txt"
+
+  # Optional per-slide MPP override list (.txt, .xlsx or .csv).
+  # Each row: path/to/filename.suffix, mpp_x
+  # If provided, the value is used as the slide's fallback MPP, i.e. it only
+  # takes effect if MPP extraction from the slide metadata fails.
+  #mpp_list: "path/to/mpp_list.txt"

 crossval:
   output_dir: "/path/to/save/files/to"
diff --git a/src/stamp/preprocessing/__init__.py b/src/stamp/preprocessing/__init__.py
index a184452..2ff3931 100755
--- a/src/stamp/preprocessing/__init__.py
+++ b/src/stamp/preprocessing/__init__.py
@@ -119,6 +119,7 @@ def extract_(
     wsi_dir: Path,
     output_dir: Path,
     wsi_list: Path | None,
+    mpp_list: Path | None,
     cache_dir: Path | None,
     cache_tiles_ext: ImageExtension,
     extractor: ExtractorName | Extractor,
@@ -265,6 +266,16 @@ def extract_(
         perm = rng.permutation(len(slide_paths))
         slide_paths = [slide_paths[i] for i in perm]

+    # If an mpp_list file is given, load the per-slide MPP overrides
+    if mpp_list is not None:
+        mpp_lookup = _load_mpp_overrides_(mpp_list)
+        _logger.info(
+            f"Loaded {len(mpp_lookup)} MPP overrides from {mpp_list}. Don't forget QC."
+        )
+    else:
+        mpp_lookup = {}
+        _logger.info("No MPP override file provided.")
+
     for slide_path in (progress := tqdm(slide_paths)):
         progress.set_description(str(slide_path.relative_to(wsi_dir)))
         _logger.debug(f"processing {slide_path}")
@@ -280,6 +291,18 @@ def extract_(

         feature_output_path.parent.mkdir(parents=True, exist_ok=True)

+        # Determine the per-slide fallback MPP,
+        # defaulting to the global value (or None).
+        current_slide_fallback_mpp = default_slide_mpp
+
+        # If the slide path appears in the mpp_list, use the override value as the fallback MPP.
+        if str(slide_path) in mpp_lookup:
+            current_slide_fallback_mpp = mpp_lookup[str(slide_path)]
+
+        _logger.info(
+            f"Using fallback MPP {current_slide_fallback_mpp} for slide {slide_path.name}"
+        )
+
         try:
             ds = _TileDataset(
                 slide_path=slide_path,
@@ -292,7 +315,7 @@ def extract_(
                 max_workers=max_workers,
                 brightness_cutoff=brightness_cutoff,
                 canny_cutoff=canny_cutoff,
-                default_slide_mpp=default_slide_mpp,
+                default_slide_mpp=current_slide_fallback_mpp,
             )
             # Parallelism is implemented in the dataset iterator already, so one worker is enough!
             dl = DataLoader(ds, batch_size=64, num_workers=1, drop_last=False)
@@ -354,7 +377,7 @@ def extract_(
                 size=(512, 512),
                 coords_um=coords,
                 tile_size_um=tile_size_um,
-                default_slide_mpp=default_slide_mpp,
+                default_slide_mpp=current_slide_fallback_mpp,
             ).convert("RGB").save(thumbnail_path)


@@ -411,3 +434,35 @@ def _get_slide_paths(wsi_list: Path) -> set[str]:
     else:
         raise ValueError(f"Unsupported file type: {suf}")
     return slide_paths
+
+
+# Helper function to load per-slide MPP overrides from a file
+def _load_mpp_overrides_(path: Path) -> dict[str, SlideMPP]:
+    suf = path.suffix.lower()
+
+    if suf == ".csv":
+        sep = ","
+        df = pd.read_csv(path, sep=sep, header=None, comment="#")
+    elif suf in {".xlsx", ".xls"}:
+        df = pd.read_excel(path, header=None)
+    elif suf == ".txt":
+        out: dict[str, SlideMPP] = {}
+        for line in path.read_text(encoding="utf-8").splitlines():
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            fn, mpp_s = [x.strip() for x in line.split(",", 1)]
+            out[fn] = SlideMPP(float(mpp_s))
+        return out
+    else:
+        raise ValueError(f"Unsupported mpp_list format: {path.suffix}")
+
+    # Take the first two columns (no header expected)
+    df = df.iloc[:, :2].copy()
+    df.columns = ["Filename", "MPP_X"]
+
+    df["Filename"] = df["Filename"].astype(str).str.strip()
+    df["MPP_X"] = pd.to_numeric(df["MPP_X"], errors="coerce")
+    df = df[df["Filename"].ne("") & df["MPP_X"].notna()]
+
+    return {fn: SlideMPP(float(mpp)) for fn, mpp in zip(df["Filename"], df["MPP_X"])}
diff --git a/src/stamp/preprocessing/config.py b/src/stamp/preprocessing/config.py
index 244d70d..e1bc423 100644
--- a/src/stamp/preprocessing/config.py
+++ b/src/stamp/preprocessing/config.py
@@ -39,6 +39,13 @@ class PreprocessingConfig(BaseModel, arbitrary_types_allowed=True):
     wsi_list: Path | None = Field(
         default=None, description="Txt, Excel or CSV to read data filename from"
     )
+    mpp_list: Path | None = Field(
+        default=None,
+        description=(
+            "Optional per-slide MPP override list (txt, Excel or CSV). "
+            "Only used as a last resort if MPP extraction from the slide metadata fails."
+        ),
+    )
     cache_dir: Path | None = None
     cache_tiles_ext: ImageExtension = "jpg"
     tile_size_um: Microns = Microns(256.0)
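
Note (illustrative only, not part of the patch): a minimal Python sketch of the mpp_list layout that the CSV branch of the new _load_mpp_overrides_ helper accepts; the file name, slide names and MPP values below are made up.

    from pathlib import Path

    # Two columns, no header; "#" comments are skipped by the CSV branch.
    Path("mpp_list.csv").write_text(
        "# filename, mpp_x\n"
        "case_001.svs, 0.2425\n"
        "case_002.ndpi, 0.4542\n",
        encoding="utf-8",
    )
    # _load_mpp_overrides_(Path("mpp_list.csv")) would then return
    # {"case_001.svs": SlideMPP(0.2425), "case_002.ndpi": SlideMPP(0.4542)}.
    # Note that extract_() looks overrides up via str(slide_path), so the first
    # column has to match the slide paths exactly as seen during preprocessing.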