Commit 60028cb

deploy: dfb5754
1 parent eed65a5 commit 60028cb

1,196 files changed: 47,096 additions, 20,557 deletions


.git-blame-ignore-revs

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# Apply ruff as linter for python code
+9e87b864f699c371b444b592a19e610a3c9d3286
+
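
For reference, git only honours this file once it is wired up; a minimal sketch using standard git options (the blamed file path is a placeholder):

# configure once per clone so the formatting-only commit above is skipped by blame
git config blame.ignoreRevsFile .git-blame-ignore-revs

# or pass the file for a single invocation (src/example.py is a placeholder)
git blame --ignore-revs-file .git-blame-ignore-revs src/example.py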

.github/workflows/integration-test.yml

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ jobs:

       - uses: viash-io/viash-actions/setup@v6

-      - uses: nf-core/setup-nextflow@v2.0.0
+      - uses: nf-core/setup-nextflow@v2.1.4

       # use cache
       - name: Cache resources data

.github/workflows/release-build.yml

Lines changed: 3 additions & 3 deletions
@@ -60,7 +60,7 @@ jobs:

       - uses: viash-io/viash-actions/setup@v6

-      - uses: nf-core/setup-nextflow@v2.0.0
+      - uses: nf-core/setup-nextflow@v2.1.4

       # use cache
       - name: Cache resources data
@@ -186,13 +186,13 @@ jobs:
           password: ${{ secrets.GTHB_PAT }}

       - name: Test component
-        timeout-minutes: 30
+        timeout-minutes: 40
         run: |
           viash test \
             "${{ matrix.component.config }}" \
             --config_mod ".engines[.type == 'docker'].image := 'ghcr.io/openpipelines-bio/openpipeline/${{ matrix.component.namespace }}${{matrix.component.namespace_separator}}${{ matrix.component.name }}:${{ github.event.inputs.version_tag }}'" \
             --config_mod ".engines[.type == 'docker'].setup := []" \
             --cpus 4 \
-            --memory "12gb" \
+            --memory "14gb" \
             --engine docker \
             --runner executable

.github/workflows/viash-test.yml

Lines changed: 38 additions & 0 deletions
@@ -10,6 +10,44 @@ concurrency:
   cancel-in-progress: ${{ !contains(github.ref, 'main')}}

 jobs:
+  linting:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff
+      - name: Run Ruff
+        run: ruff check --output-format=github .
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          packages: any::lintr, any::styler, any::roxygen2
+          needs: lint, styler
+
+      - name: Lint
+        run: lintr::lint_dir(path = ".")
+        shell: Rscript {0}
+        env:
+          LINTR_ERROR_ON_LINT: true
+
+      - name: Style
+        run: styler::style_dir(dry = "off")
+        shell: Rscript {0}
+
+
   # phase 1
   list:
     env:
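
The new linting job can be reproduced locally from the repository root; a minimal sketch, assuming ruff, R, and the lintr/styler packages are already installed:

# Python linting, same invocation as the workflow's Ruff step
ruff check --output-format=github .

# R linting and styling, mirroring the workflow's Rscript steps
Rscript -e 'lintr::lint_dir(path = ".")'
Rscript -e 'styler::style_dir(dry = "off")'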

.pre-commit-config.yaml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.8.1
+    hooks:
+      - id: ruff
+      - id: ruff-format
+  - repo: local
+    hooks:
+      - id: run_styler
+        name: run_styler
+        language: r
+        description: style files with {styler}
+        entry: "Rscript -e 'styler::style_file(commandArgs(TRUE))'"
+        files: '(\.[rR]profile|\.[rR]|\.[rR]md|\.[rR]nw|\.[qQ]md)$'
+        additional_dependencies:
+          - styler
+          - knitr
+  - repo: https://github.com/lorenzwalthert/precommit
+    rev: v0.4.3
+    hooks:
+      - id: lintr
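
With this config committed, the hooks run on staged files via the standard pre-commit CLI; a minimal sketch:

pip install pre-commit
pre-commit install            # register the git hook in this clone
pre-commit run --all-files    # one-off run across the whole repository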

CHANGELOG.md

Lines changed: 59 additions & 2 deletions
@@ -1,12 +1,69 @@
+# openpipelines 2.0.0
+
+## BREAKING CHANGES
+
+* `velocity/scvelo`: update `scvelo` to `0.3.3`, which also removes support for using `loom` input files. The component now uses a `MuData` object as input. Several arguments were added to support selecting different inputs from the MuData file: `counts_layer`, `modality`, `layer_spliced`, `layer_unspliced`, `layer_ambiguous`. An `output_h5mu` argument has been added (PR #932).
+
+* `src/annotate/onclass` and `src/annotate/celltypist`: Input parameters for gene name layers of the input datasets have been updated to `--input_var_gene_names` and `--reference_var_gene_names` (PR #919).
+
+* Several components under `src/scgpt` (`cross_check_genes`, `tokenize_pad`, `binning`) now process the input (query) datasets differently. Instead of subsetting datasets based on genes in the model vocabulary and/or highly variable genes, these components require an input .var column with a boolean mask specifying this information. The results are written back to the original input data, preserving the dataset structure (PR #832).
+
+* `query/cellxgene_census`: The default output layer has been changed from `.layers["counts"]` to `.X` to be more aligned with the standard OpenPipelines format (PR #933).
+  Use argument `--output_layer_counts counts` to revert the behaviour to the previous default.
+
+## NEW FUNCTIONALITY
+
+* `velocyto_to_h5mu`: now writes counts to `.X` (PR #932).
+
+* `qc/calculate_atac_qc_metrics`: new component for calculating ATAC QC metrics (PR #868).
+
+* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).
+
+* `workflows/annotation/scgpt_integration_knn` workflow: Cell-type annotation based on scGPT integration with KNN label transfer (PR #875).
+
+* CI: Use `params.resources_test` in test workflows in order to point to an alternative location (e.g. a cache) (PR #889).
+
+## MINOR CHANGES
+
+* Pin `scikit-learn` for `labels_transfer/xgboost` to `<1.6` (PR #931).
+
+* `filter/filter_with_scrublet`: provide a cleaner error message when running scrublet on an empty modality (PR #929).
+
+* Several components (cleanup): remove workaround for being able to use shared utility functions with Nextflow Fusion (PR #920).
+
+* `scgpt/cell_type_annotation` component update: Added support for multi-processing (PR #832).
+
+* Several annotation (`src/annotate/`) components (`onclass`, `celltypist`, `random_forest_annotation`, `scanvi`, `svm_annotation`): Updated input parameters to ensure uniformity across components, implemented functionality to cross-check the overlap of genes between query and reference (model) datasets, and implemented logic to allow for subsetting of genes (PR #919).
+
+* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).
+
+* `scgpt/cross_check_genes` component update: Highly variable genes are now cross-checked based on the boolean mask in `var_input`. The filtering information is stored in the `--output_var_filter` .var field instead of subsetting the dataset (PR #832).
+
+* `scgpt/binning` component update: This component now requires the `--var_input` parameter to provide gene filtering information. Binned data is written to the `--output_obsm_binned_counts` .obsm field in the original input data (PR #832).
+
+* `scgpt/pad_tokenize` component update: Genes are padded and tokenized based on filtering information in `--var_input` and `--input_obsm_binned_counts` (PR #832).
+
+* `resources_test_scripts/scgpt.sh`: Update scGPT test resources to avoid subsetting of datasets (PR #926).
+
+* `workflows/integration/scgpt_leiden` workflow update: Update the workflow such that the input dataset is not subsetted for HVG but uses boolean masks in the .var field instead (PR #875).
+
+## BUG FIXES
+
+* `scvi_leiden` workflow: fix the input layer argument of the workflow not being passed to the scVI component (PR #936 and PR #938).
+
+* `scgpt/embedding`: remove unused argument `dbsn` (PR #875).
+
+* `scgpt/binning`: update handling of empty rows in sparse matrices (PR #875).
+
 # openpipelines 2.0.0-rc.2

 ## BUG FIXES

-* `annotate/popv`: fix popv raising `ValueError` when an accelerator (e.g. GPU) is unavailable (PR #918, backported from PR #915).
+* `annotate/popv`: fix popv raising `ValueError` when an accelerator (e.g. GPU) is unavailable (PR #915).

 ## MINOR CHANGES

-* `dataflow/split_h5mu`: Optimize resource usage of the component (PR #917, backported from PR #913).
+* `dataflow/split_h5mu`: Optimize resource usage of the component (PR #913).

 # openpipelines 2.0.0-rc.1
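
To illustrate the `query/cellxgene_census` breaking change above, reverting to the old default only requires the documented flag. A hedged sketch: the config path follows the repo's src/<namespace>/<name>/config.vsh.yaml convention and the --output argument is a placeholder; only --output_layer_counts comes from the changelog:

# restore counts to .layers["counts"] (config path and --output are assumptions)
viash run src/query/cellxgene_census/config.vsh.yaml -- \
  --output_layer_counts counts \
  --output census_query.h5mu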

_viash.yaml

Lines changed: 0 additions & 2 deletions
@@ -1,7 +1,5 @@
 viash_version: 0.9.0

-version: dev
-
 source: src
 target: target

resources_test_scripts/rna_velocity.sh

Lines changed: 9 additions & 3 deletions
@@ -1,7 +1,5 @@
 #!/bin/bash

-set -eo pipefail
-
 # ensure that the command below is run from the root of the repository
 REPO_ROOT=$(git rev-parse --show-toplevel)
 cd "$REPO_ROOT"
@@ -19,7 +17,7 @@ mkdir -p "$velocyto_dir"
 # Create a compatible BAM file from BD Rhapsody Output #
 ########################################################

-bd_rhap_wta_bam="resources_test/bdrhap_5kjrt/processed/WTA.bd_rhapsody.output_raw/sample_final.BAM"
+bd_rhap_wta_bam="resources_test/bdrhap_5kjrt/processed/output_raw/Combined_sample_Bioproduct.bam"

 if [[ ! -f "$bd_rhap_wta_bam" ]]; then
   echo "$bd_rhap_wta_bam does not exist. Please generate BD Rhapsody test data first."
@@ -52,3 +50,11 @@ viash run src/velocity/velocyto/config.vsh.yaml -- \
   -i "$bam" \
   -o "$OUT/velocyto_processed/cellranger_tiny.loom" \
   --transcriptome "$gtf"
+
+echo "> Converting loom file to MuData object"
+viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \
+  --input_loom "$OUT/velocyto_processed/cellranger_tiny.loom" \
+  --input_h5mu "resources_test/cellranger_tiny_fastq/raw_dataset.h5mu" \
+  --modality velocyto \
+  --output_compression "gzip" \
+  --output "$OUT/velocyto_processed/velocyto.h5mu"

resources_test_scripts/scgpt.sh

File mode changed: 100644 → 100755
Lines changed: 50 additions & 25 deletions
@@ -11,6 +11,12 @@ OUT=resources_test/$ID
 # create foundational model directory
 foundation_model_dir="$OUT/source"
 mkdir -p "$foundation_model_dir"
+export foundation_model_dir
+
+# create finetuned model directory
+finetuned_model_dir="$OUT/finetuned_model"
+mkdir -p "$finetuned_model_dir"
+export finetuned_model_dir

 # install gdown if necessary
 # Check whether gdown is available
@@ -19,13 +25,39 @@ if ! command -v gdown &> /dev/null; then
   exit 1
 fi

+# install torch if necessary
+# Check whether torch is available
+if ! python -c "import torch"; then
+  echo "This script requires torch. Please make sure it is available in your python environment."
+  exit 1
+fi
+
 echo "> Downloading scGPT foundation model (full_human)"
 # download foundational model files (full_human)
 # https://drive.google.com/drive/folders/1oWh_-ZRdhtoGQ2Fw24HP41FgLoomVo-y
 gdown '1H3E_MJ-Dl36AQV6jLbna2EdvgPaqvqcC' -O "${foundation_model_dir}/vocab.json"
 gdown '1hh2zGKyWAx3DyovD30GStZ3QlzmSqdk1' -O "${foundation_model_dir}/args.json"
 gdown '14AebJfGOUF047Eg40hk57HCtrb0fyDTm' -O "${foundation_model_dir}/best_model.pt"

+echo "> Converting to finetuned model format"
+python <<HEREDOC
+import torch
+import mudata
+import os
+
+foundation_model_dir = os.environ.get('foundation_model_dir')
+finetuned_model_dir = os.environ.get('finetuned_model_dir')
+
+found_model_path = f"{foundation_model_dir}/best_model.pt"
+ft_model_path = f"{finetuned_model_dir}/best_model.pt"
+
+f_model_dict = torch.load(found_model_path, map_location="cpu")
+model_dict = {}
+model_dict["model_state_dict"] = f_model_dict
+model_dict["id_to_class"] = {k: str(k) for k in range(15)}
+torch.save(model_dict, ft_model_path)
+HEREDOC
+
 # create test data dir
 test_resources_dir="$OUT/test_resources"
 mkdir -p "$test_resources_dir"
@@ -45,12 +77,13 @@ input_mdata.write_h5mu("${test_resources_dir}/Kim2020_Lung.h5mu")
 HEREDOC

 echo "> Subsetting datasets"
-viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
+viash run src/filter/subset_h5mu/config.vsh.yaml --engine docker -- \
   --input "${test_resources_dir}/Kim2020_Lung.h5mu" \
   --output "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \
   --number_of_observations 4000

 rm "${test_resources_dir}/Kim2020_Lung.h5ad"
+rm "${test_resources_dir}/Kim2020_Lung.h5mu"

 echo "> Preprocessing datasets"
 nextflow \
@@ -63,46 +96,38 @@ nextflow \
   --publish_dir "${test_resources_dir}"

 echo "> Filtering highly variable features"
-viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/iKim2020_Lung_subset_preprocessed.h5mu" \
+viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml --engine docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \
   --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
   --layer "log_normalized" \
-  --var_name_filter "filter_with_hvg" \
+  --var_name_filter "scgpt_filter_with_hvg" \
   --n_top_features 1200 \
   --flavor "seurat_v3"
-
-viash run src/filter/do_filter/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
-  --output "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \
-  --var_filter "filter_with_hvg"

 echo "> Running scGPT cross check genes"
-viash run src/scgpt/cross_check_genes/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \
+viash run src/scgpt/cross_check_genes/config.vsh.yaml --engine docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
   --output "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \
-  --vocab_file "${foundation_model_dir}/vocab.json"
+  --vocab_file "${foundation_model_dir}/vocab.json" \
+  --var_input "scgpt_filter_with_hvg" \
+  --output_var_filter "scgpt_cross_checked_genes"

 echo "> Running scGPT binning"
-viash run src/scgpt/binning/config.vsh.yaml -p docker -- \
+viash run src/scgpt/binning/config.vsh.yaml --engine docker -- \
   --input "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \
   --input_layer "log_normalized" \
-  --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu"
+  --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \
+  --output_obsm_binned_counts "binned_counts" \
+  --var_input "scgpt_cross_checked_genes"

 echo "> Running scGPT tokenizing"
-viash run src/scgpt/pad_tokenize/config.vsh.yaml -p docker -- \
+viash run src/scgpt/pad_tokenize/config.vsh.yaml --engine docker -- \
   --input "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \
-  --input_layer "binned" \
+  --input_obsm_binned_counts "binned_counts" \
   --output "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \
-  --model_vocab "${foundation_model_dir}/vocab.json"
-
-echo "> Running scGPT integration"
-viash run src/scgpt/embedding/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \
-  --output "${test_resources_dir}/Kim2020_Lung_subset_scgpt_integrated.h5mu" \
-  --model "${foundation_model_dir}/best_model.pt" \
   --model_vocab "${foundation_model_dir}/vocab.json" \
-  --model_config "${foundation_model_dir}/args.json" \
-  --obs_batch_label "sample"
+  --var_input "scgpt_cross_checked_genes" \
+

 echo "> Removing unnecessary files in test resources dir"
 find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete
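
Given the mode change above (100644 → 100755), the script can be invoked directly; per the new checks in the diff it expects gdown and torch in the active Python environment:

# from the repository root; gdown and torch are prerequisites checked by the script
./resources_test_scripts/scgpt.sh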

ruff.toml

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".git",
+    ".pyenv",
+    ".pytest_cache",
+    ".ruff_cache",
+    ".venv",
+    ".vscode",
+    "__pypackages__",
+    "_build",
+    "build",
+    "dist",
+    "node_modules",
+    "site-packages",
+]
+
+builtins = ["meta"]
+
+
+
+
+[format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
+
+[lint.flake8-pytest-style]
+fixture-parentheses = false
+mark-parentheses = false
+
+[lint]
+ignore = [
+    # module level import not at top of file
+    "E402"
+]
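
With this file at the repository root, ruff picks it up automatically; a minimal sketch of the two entry points the new CI job and pre-commit hooks rely on (the builtins = ["meta"] entry presumably stops ruff flagging the meta variable that viash injects into component scripts as an undefined name):

ruff check .    # lint, honouring the [lint] ignore list above
ruff format .   # apply the Black-style [format] settings above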
