openpipelines-bio
diff --git a/‎CHANGELOG.md‎
Lines changed: 89 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 89 additions & 0 deletions
diff --git a/‎_viash.yaml‎
Lines changed: 1 addition & 2 deletions b/‎_viash.yaml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎resources_test_scripts/10x_5k_anticmv.sh‎
Lines changed: 64 additions & 10 deletions b/‎resources_test_scripts/10x_5k_anticmv.sh‎
Lines changed: 64 additions & 10 deletions
diff --git a/‎resources_test_scripts/annotation_test_data.sh‎
100755100644
Lines changed: 36 additions & 0 deletions b/‎resources_test_scripts/annotation_test_data.sh‎
100755100644
Lines changed: 36 additions & 0 deletions
diff --git a/‎resources_test_scripts/scgpt.sh‎
Lines changed: 1 addition & 2 deletions b/‎resources_test_scripts/scgpt.sh‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎src/annotate/celltypist/config.vsh.yaml‎
Lines changed: 4 additions & 7 deletions b/‎src/annotate/celltypist/config.vsh.yaml‎
Lines changed: 4 additions & 7 deletions
@@ -1,3 +1,84 @@
+# openpipelines 2.1.0
+
+## BREAKING CHANGES
+
+* Deprecation of `metadata/duplicate_obs` and `metadata/duplicate_var` components (PR #952).
+
+* Deprecation of `workflows/annotation/scgpt_integration_knn` component (PR #952).
+
+* `annotate/scanvi`: Remove scarches functionality from this component, as it is already covered in `integrate/scarches` (PR #986). 
+
+## NEW FUNCTIONALITY
+
+* `dataflow/concatenate_h5mu`: add `modality` parameter (PR #977).
+
+* `filter_with_scrublet`: add `expected_doublet_rate`, `stdev_doublet_rate`, `n_neighbors` and `sim_doublet_ratio` arguments (PR #974).
+
+* `feature_annotation/align_query_reference`: Added a component to align a query and reference dataset (PR #948, #958, #972).
+
+* `workflows/qc/qc` workflow: Added ribosomal gene detection (PR #961).
+
+* `workflows/rna/rna_singlesample`, `workflows/multiomics/process_samples` workflows: Added ribosomal gene detection (PR #968).
+
+* `scanvi`: enable CUDA acceleration (PR #969).
+
+* `workflows/annotation/scvi_knn` workflow: Cell-type annotation based on scVI integration followed by KNN label transfer (PR #954).
+
+* `convert/from_h5ad_to_seurat`: Add component to convert from h5ad to Seurat (PR #980).
+
+* `workflows/annotation/scanvi_scarches` workflow: Cell-type annotation based on scANVI integration and annotation with scArches for reference mapping (PR #898).
+
+* `integrate/scarches`: Implemented functionality to align the query dataset with the model registry and extend functionality to predict labels for scANVI models (PR #898).
+
+* `workflows/annotation/harmony_knn` workflow: Cell-type annotation based on harmony integration with KNN label transfer (PR #836).
+
+* `from_cellranger_multi_to_h5mu`: add support for `custom` modality (PR #982).
+
+* `integrate/scvi`: Enable passing any .var field for gene name information instead of .var index, using the `--var_gene_names` parameter (PR #986).
+
+## MAJOR CHANGES
+
+* Several components: when a component processes a single modality, only that modality is read into memory (PR #944).
+
+* The `transfer/publish` component is deprecated and will be removed in a future major release (PR #941).
+
+
+## MINOR CHANGES
+
+* Bump viash to `0.9.3` (PR #995).
+
+* Several workflows: refactor neighbors, leiden and UMAP in a separate subworkflow (PR #942 and PR #949). 
+
+* `grep_annotation_column` and `subset_obsp`: Fix compatibility for SciPy (PR #945).
+
+* `popv`: Pin numpy<2 after new release of scvi-tools (PR #946).
+
+* Various components (`scgpt` and `annotate`): Add resource labels (PR #947, PR #950).
+
+* `feature_annotation/highly_variable_features_scanpy`: Enable calculation of HVG on a subset of genes (PR #957, PR #959).
+
+* `integrate/scvi`, `integrate/totalvi` and `integrate/scarches`: update base image to nvcr.io/nvidia/pytorch:24.12-py3, pin scvi-tools version to 1.1.5, unpin jax and jaxlib version (PR #970).
+
+* `annotate/celltypist`: Enable passing any layer with log normalized counts, enforce checking whether counts are log normalized (PR #971).
+
+* `process_10xh5/filter_10xh5`: update container base to ubuntu 24.04 (PR #983).
+
+## BUG FIXES
+
+* `cluster/leiden`: Fix an issue where insufficient shared memory (size of `/dev/shm`) causes the processing to hang.  
+
+* `utils/subset_vars`: Convert .var column used for subsetting of dtype "boolean" to dtype "bool" when it doesn't contain NaN values (PR #959).
+
+* `resources_test_scripts/annotation_test_data.sh`: Add a layer to the annotation reference dataset with log normalized counts (PR #960).
+
+* `annotate/celltypist`: Fix missing values in annotation column caused by index misalignment (PR #976).
+
+* `workflows/annotation/scgpt_annotation` and `workflows/integrate/scgpt_leiden`: Parameterization of HVG flavor with default method `cell_ranger` instead of `seurat_v3` (PR #979).
+
+* `dataflow/merge`: Resolved an issue where merging two MuData objects with overlapping `var` or `obs` columns sometimes resulted in an unsupported nullable dtype (e.g. merging `pd.IntegerDtype` and `pd.FloatDtype`). These columns are now correctly cast to their native numpy dtypes before writing (PR #990).
+
+* `workflows/annotation/harmony_knn`: Only process RNA modality in the workflow (PR #988).
+
 # openpipelines 2.0.0
 
 ## BREAKING CHANGES
@@ -55,6 +136,8 @@
 
 * `scgpt/binning`: update handling of empty rows in sparse matrices (PR #875).
 
+* `dataflow/split_h5mu`: Update memory label from `lowmem` to `highmem` and cpu label from `singlecpu` to `lowcpu` (PR #930).
+
 # openpipelines 2.0.0-rc.2
 
 ## BUG FIXES
@@ -242,6 +325,12 @@
 
 * Update authorship of components (PR #835).
 
+# openpipelines 1.0.4
+
+## BUG FIXES
+
+* `scvi_leiden` workflow: fix the input layer argument of the workflow not being passed to the scVI component (PR #939, backported from PR #936 and PR #938). 
+
 # openpipelines 1.0.3
 
 ## BUG FIXES
 
@@ -1,4 +1,4 @@
-viash_version: 0.9.0
+viash_version: 0.9.3
 
 source: src
 target: target
@@ -21,7 +21,6 @@ info:
       dest: resources_test
 
 config_mods: |
-  .test_resources += {path: '/src/base/openpipelinetestutils', dest: 'openpipelinetestutils'}
   .resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}
   .runners[.type == 'nextflow'].directives.tag := '$id'
   .runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'
@@ -77,7 +77,6 @@ fi
 
 
 # Run mapping pipeline
-# TODO: Also include conversion to h5mu
 cat > /tmp/params.yaml << HERE
 param_list:
 - id: "$ID"
@@ -97,7 +96,6 @@ feature_reference: "$feature_reference"
 publish_dir: "$OUT/processed"
 HERE
 
-
 nextflow \
   run . \
   -main-script target/nextflow/mapping/cellranger_multi/main.nf \
@@ -107,12 +105,12 @@ nextflow \
   -c src/workflows/utils/labels.config \
   -c src/workflows/utils/errorstrat_ignore.config
 
-# Create h5mu
+# Convert to h5mu
 cat > /tmp/params.yaml << HERE
-id: "$ID"
-input: "$OUT/processed/10x_5k_anticmv.cellranger_multi.output.output"
+id: "$orig_sample_id"
+input: "$OUT/processed/10x_5k_anticmv.cellranger_multi.output"
 publish_dir: "$OUT/"
-output: "$orig_sample_id.h5mu"
+output: "*.h5mu"
 HERE
 
 nextflow \
@@ -123,17 +121,39 @@ nextflow \
   -params-file /tmp/params.yaml \
   -c src/workflows/utils/labels.config
 
+mv "$OUT/0.h5mu" "$OUT/${orig_sample_id}.h5mu"
+
+
+# run qc workflow
 cat > /tmp/params.yaml << HERE
 id: "$ID"
 input: "$OUT/$orig_sample_id.h5mu"
+var_name_mitochondrial_genes: mitochondrial
+var_name_ribosomal_genes: ribosomal
 publish_dir: "$OUT/"
-output: "${orig_sample_id}_mms.h5mu"
+output: "${orig_sample_id}_qc.h5mu"
 HERE
 
+nextflow \
+  run . \
+  -main-script target/nextflow/workflows/qc/qc/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params.yaml \
+  -c src/workflows/utils/labels.config
+
+
 # Run full pipeline
+cat > /tmp/params.yaml << HERE
+id: "$ID"
+input: "$OUT/${orig_sample_id}_qc.h5mu"
+publish_dir: "$OUT/"
+output: "${orig_sample_id}_mms.h5mu"
+HERE
+
 nextflow \
   run . \
-  -main-script src/workflows/multiomics/full_pipeline/main.nf \
+  -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \
   -resume \
   -profile docker,mount_temp \
   -params-file /tmp/params.yaml \
@@ -143,7 +163,41 @@ nextflow \
 fastqc_dir="$OUT/fastqc"
 mkdir -p "$fastqc_dir"
 
-./target/docker/qc/fastqc/fastqc \
+./target/executable/qc/fastqc/fastqc \
   --input "$raw_dir" \
   --mode "dir" \
-  --output "$fastqc_dir"
+  --output "$fastqc_dir"
+
+
+# Create a test dataset for the Custom modality
+# by just labeling the AB as custom
+feat_ref_name=$(basename $feature_reference)
+sed -e 's/Antibody Capture/Custom/g' "$feature_reference" > "/tmp/custom_${feat_ref_name}"
+
+cat > /tmp/params_custom.yaml << HERE
+param_list:
+- id: "$ID"
+  input: "$raw_dir"
+  library_id:
+    - "${orig_sample_id}_GEX_1_subset"
+    - "${orig_sample_id}_AB_subset"
+    - "${orig_sample_id}_VDJ_subset"
+  library_type:
+    - "Gene Expression"
+    - "Custom"
+    - "VDJ"
+
+gex_reference: "$genome_tar"
+feature_reference: "/tmp/custom_${feat_ref_name}"
+vdj_reference: "$vdj_ref"
+publish_dir: "$OUT/processed_with_custom"
+HERE
+
+nextflow \
+  run . \
+  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
+  -resume \
+  -profile docker,mount_temp \
+  -params-file /tmp/params_custom.yaml \
+  -c src/workflows/utils/labels.config \
+  -c src/workflows/utils/errorstrat_ignore.config
@@ -33,14 +33,25 @@ wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz?
 
 # Process Tabula Sapiens Blood reference h5ad
 # (Select one individual and 100 cells per cell type)
+# normalize and log1p transform data
 python <<HEREDOC
 import anndata as ad
+import scanpy as sc
 ref_adata = ad.read_h5ad("${OUT}/tmp_TS_Blood_filtered.h5ad")
 sub_ref_adata = ref_adata[ref_adata.obs["donor_assay"] == "TSP14_10x 3' v3"] 
 n=100
 s=sub_ref_adata.obs.groupby('cell_ontology_class').cell_ontology_class.transform('count')
 sub_ref_adata_final = sub_ref_adata[sub_ref_adata.obs[s>=n].groupby('cell_ontology_class').head(n).index]
 # assert sub_ref_adata_final.shape == (500, 58870)
+data_for_scanpy = ad.AnnData(X=sub_ref_adata_final.X)
+sc.pp.normalize_total(data_for_scanpy, target_sum=10000)
+sc.pp.log1p(
+    data_for_scanpy,
+    base=None,
+    layer=None,
+    copy=False,
+)  
+sub_ref_adata_final.layers["log_normalized"] = data_for_scanpy.X
 sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip')
 HEREDOC
 
@@ -79,3 +90,28 @@ rm "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz"
 
 find "${OUT}/Pretrained_model" ! -name "example_file_model*" -type f -exec rm -f {} +
 mv "${OUT}/Pretrained_model" "${OUT}/onclass_model"
+
+echo "> Creating SCVI model"
+viash run src/integrate/scvi/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5mu" \
+    --obs_batch "donor_id" \
+    --var_gene_names "ensemblid" \
+    --output "${OUT}/scvi_output.h5mu" \
+    --output_model "${OUT}/scvi_model" \
+    --max_epochs 5 \
+    --n_obs_min_count 10 \
+    --n_var_min_count 10
+
+echo "> Creating SCANVI model"
+viash run src/integrate/scanvi/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5mu" \
+    --var_gene_names "ensemblid" \
+    --obs_labels "cell_ontology_class" \
+    --scvi_model "${OUT}/scvi_model" \
+    --output "${OUT}/scanvi_output.h5mu" \
+    --output_model "${OUT}/scanvi_model" \
+    --max_epochs 5 
+
+rm "${OUT}/scanvi_output.h5mu"
+rm "${OUT}/scvi_output.h5mu"
+rm -r "${OUT}/Pretrained_model/"
@@ -102,7 +102,7 @@ viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml
   --layer "log_normalized" \
   --var_name_filter "scgpt_filter_with_hvg" \
   --n_top_features 1200 \
-  --flavor "seurat_v3"
+  --flavor "cell_ranger"
 
 echo "> Running scGPT cross check genes"
 viash run src/scgpt/cross_check_genes/config.vsh.yaml --engine docker -- \
@@ -133,4 +133,3 @@ echo "> Removing unnecessary files in test resources dir"
 find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete
 
 echo "> scGPT test resources are ready!"
-
@@ -25,7 +25,7 @@ argument_groups:
         required: false
       - name: "--input_layer"
         type: string
-        description: The layer in the input data to be used for cell type annotation if .X is not to be used. 
+        description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used. 
       - name: "--input_var_gene_names"
         type: string
         required: false
@@ -55,11 +55,6 @@ argument_groups:
         type: string
         description: The name of the adata obs column in the reference data containing cell type annotations.
         default: "cell_ontology_class"
-      - name: "--check_expression"
-        type: boolean_true
-        description: | 
-          Whether to check the expression of the reference dataset to the format reccomended by CellTypist.
-          CellTypist requires data to be log-normalized to 10000 counts per cell.
       - name: "--reference_var_gene_names"
         type: string
         required: false
@@ -164,4 +159,6 @@ engines:
     __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
 runners:
   - type: executable
-  - type: nextflow
+  - type: nextflow
+    directives:
+      label: [highcpu, highmem, highdisk]