Skip to content

Commit d6b7b43

Browse files
kishwarshafin authored and pichuan committed
Add masseq to run_deepvariant
PiperOrigin-RevId: 702021003
1 parent 27075bd commit d6b7b43

File tree

6 files changed

+63
-53
lines changed

6 files changed

+63
-53
lines changed

Dockerfile

+9
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,15 @@ ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/sav
198198
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.ont.savedmodel/variables/variables.index .
199199
RUN chmod -R +r /opt/models/ont_r104/*
200200

201+
WORKDIR /opt/models/masseq
202+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/fingerprint.pb .
203+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/saved_model.pb .
204+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/example_info.json .
205+
WORKDIR /opt/models/masseq/variables
206+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/variables/variables.data-00000-of-00001 .
207+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/variables/variables.index .
208+
RUN chmod -R +r /opt/models/masseq/*
209+
201210
# Copy small models
202211
WORKDIR /opt/smallmodels/wgs
203212
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/smallmodels/deepvariant.wgs.smallmodel/fingerprint.pb .

docs/deepvariant-masseq-case-study.md

+26-39
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ steps.
2222
Lets first create directories to organize files.
2323

2424
```bash
25-
mkdir -p data benchmark reference output happy
25+
mkdir -p input benchmark reference output happy
2626
```
2727

2828
### Download the GRCh38 Reference
@@ -56,8 +56,8 @@ For this case study, we download the chr20 of a HG004 MAS-Seq BAM.
5656
```bash
5757
HTTPDIR=https://storage.googleapis.com/deepvariant/masseq-case-study
5858

59-
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam > data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam
60-
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai > data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai
59+
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam > input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam
60+
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai > input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai
6161
```
6262

6363

@@ -69,58 +69,42 @@ include regions where the BAM file has 10x or more coverage.
6969
```bash
7070
HTTPDIR=https://storage.googleapis.com/deepvariant/masseq-case-study
7171

72-
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed > data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed
73-
```
74-
75-
76-
77-
78-
### Download the MAS-Seq model
79-
80-
Finally, lets download the MAS-Seq model that we will use to call variants.
81-
82-
```bash
83-
gsutil cp -R gs://deepvariant/models/DeepVariant/1.8.0/savedmodels/deepvariant.masseq.savedmodel .
72+
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed > input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed
8473
```
8574

8675
### Running DeepVariant MAS-Seq on a CPU-only machine
8776

8877
The command below will run the DeepVariant MAS-Seq model and produce an output
89-
VCF (`output/out.vcf.gz`).
78+
VCF.
9079

9180
```bash
92-
BIN_VERSION="head687331500"
81+
BIN_VERSION="1.8.0"
9382

9483
sudo docker run \
95-
-v "$(pwd):$(pwd)" \
96-
-w $(pwd) \
84+
-v "${PWD}/input":"/input" \
85+
-v "${PWD}/output":"/output" \
86+
-v "${PWD}/reference":"/reference" \
9787
google/deepvariant:"${BIN_VERSION}" \
9888
run_deepvariant \
99-
--model_type=PACBIO \
100-
--customized_model=deepvariant.masseq.savedmodel \
101-
--ref=reference/GRCh38_no_alt_analysis_set.fasta \
102-
--reads=data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam \
103-
--output_vcf=output/HG004.output.vcf.gz \
89+
--model_type=MASSEQ \
90+
--ref=/reference/GRCh38_no_alt_analysis_set.fasta \
91+
--reads=/input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam \
92+
--output_vcf=/output/HG004.output.vcf.gz \
10493
--num_shards=$(nproc) \
10594
--regions=chr20 \
106-
--make_examples_extra_args="phase_reads=true,sort_by_haplotypes=true,parse_sam_aux_fields=true,realign_reads=false,vsc_min_fraction_indels=0.12,alt_aligned_pileup=diff_channels,trim_reads_for_pileup=true,pileup_image_width=199,min_mapping_quality=1,track_ref_reads=true,partition_size=25000,max_reads_per_partition=0,max_reads_for_dynamic_bases_per_region=1500" \
107-
--disable_small_model=true \
108-
--intermediate_results_dir=output/intermediate_results_dir
95+
--intermediate_results_dir=/output/intermediate_results_dir
10996
```
11097

11198
**Flag summary**
11299

113-
* `--model_type` - Sets the model and options, but we will override the model
114-
with `--customized model`.
100+
* `--model_type` - Sets the model and options for MAS-Seq data.
115101
* `--customized_model` - Points to a model trained using MAS-Seq data.
116102
* `--ref` - Specifies the reference sequence.
117103
* `--reads` - Specifies the input bam file.
118104
* `--output_vcf` - Specifies the output variant file.
119105
* `--num_shards` - Sets the number of shards to the number of available
120106
processors (`$(nproc)`). This is used to perform parallelization.
121107
* `--regions` - Restricts to chr20 to make this case study faster.
122-
* `--make_examples_extra_args=` - Passes additional arguments to
123-
make_examples.
124108
* `--intermediate_results_dir` - Outputs results to an intermediate directory.
125109
This is optional. If you don't need the intermediate files, no need to
126110
specify this flag.
@@ -132,18 +116,21 @@ For running on GPU machines, or using Singularity instead of Docker, see
132116

133117
```bash
134118
sudo docker run \
135-
-v $(pwd):$(pwd) \
136-
-w $(pwd) \
119+
-v "${PWD}/benchmark":"/benchmark" \
120+
-v "${PWD}/input":"/input" \
121+
-v "${PWD}/output":"/output" \
122+
-v "${PWD}/reference":"/reference" \
123+
-v "${PWD}/happy:/happy" \
137124
jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \
138-
benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
139-
output/HG004.output.vcf.gz \
140-
-f benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \
141-
-r reference/GRCh38_no_alt_analysis_set.fasta \
142-
-o happy/happy.output \
125+
/benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
126+
/output/HG004.output.vcf.gz \
127+
-f /benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \
128+
-r /reference/GRCh38_no_alt_analysis_set.fasta \
129+
-o /happy/happy.output \
143130
--engine=vcfeval \
144131
--pass-only \
145132
-l chr20 \
146-
--target-regions=data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed \
133+
--target-regions=/input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed \
147134
--threads=$(nproc)
148135
```
149136

scripts/run_deepvariant.py

+16
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ class ModelType(enum.Enum):
6060
PACBIO = 'PACBIO'
6161
ONT_R104 = 'ONT_R104'
6262
HYBRID_PACBIO_ILLUMINA = 'HYBRID_PACBIO_ILLUMINA'
63+
MASSEQ = 'MASSEQ'
6364

6465

6566
# Required flags.
@@ -277,6 +278,7 @@ class ModelType(enum.Enum):
277278
ModelType.PACBIO: '/opt/models/pacbio',
278279
ModelType.ONT_R104: '/opt/models/ont_r104',
279280
ModelType.HYBRID_PACBIO_ILLUMINA: '/opt/models/hybrid_pacbio_illumina',
281+
ModelType.MASSEQ: '/opt/models/masseq',
280282
}
281283

282284

@@ -495,6 +497,20 @@ def make_examples_command(
495497
special_args['trim_reads_for_pileup'] = True
496498
elif model_type == ModelType.HYBRID_PACBIO_ILLUMINA:
497499
special_args['trim_reads_for_pileup'] = True
500+
elif model_type == ModelType.MASSEQ:
501+
special_args['alt_aligned_pileup'] = 'diff_channels'
502+
special_args['max_reads_per_partition'] = 0
503+
special_args['min_mapping_quality'] = 1
504+
special_args['parse_sam_aux_fields'] = True
505+
special_args['partition_size'] = 25000
506+
special_args['phase_reads'] = True
507+
special_args['pileup_image_width'] = 199
508+
special_args['realign_reads'] = False
509+
special_args['sort_by_haplotypes'] = True
510+
special_args['track_ref_reads'] = True
511+
special_args['vsc_min_fraction_indels'] = 0.12
512+
special_args['trim_reads_for_pileup'] = True
513+
special_args['max_reads_for_dynamic_bases_per_region'] = 1500
498514

499515
_set_small_model_config(
500516
special_args, model_type, _CUSTOMIZED_SMALL_MODEL.value

third_party/nucleus/io/gfile.cc

+5-6
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,13 @@ namespace nucleus {
4141

4242
bool Exists(const std::string& filename) {
4343
// FileExists sets s to tensorflow::error::NOT_FOUND if it doesn't exist.
44-
tensorflow::Status s = tensorflow::Env::Default()->FileExists(filename);
44+
absl::Status s = tensorflow::Env::Default()->FileExists(filename);
4545
return s.ok();
4646
}
4747

4848
std::vector<std::string> Glob(const std::string& pattern) {
4949
std::vector<std::string> results;
50-
::tensorflow::Status s =
50+
absl::Status s =
5151
tensorflow::Env::Default()->GetMatchingPaths(pattern, &results);
5252
return results;
5353
}
@@ -56,7 +56,7 @@ ReadableFile::ReadableFile() {}
5656

5757
std::unique_ptr<ReadableFile> ReadableFile::New(const std::string& filename) {
5858
std::unique_ptr<tensorflow::RandomAccessFile> file;
59-
tensorflow::Status status =
59+
absl::Status status =
6060
tensorflow::Env::Default()->NewRandomAccessFile(filename, &file);
6161
if (!status.ok()) {
6262
return nullptr;
@@ -91,8 +91,7 @@ WritableFile::WritableFile() {}
9191
std::unique_ptr<WritableFile> WritableFile::New(const std::string& filename) {
9292
std::unique_ptr<tensorflow::WritableFile> file;
9393

94-
tensorflow::Status s =
95-
tensorflow::Env::Default()->NewWritableFile(filename, &file);
94+
absl::Status s = tensorflow::Env::Default()->NewWritableFile(filename, &file);
9695

9796
if (!s.ok()) {
9897
return nullptr;
@@ -105,7 +104,7 @@ std::unique_ptr<WritableFile> WritableFile::New(const std::string& filename) {
105104
}
106105

107106
bool WritableFile::Write(const std::string& s) {
108-
tensorflow::Status status = file_->Append(s);
107+
absl::Status status = file_->Append(s);
109108
return status.ok();
110109
}
111110

third_party/nucleus/io/tfrecord_reader.cc

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ TFRecordReader::TFRecordReader() {}
4545
std::unique_ptr<TFRecordReader> TFRecordReader::New(
4646
const std::string& filename, const std::string& compression_type) {
4747
std::unique_ptr<tensorflow::RandomAccessFile> file;
48-
tensorflow::Status s =
48+
absl::Status s =
4949
tensorflow::Env::Default()->NewRandomAccessFile(filename, &file);
5050
if (!s.ok()) {
5151
LOG(ERROR) << s;
@@ -74,7 +74,7 @@ bool TFRecordReader::GetNext() {
7474
return false;
7575
}
7676

77-
tensorflow::Status s = reader_->ReadRecord(&offset_, &record_);
77+
absl::Status s = reader_->ReadRecord(&offset_, &record_);
7878

7979
return s.ok();
8080
}

third_party/nucleus/io/tfrecord_writer.cc

+5-6
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,7 @@ TFRecordWriter::TFRecordWriter() {}
4545
std::unique_ptr<TFRecordWriter> TFRecordWriter::New(
4646
const std::string& filename, const std::string& compression_type) {
4747
std::unique_ptr<tensorflow::WritableFile> file;
48-
tensorflow::Status s =
49-
tensorflow::Env::Default()->NewWritableFile(filename, &file);
48+
absl::Status s = tensorflow::Env::Default()->NewWritableFile(filename, &file);
5049
if (!s.ok()) {
5150
LOG(ERROR) << s;
5251
return nullptr;
@@ -69,29 +68,29 @@ bool TFRecordWriter::WriteRecord(const std::string& record) {
6968
if (writer_ == nullptr) {
7069
return false;
7170
}
72-
tensorflow::Status s = writer_->WriteRecord(record);
71+
absl::Status s = writer_->WriteRecord(record);
7372
return s.ok();
7473
}
7574

7675
bool TFRecordWriter::Flush() {
7776
if (writer_ == nullptr) {
7877
return false;
7978
}
80-
tensorflow:: Status s = writer_->Flush();
79+
absl::Status s = writer_->Flush();
8180
return s.ok();
8281
}
8382

8483
bool TFRecordWriter::Close() {
8584
if (writer_ != nullptr) {
86-
tensorflow::Status s = writer_->Close();
85+
absl::Status s = writer_->Close();
8786
if (!s.ok()) {
8887
return false;
8988
}
9089
writer_ = nullptr;
9190
}
9291

9392
if (file_ != nullptr) {
94-
tensorflow:: Status s = file_->Close();
93+
absl::Status s = file_->Close();
9594
if (!s.ok()) {
9695
return false;
9796
}

0 commit comments

Comments (0)