Skip to content

Commit d6b7b43

Browse files
kishwarshafin authored and pichuan committed
Add masseq to run_deepvariant
PiperOrigin-RevId: 702021003
1 parent 27075bd commit d6b7b43

File tree

6 files changed

+63
-53
lines changed

6 files changed

+63
-53
lines changed

Dockerfile

+9
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,15 @@ ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/sav
198198
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.ont.savedmodel/variables/variables.index .
199199
RUN chmod -R +r /opt/models/ont_r104/*
200200

201+
WORKDIR /opt/models/masseq
202+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/fingerprint.pb .
203+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/saved_model.pb .
204+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/example_info.json .
205+
WORKDIR /opt/models/masseq/variables
206+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/variables/variables.data-00000-of-00001 .
207+
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/savedmodels/deepvariant.masseq.savedmodel/variables/variables.index .
208+
RUN chmod -R +r /opt/models/masseq/*
209+
201210
# Copy small models
202211
WORKDIR /opt/smallmodels/wgs
203212
ADD https://storage.googleapis.com/deepvariant/models/DeepVariant/${VERSION}/smallmodels/deepvariant.wgs.smallmodel/fingerprint.pb .

docs/deepvariant-masseq-case-study.md

+26-39
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ steps.
2222
Lets first create directories to organize files.
2323

2424
```bash
25-
mkdir -p data benchmark reference output happy
25+
mkdir -p input benchmark reference output happy
2626
```
2727

2828
### Download the GRCh38 Reference
@@ -56,8 +56,8 @@ For this case study, we download the chr20 of a HG004 MAS-Seq BAM.
5656
```bash
5757
HTTPDIR=https://storage.googleapis.com/deepvariant/masseq-case-study
5858

59-
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam > data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam
60-
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai > data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai
59+
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam > input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam
60+
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai > input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam.bai
6161
```
6262

6363

@@ -69,58 +69,42 @@ include regions where the BAM file has 10x or more coverage.
6969
```bash
7070
HTTPDIR=https://storage.googleapis.com/deepvariant/masseq-case-study
7171

72-
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed > data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed
73-
```
74-
75-
76-
77-
78-
### Download the MAS-Seq model
79-
80-
Finally, lets download the MAS-Seq model that we will use to call variants.
81-
82-
```bash
83-
gsutil cp -R gs://deepvariant/models/DeepVariant/1.8.0/savedmodels/deepvariant.masseq.savedmodel .
72+
curl -L ${HTTPDIR}/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed > input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed
8473
```
8574

8675
### Running DeepVariant MAS-Seq on a CPU-only machine
8776

8877
The command below will run the DeepVariant MAS-Seq model and produce an output
89-
VCF (`output/out.vcf.gz`).
78+
VCF.
9079

9180
```bash
92-
BIN_VERSION="head687331500"
81+
BIN_VERSION="1.8.0"
9382

9483
sudo docker run \
95-
-v "$(pwd):$(pwd)" \
96-
-w $(pwd) \
84+
-v "${PWD}/input":"/input" \
85+
-v "${PWD}/output":"/output" \
86+
-v "${PWD}/reference":"/reference" \
9787
google/deepvariant:"${BIN_VERSION}" \
9888
run_deepvariant \
99-
--model_type=PACBIO \
100-
--customized_model=deepvariant.masseq.savedmodel \
101-
--ref=reference/GRCh38_no_alt_analysis_set.fasta \
102-
--reads=data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam \
103-
--output_vcf=output/HG004.output.vcf.gz \
89+
--model_type=MASSEQ \
90+
--ref=/reference/GRCh38_no_alt_analysis_set.fasta \
91+
--reads=/input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.chr20.bam \
92+
--output_vcf=/output/HG004.output.vcf.gz \
10493
--num_shards=$(nproc) \
10594
--regions=chr20 \
106-
--make_examples_extra_args="phase_reads=true,sort_by_haplotypes=true,parse_sam_aux_fields=true,realign_reads=false,vsc_min_fraction_indels=0.12,alt_aligned_pileup=diff_channels,trim_reads_for_pileup=true,pileup_image_width=199,min_mapping_quality=1,track_ref_reads=true,partition_size=25000,max_reads_per_partition=0,max_reads_for_dynamic_bases_per_region=1500" \
107-
--disable_small_model=true \
108-
--intermediate_results_dir=output/intermediate_results_dir
95+
--intermediate_results_dir=/output/intermediate_results_dir
10996
```
11097

11198
**Flag summary**
11299

113-
* `--model_type` - Sets the model and options, but we will override the model
114-
with `--customized model`.
100+
* `--model_type` - Sets the model and options for MAS-Seq data.
115101
* `--customized_model` - Points to a model trained using MAS-Seq data.
116102
* `--ref` - Specifies the reference sequence.
117103
* `--reads` - Specifies the input bam file.
118104
* `--output_vcf` - Specifies the output variant file.
119105
* `--num_shards` - Sets the number of shards to the number of available
120106
processors (`$(nproc)`). This is used to perform parallelization.
121107
* `--regions` - Restricts to chr20 to make this case study faster.
122-
* `--make_examples_extra_args=` - Passes additional arguments to
123-
make_examples.
124108
* `--intermediate_results_dir` - Outputs results to an intermediate directory.
125109
This is optional. If you don't need the intermediate files, no need to
126110
specify this flag.
@@ -132,18 +116,21 @@ For running on GPU machines, or using Singularity instead of Docker, see
132116

133117
```bash
134118
sudo docker run \
135-
-v $(pwd):$(pwd) \
136-
-w $(pwd) \
119+
-v "${PWD}/benchmark":"/benchmark" \
120+
-v "${PWD}/input":"/input" \
121+
-v "${PWD}/output":"/output" \
122+
-v "${PWD}/reference":"/reference" \
123+
-v "${PWD}/happy:/happy" \
137124
jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \
138-
benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
139-
output/HG004.output.vcf.gz \
140-
-f benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \
141-
-r reference/GRCh38_no_alt_analysis_set.fasta \
142-
-o happy/happy.output \
125+
/benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
126+
/output/HG004.output.vcf.gz \
127+
-f /benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \
128+
-r /reference/GRCh38_no_alt_analysis_set.fasta \
129+
-o /happy/happy.output \
143130
--engine=vcfeval \
144131
--pass-only \
145132
-l chr20 \
146-
--target-regions=data/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed \
133+
--target-regions=/input/HG004.giab_na24143.hifi_reads.lima.0--0.lima.IsoSeqX_bc01_5p--IsoSeqX_3p.refined.grch38.mm2.splitN.fc.depth.10x.exons.bed \
147134
--threads=$(nproc)
148135
```
149136

scripts/run_deepvariant.py

+16
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ class ModelType(enum.Enum):
6060
PACBIO = 'PACBIO'
6161
ONT_R104 = 'ONT_R104'
6262
HYBRID_PACBIO_ILLUMINA = 'HYBRID_PACBIO_ILLUMINA'
63+
MASSEQ = 'MASSEQ'
6364

6465

6566
# Required flags.
@@ -277,6 +278,7 @@ class ModelType(enum.Enum):
277278
ModelType.PACBIO: '/opt/models/pacbio',
278279
ModelType.ONT_R104: '/opt/models/ont_r104',
279280
ModelType.HYBRID_PACBIO_ILLUMINA: '/opt/models/hybrid_pacbio_illumina',
281+
ModelType.MASSEQ: '/opt/models/masseq',
280282
}
281283

282284

@@ -495,6 +497,20 @@ def make_examples_command(
495497
special_args['trim_reads_for_pileup'] = True
496498
elif model_type == ModelType.HYBRID_PACBIO_ILLUMINA:
497499
special_args['trim_reads_for_pileup'] = True
500+
elif model_type == ModelType.MASSEQ:
501+
special_args['alt_aligned_pileup'] = 'diff_channels'
502+
special_args['max_reads_per_partition'] = 0
503+
special_args['min_mapping_quality'] = 1
504+
special_args['parse_sam_aux_fields'] = True
505+
special_args['partition_size'] = 25000
506+
special_args['phase_reads'] = True
507+
special_args['pileup_image_width'] = 199
508+
special_args['realign_reads'] = False
509+
special_args['sort_by_haplotypes'] = True
510+
special_args['track_ref_reads'] = True
511+
special_args['vsc_min_fraction_indels'] = 0.12
512+
special_args['trim_reads_for_pileup'] = True
513+
special_args['max_reads_for_dynamic_bases_per_region'] = 1500
498514

499515
_set_small_model_config(
500516
special_args, model_type, _CUSTOMIZED_SMALL_MODEL.value

third_party/nucleus/io/gfile.cc

+5-6
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,13 @@ namespace nucleus {
4141

4242
bool Exists(const std::string& filename) {
4343
// FileExists sets s to tensorflow::error::NOT_FOUND if it doesn't exist.
44-
tensorflow::Status s = tensorflow::Env::Default()->FileExists(filename);
44+
absl::Status s = tensorflow::Env::Default()->FileExists(filename);
4545
return s.ok();
4646
}
4747

4848
std::vector<std::string> Glob(const std::string& pattern) {
4949
std::vector<std::string> results;
50-
::tensorflow::Status s =
50+
absl::Status s =
5151
tensorflow::Env::Default()->GetMatchingPaths(pattern, &results);
5252
return results;
5353
}
@@ -56,7 +56,7 @@ ReadableFile::ReadableFile() {}
5656

5757
std::unique_ptr<ReadableFile> ReadableFile::New(const std::string& filename) {
5858
std::unique_ptr<tensorflow::RandomAccessFile> file;
59-
tensorflow::Status status =
59+
absl::Status status =
6060
tensorflow::Env::Default()->NewRandomAccessFile(filename, &file);
6161
if (!status.ok()) {
6262
return nullptr;
@@ -91,8 +91,7 @@ WritableFile::WritableFile() {}
9191
std::unique_ptr<WritableFile> WritableFile::New(const std::string& filename) {
9292
std::unique_ptr<tensorflow::WritableFile> file;
9393

94-
tensorflow::Status s =
95-
tensorflow::Env::Default()->NewWritableFile(filename, &file);
94+
absl::Status s = tensorflow::Env::Default()->NewWritableFile(filename, &file);
9695

9796
if (!s.ok()) {
9897
return nullptr;
@@ -105,7 +104,7 @@ std::unique_ptr<WritableFile> WritableFile::New(const std::string& filename) {
105104
}
106105

107106
bool WritableFile::Write(const std::string& s) {
108-
tensorflow::Status status = file_->Append(s);
107+
absl::Status status = file_->Append(s);
109108
return status.ok();
110109
}
111110

third_party/nucleus/io/tfrecord_reader.cc

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ TFRecordReader::TFRecordReader() {}
4545
std::unique_ptr<TFRecordReader> TFRecordReader::New(
4646
const std::string& filename, const std::string& compression_type) {
4747
std::unique_ptr<tensorflow::RandomAccessFile> file;
48-
tensorflow::Status s =
48+
absl::Status s =
4949
tensorflow::Env::Default()->NewRandomAccessFile(filename, &file);
5050
if (!s.ok()) {
5151
LOG(ERROR) << s;
@@ -74,7 +74,7 @@ bool TFRecordReader::GetNext() {
7474
return false;
7575
}
7676

77-
tensorflow::Status s = reader_->ReadRecord(&offset_, &record_);
77+
absl::Status s = reader_->ReadRecord(&offset_, &record_);
7878

7979
return s.ok();
8080
}

third_party/nucleus/io/tfrecord_writer.cc

+5-6
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,7 @@ TFRecordWriter::TFRecordWriter() {}
4545
std::unique_ptr<TFRecordWriter> TFRecordWriter::New(
4646
const std::string& filename, const std::string& compression_type) {
4747
std::unique_ptr<tensorflow::WritableFile> file;
48-
tensorflow::Status s =
49-
tensorflow::Env::Default()->NewWritableFile(filename, &file);
48+
absl::Status s = tensorflow::Env::Default()->NewWritableFile(filename, &file);
5049
if (!s.ok()) {
5150
LOG(ERROR) << s;
5251
return nullptr;
@@ -69,29 +68,29 @@ bool TFRecordWriter::WriteRecord(const std::string& record) {
6968
if (writer_ == nullptr) {
7069
return false;
7170
}
72-
tensorflow::Status s = writer_->WriteRecord(record);
71+
absl::Status s = writer_->WriteRecord(record);
7372
return s.ok();
7473
}
7574

7675
bool TFRecordWriter::Flush() {
7776
if (writer_ == nullptr) {
7877
return false;
7978
}
80-
tensorflow:: Status s = writer_->Flush();
79+
absl::Status s = writer_->Flush();
8180
return s.ok();
8281
}
8382

8483
bool TFRecordWriter::Close() {
8584
if (writer_ != nullptr) {
86-
tensorflow::Status s = writer_->Close();
85+
absl::Status s = writer_->Close();
8786
if (!s.ok()) {
8887
return false;
8988
}
9089
writer_ = nullptr;
9190
}
9291

9392
if (file_ != nullptr) {
94-
tensorflow:: Status s = file_->Close();
93+
absl::Status s = file_->Close();
9594
if (!s.ok()) {
9695
return false;
9796
}

0 commit comments

Comments (0)