Skip to content

Commit f36f747

Browse files
mobinasricopybara-github
mobinasri
authored andcommitted
Fixing the memory issue with GBZ reader
PiperOrigin-RevId: 695498239
1 parent 57e45ff commit f36f747

14 files changed

+451
-30
lines changed

Dockerfile.pangenome_aware_deepvariant

+6
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ COPY --from=builder /opt/deepvariant/bazel-out/k8-opt/bin/deepvariant/postproces
6565
COPY --from=builder /opt/deepvariant/bazel-out/k8-opt/bin/deepvariant/vcf_stats_report.zip .
6666
COPY --from=builder /opt/deepvariant/bazel-out/k8-opt/bin/deepvariant/show_examples.zip .
6767
COPY --from=builder /opt/deepvariant/bazel-out/k8-opt/bin/deepvariant/runtime_by_region_vis.zip .
68+
COPY --from=builder /opt/deepvariant/bazel-out/k8-opt/bin/deepvariant/load_gbz_into_shared_memory.zip .
6869
COPY --from=builder /opt/deepvariant/scripts/run_pangenome_aware_deepvariant.py .
6970
RUN ./run-prereq.sh
7071

@@ -98,6 +99,10 @@ RUN \
9899
"${BASH_HEADER}" \
99100
'python3 /opt/deepvariant/bin/runtime_by_region_vis.zip "$@"' > \
100101
/opt/deepvariant/bin/runtime_by_region_vis && \
102+
printf "%s\n%s\n" \
103+
"${BASH_HEADER}" \
104+
'python3 /opt/deepvariant/bin/load_gbz_into_shared_memory.zip "$@"' > \
105+
/opt/deepvariant/bin/load_gbz_into_shared_memory && \
101106
printf "%s\n%s\n" \
102107
"${BASH_HEADER}" \
103108
'python3 -u /opt/deepvariant/bin/run_pangenome_aware_deepvariant.py "$@"' > \
@@ -108,6 +113,7 @@ RUN \
108113
/opt/deepvariant/bin/vcf_stats_report \
109114
/opt/deepvariant/bin/show_examples \
110115
/opt/deepvariant/bin/runtime_by_region_vis \
116+
/opt/deepvariant/bin/load_gbz_into_shared_memory \
111117
/opt/deepvariant/bin/run_pangenome_aware_deepvariant
112118

113119
# Copy models.

WORKSPACE

+7-8
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@ http_archive(
4343
http_archive(
4444
name = "gbwt",
4545
build_file = "//:third_party/gbwt.BUILD",
46-
sha256 = "eb90732969ba646702c7490e00859ec99bb9d5fa5e017bdfd5ddd13dc0c4ddc6",
47-
strip_prefix = "gbwt-420f0f494a4cc4b258335d29bf43c58d59cfcd2e",
46+
sha256 = "81eb0a9dc05100195f5dce7b537732d9c0e7896b118a6dd01e5fe1ac63b5deca",
47+
strip_prefix = "gbwt-dbd5ba7c34687184ab46dd9df884f0223fdf1e18",
4848
urls = [
49-
"https://github.com/mobinasri/gbwt/archive/420f0f494a4cc4b258335d29bf43c58d59cfcd2e.zip",
49+
"https://github.com/mobinasri/gbwt/archive/dbd5ba7c34687184ab46dd9df884f0223fdf1e18.zip",
5050
],
5151
)
5252

@@ -73,10 +73,10 @@ http_archive(
7373
http_archive(
7474
name = "gbwtgraph",
7575
build_file = "//:third_party/gbwtgraph.BUILD",
76-
sha256 = "74ce1e0958c094717bbcf9e0c8d820ccc13b20b2c7824f2cca0045c764554748",
77-
strip_prefix = "gbwtgraph-98661b0253a298838d645e71f65bdb9ddedfd408",
76+
sha256 = "40c41c34b152a1eea6991e1acfdad8875e0c738e24cd36ca22dab5187c99a910",
77+
strip_prefix = "gbwtgraph-c96ca88b65fc40ac4bd371319a29111015d38904",
7878
urls = [
79-
"https://github.com/mobinasri/gbwtgraph/archive/98661b0253a298838d645e71f65bdb9ddedfd408.zip",
79+
"https://github.com/mobinasri/gbwtgraph/archive/c96ca88b65fc40ac4bd371319a29111015d38904.zip",
8080
],
8181
)
8282

@@ -115,9 +115,9 @@ http_archive(
115115
# That BUILD file must be kept in sync with the version of protobuf used.
116116
http_archive(
117117
name = "com_google_protobuf",
118+
build_file = "//:third_party/protobuf.BUILD",
118119
patch_args = ["-p1"],
119120
patches = ["//:third_party/protobuf.patch"],
120-
build_file = "//:third_party/protobuf.BUILD",
121121
sha256 = "cfcba2df10feec52a84208693937c17a4b5df7775e1635c1e3baffc487b24c9b",
122122
# This protobuf release is based on protobuf 3.9.2.
123123
strip_prefix = "protobuf-3.9.2",
@@ -137,7 +137,6 @@ http_archive(
137137
],
138138
)
139139

140-
141140
# bazel_skylib is now a required dependency of protobuf_archive.
142141
http_archive(
143142
name = "bazel_skylib",

build_release_binaries.sh

+1
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ bazel build -c opt \
168168
# TODO: Replace this hand-made list with a find command.
169169
fix_zip_file "bazel-out/k8-opt/bin/deepvariant/train"
170170
fix_zip_file "bazel-out/k8-opt/bin/deepvariant/call_variants"
171+
fix_zip_file "bazel-out/k8-opt/bin/deepvariant/load_gbz_into_shared_memory"
171172
fix_zip_file "bazel-out/k8-opt/bin/deepvariant/make_examples"
172173
fix_zip_file "bazel-out/k8-opt/bin/deepvariant/make_examples_pangenome_aware_dv"
173174
fix_zip_file "bazel-out/k8-opt/bin/deepvariant/make_examples_somatic"

deepvariant/BUILD

+19
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ filegroup(
1212
name = "binaries",
1313
srcs = [
1414
"call_variants",
15+
"load_gbz_into_shared_memory",
1516
"make_examples",
1617
"make_examples_pangenome_aware_dv",
1718
"make_examples_somatic",
@@ -712,6 +713,24 @@ py_binary(
712713
],
713714
)
714715

716+
py_binary(
717+
name = "load_gbz_into_shared_memory",
718+
srcs = [
719+
"load_gbz_into_shared_memory.py",
720+
],
721+
main = "load_gbz_into_shared_memory.py",
722+
python_version = "PY3",
723+
deps = [
724+
":logging_level",
725+
"//deepvariant/protos:deepvariant_py_pb2",
726+
"//third_party/nucleus/io/python:gbz_reader",
727+
"//third_party/nucleus/io/python:hts_verbose",
728+
"//third_party/nucleus/util:errors",
729+
"@absl_py//absl:app",
730+
"@absl_py//absl/flags",
731+
],
732+
)
733+
715734
py_binary(
716735
name = "make_examples_pangenome_aware_dv",
717736
srcs = [
+146
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# Copyright 2021 Google LLC.
2+
#
3+
# Redistribution and use in source and binary forms, with or without
4+
# modification, are permitted provided that the following conditions
5+
# are met:
6+
#
7+
# 1. Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# 2. Redistributions in binary form must reproduce the above copyright
11+
# notice, this list of conditions and the following disclaimer in the
12+
# documentation and/or other materials provided with the distribution.
13+
#
14+
# 3. Neither the name of the copyright holder nor the names of its
15+
# contributors may be used to endorse or promote products derived from this
16+
# software without specific prior written permission.
17+
#
18+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28+
# POSSIBILITY OF SUCH DAMAGE.
29+
"""Load the sequneces of a GBZ file into shared memory, usable by multiple processes."""
30+
31+
from absl import app
32+
from absl import flags
33+
34+
from deepvariant import logging_level
35+
from third_party.nucleus.io.python import gbz_reader
36+
from third_party.nucleus.io.python import hts_verbose
37+
from third_party.nucleus.util import errors
38+
39+
40+
# Flags related to loading GBZ into shared memory.
41+
# Path of the GBZ whose sequences are loaded. Adjacent string literals are
# concatenated verbatim, so each fragment must carry its own separating space.
_PANGENOME_GBZ = flags.DEFINE_string(
    'pangenome_gbz',
    None,
    (
        'Required. Pangenome GBZ file to load into shared memory '
        '(Only the sequences are loaded into shared memory.)'
    ),
)
49+
50+
# Number of consumer processes expected to attach to the shared memory.
# Adjacent string literals are concatenated verbatim, so every fragment ends
# with an explicit space to keep the rendered --help text readable.
_NUM_SHARDS = flags.DEFINE_integer(
    'num_shards',
    None,
    (
        'Required. Number of shards that will use the shared memory. It is '
        'important to set this number correctly to make sure that the '
        'shared memory is not deleted before all processes are done using it. '
        'If this value is greater than required then the shared memory will '
        'exist even after all processes are done using it, '
        'which is not desired.'
    ),
)
62+
63+
# Name handed to the GBZ reader when it creates the shared memory segment.
_SHARED_MEMORY_NAME = flags.DEFINE_string(
    name='shared_memory_name',
    default='GBZ_SHARED_MEMORY',
    help='Name of the shared memory segment.',
)
70+
71+
# Capacity requested for the shared memory segment, expressed in GB.
_SHARED_MEMORY_SIZE_GB = flags.DEFINE_integer(
    name='shared_memory_size_gb',
    default=10,
    help='Size of the shared memory in GB.',
)
78+
79+
80+
def load_gbz_into_shared_memory(
    pangenome_gbz: str,
    shared_memory_name: str,
    shared_memory_size_gb: int,
    num_shards: int,
):
  """Loads the sequences of a GBZ file into a shared memory segment.

  Constructing a GbzReader with create_shared_memory=True is what populates
  the segment; the reader object itself is not kept.

  Args:
    pangenome_gbz: Path to the pangenome GBZ file.
    shared_memory_name: Name of the shared memory segment to create.
    shared_memory_size_gb: Size of the shared memory segment in GB.
    num_shards: Number of processes that will use the segment. It is passed
      to the reader as num_processes; when it is greater than 0 the segment
      is NOT deleted once the reader created here is destroyed.
  """
  # Only the shared-memory arguments matter for this call. The segment stores
  # just the pangenome sequences, so the sample name, context and chromosome
  # prefix below are default placeholders.
  placeholder_sample = 'GRCh38'
  placeholder_context = 1000
  placeholder_chrom_prefix = ''

  gbz_reader.GbzReader(
      pangenome_gbz,
      placeholder_sample,
      placeholder_context,
      placeholder_chrom_prefix,
      shared_memory_name,
      True,  # create_shared_memory
      shared_memory_size_gb,
      num_shards,  # num_processes
  )
116+
117+
118+
def main(argv=()):
  """Validates the command line and loads the GBZ into shared memory.

  Args:
    argv: Command-line arguments handed over by app.run. Anything beyond the
      first element is treated as an unexpected positional argument.

  Raises:
    errors.CommandLineError: requested via errors.log_and_raise when
      positional arguments are present.
  """
  with errors.clean_commandline_error_exit():
    if len(argv) > 1:
      # Adjacent string literals are concatenated verbatim, so each fragment
      # ends with a space to keep the words of the message separated.
      errors.log_and_raise(
          'Command line parsing failure: load_gbz_into_shared_memory does not '
          'accept positional arguments but some are present on '
          'the command line: '
          '"{}".'.format(str(argv)),
          errors.CommandLineError,
      )
    del argv  # Unused.

    logging_level.set_from_flag()
    hts_verbose.set(hts_verbose.htsLogLevel.HTS_LOG_WARNING)

    load_gbz_into_shared_memory(
        _PANGENOME_GBZ.value,
        _SHARED_MEMORY_NAME.value,
        _SHARED_MEMORY_SIZE_GB.value,
        _NUM_SHARDS.value,
    )
139+
140+
141+
if __name__ == '__main__':
  # Both flags default to None, so they are marked required before app.run
  # hands control to main.
  flags.mark_flags_as_required(['pangenome_gbz', 'num_shards'])
  app.run(main)

deepvariant/make_examples_core.py

+2
Original file line numberDiff line numberDiff line change
@@ -1476,6 +1476,8 @@ def _make_sam_readers(
14761476
ref_name=self.options.ref_name_pangenome,
14771477
context=self.options.allele_counter_options.partition_size,
14781478
chrom_prefix=self.options.ref_chrom_prefix,
1479+
shared_memory_name=self.options.gbz_shared_memory_name,
1480+
create_shared_memory=not self.options.use_loaded_gbz_shared_memory,
14791481
ref_path=self.options.reference_filename
14801482
if self.options.use_ref_for_cram
14811483
else None,

deepvariant/make_examples_pangenome_aware_dv.py

+27
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,27 @@
169169
),
170170
)
171171

172+
# Adjacent string literals are concatenated verbatim, so each fragment ends
# with an explicit space to keep the rendered --help text readable.
_USE_LOADED_GBZ_SHARED_MEMORY = flags.DEFINE_bool(
    'use_loaded_gbz_shared_memory',
    False,
    (
        'If enabled, the sequences of the gbz file are already loaded into '
        'shared memory using load_gbz_into_shared_memory.py and the SamReader '
        'reads the sequences from the shared memory.'
    ),
)
181+
182+
# Adjacent string literals are concatenated verbatim, so each fragment ends
# with an explicit space to keep the rendered --help text readable.
_GBZ_SHARED_MEMORY_NAME = flags.DEFINE_string(
    'gbz_shared_memory_name',
    'GBZ_SHARED_MEMORY',
    (
        'Name of the shared memory segment that contains the sequences of the '
        'gbz format. If --use_loaded_gbz_shared_memory is enabled, this flag '
        'must be set based on the name of the shared memory created by '
        'load_gbz_into_shared_memory.py'
    ),
)
192+
172193
# Change any flag defaults that differ for Pangenome-aware DeepVariant.
173194
# I'm setting this to float('inf') because we don't want to include any
174195
# candidates from the non-target (i.e., pangenome) sample.
@@ -294,6 +315,12 @@ def default_options(add_flags=True, flags_obj=None):
294315
options.ref_name_pangenome = flags_obj.ref_name_pangenome
295316
if flags_obj.ref_chrom_prefix:
296317
options.ref_chrom_prefix = flags_obj.ref_chrom_prefix
318+
if flags_obj.use_loaded_gbz_shared_memory:
319+
options.use_loaded_gbz_shared_memory = (
320+
flags_obj.use_loaded_gbz_shared_memory
321+
)
322+
if flags_obj.gbz_shared_memory_name:
323+
options.gbz_shared_memory_name = flags_obj.gbz_shared_memory_name
297324

298325
if add_flags:
299326
options.bam_fname = f'{os.path.basename(flags_obj.reads)}|{os.path.basename(flags_obj.pangenome)}'

deepvariant/protos/deepvariant.proto

+8
Original file line numberDiff line numberDiff line change
@@ -926,6 +926,14 @@ message MakeExamplesOptions {
926926
string ref_chrom_prefix = 80;
927927

928928
bool output_phase_info = 81;
929+
930+
// If true, the sequences of the gbz file are already loaded into shared
931+
// memory and the SamReader reads the sequences from the shared memory.
932+
bool use_loaded_gbz_shared_memory = 83;
933+
934+
// The name of the shared memory segment that contains the sequences of the
935+
// gbz file.
936+
string gbz_shared_memory_name = 84;
929937
}
930938

931939
// Config describe information needed for a dataset that can be used for

deepvariant/stream_examples.h

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "deepvariant/pileup_image_native.h"
4040
#include "deepvariant/protos/deepvariant.pb.h"
4141
#include "absl/types/span.h"
42+
4243
#include "boost/interprocess/managed_shared_memory.hpp" // NOLINT
4344
#include "boost/interprocess/shared_memory_object.hpp" // NOLINT
4445
#include "boost/interprocess/sync/named_mutex.hpp" // NOLINT

scripts/run_pangenome_aware_deepvariant.py

+38
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,35 @@ def _set_small_model_config(
386386
)
387387

388388

389+
def load_gbz_into_shared_memory_command(
    gbz,
    gbz_shared_memory_name,
    gbz_shared_memory_size_gb,
) -> Tuple[str, Optional[str]]:
  """Builds the load_gbz_into_shared_memory (command, logfile) pair.

  Args:
    gbz: Input pangenome GBZ file.
    gbz_shared_memory_name: Name of the shared memory region to create.
    gbz_shared_memory_size_gb: Size of the shared memory region to create.

  Returns:
    (string, string) The command line as a single space-joined string, and
    the log file to write to. The log file is None when no logging directory
    is set.
  """
  # The GBZ path, segment name and shard count are wrapped in double quotes;
  # the size is emitted bare.
  parts = [
      'time',
      '/opt/deepvariant/bin/load_gbz_into_shared_memory',
      '--pangenome_gbz',
      f'"{gbz}"',
      '--shared_memory_name',
      f'"{gbz_shared_memory_name}"',
      '--shared_memory_size_gb',
      str(gbz_shared_memory_size_gb),
      '--num_shards',
      f'"{_NUM_SHARDS.value}"',
  ]

  log_dir = _LOGGING_DIR.value
  logfile = f'{log_dir}/load_gbz_into_shared_memory.log' if log_dir else None
  return (' '.join(parts), logfile)
416+
417+
389418
def make_examples_pangenome_aware_dv_command(
390419
ref: str,
391420
reads: str,
@@ -690,6 +719,15 @@ def create_all_commands_and_logfiles(
690719
else:
691720
runtime_by_region_path = None
692721

722+
# Load pangenome GBZ into shared memory.
723+
if _PANGENOME.value is not None and _PANGENOME.value.endswith('.gbz'):
724+
commands.append(
725+
load_gbz_into_shared_memory_command(
726+
gbz=_PANGENOME.value,
727+
gbz_shared_memory_name='GBZ_SHARED_MEMORY',
728+
gbz_shared_memory_size_gb=10,
729+
)
730+
)
693731
model_ckpt = get_model_ckpt(_MODEL_TYPE.value, _CUSTOMIZED_MODEL.value)
694732
commands.append(
695733
make_examples_pangenome_aware_dv_command(

0 commit comments

Comments
 (0)