Skip to content

Commit

Permalink
Fixing the memory consumption problem of mapping db update
Browse files Browse the repository at this point in the history
other small bug fixes
  • Loading branch information
AlexanderGress committed Nov 17, 2021
1 parent f86e3ef commit 8f5ceea
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 49 deletions.
2 changes: 2 additions & 0 deletions structman_source/MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ include structman/lib/rinerator/reduce
include structman/lib/rinerator/probe
include structman/lib/rinerator/reduce_wwPDB_het_dict.txt
include structman/scripts/pdb-rsync.sh
include structman/scripts/struct_man_db_uniprot.sql
include structman/scripts/database_structure.sql
5 changes: 3 additions & 2 deletions structman_source/structman/base_utils/ray_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@
def ray_init(config):
if ray.is_initialized():
return

"""
os.environ["PYTHONPATH"] = f'{settings.ROOT_DIR}:{os.environ.get("PYTHONPATH", "")}'
os.environ["PYTHONPATH"] = f'{settings.LIB_DIR}:{os.environ.get("PYTHONPATH", "")}'
os.environ["PYTHONPATH"] = f'{settings.RINERATOR_DIR}:{os.environ.get("PYTHONPATH", "")}'
os.environ["PYTHONPATH"] = f'{settings.OUTPUT_DIR}:{os.environ.get("PYTHONPATH", "")}'
"""
if config.iupred_path != '':
os.environ["PYTHONPATH"] = f'{os.path.abspath(os.path.realpath(config.iupred_path))}:{os.environ.get("PYTHONPATH", "")}'

logging_level = 20
if config.verbosity <= 1:
logging_level = 0
Expand Down
2 changes: 1 addition & 1 deletion structman_source/structman/lib/output/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ def classificationOutput(config, outfolder, session_name, session_id, ligand_fil
print("Time for classificationOutput part 5: ", t5 - t4, main_loop_counter)

if not already_unpacked:
from structman.utils import unpack

for row in results:
m = row[0]
position_number = row[1]
Expand Down
4 changes: 2 additions & 2 deletions structman_source/structman/lib/output/indel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from structman.lib import rin
from structman.lib.database import database
from structman.lib.output import out_generator
from structman.base_utils import base_utils

def add_aggregate_results(aggregates, indel_output, aggregate_type, raw_aggregate_header_base_names):
if aggregates is None:
Expand Down Expand Up @@ -124,14 +125,13 @@ def create_indel_results_table(config, output_path, session_name, session_id):

protein_dict = database.getProteinDict(prot_id_list, session_id, config)

from structman.utils import unpack
for row in results:
indel_id = row[0]
indel_output.add_value('Indel', row[1])
indel_output.add_value('Tags', tag_map[indel_id])

if row[2] is not None:
(size, ddC, wt_aggregates, mut_aggregates, left_flank_wt_aggregates, left_flank_mut_aggregates, right_flank_wt_aggregates, right_flank_mut_aggregates) = unpack(row[2])
(size, ddC, wt_aggregates, mut_aggregates, left_flank_wt_aggregates, left_flank_mut_aggregates, right_flank_wt_aggregates, right_flank_mut_aggregates) = base_utils.unpack(row[2])
else:
(size, ddC, wt_aggregates, mut_aggregates, left_flank_wt_aggregates, left_flank_mut_aggregates, right_flank_wt_aggregates, right_flank_mut_aggregates) = [None] * 8

Expand Down
9 changes: 9 additions & 0 deletions structman_source/structman/lib/serializedPipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2425,6 +2425,15 @@ def main(filename, config):

t0 = time.time()

if config.iupred_path != '':
sys.path.append(f'{os.path.abspath(os.path.realpath(config.iupred_path))}')
#os.environ["PYTHONPATH"] = f'{os.path.abspath(os.path.realpath(config.iupred_path))}:{os.environ.get("PYTHONPATH", "")}'
try:
import iupred3_lib
except:
config.errorlog.add_error(f'IUpred path was given, but import failed: {config.iupred_path}')
config.iupred_path = ''

# need structman package path for ray
ray_init(config)

Expand Down
4 changes: 2 additions & 2 deletions structman_source/structman/scripts/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from structman.scripts import createPdbBaDb, updateMappingDB


def main(config, skipUpdatePDB=False, skip_rindb=False, rin_fromScratch=False, update_mapping_db = False, mapping_db_from_scratch = False):
def main(config, skipUpdatePDB=False, skip_rindb=False, rin_fromScratch=False, update_mapping_db = False, mapping_db_from_scratch = False, update_mapping_db_keep_raw_files = False):
mmseqs_fromScratch = False
skipStructureDBs = False

Expand Down Expand Up @@ -107,7 +107,7 @@ def main(config, skipUpdatePDB=False, skip_rindb=False, rin_fromScratch=False, u

# update the mapping database
if update_mapping_db:
updateMappingDB.main(config, fromScratch = mapping_db_from_scratch)
updateMappingDB.main(config, fromScratch = mapping_db_from_scratch, update_mapping_db_keep_raw_files = update_mapping_db_keep_raw_files)

# update the human proteome mmseqs db, TODO if we want a simple mutation-calling for fasta inputs.

Expand Down
56 changes: 31 additions & 25 deletions structman_source/structman/scripts/updateDockerSource.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,46 @@
target_folder = sys.argv[1]
config_path = sys.argv[2]

build_mmseqs_db = True
skip_database_structure = False
if len(sys.argv) > 3:
additional_flags = set(sys.argv[3:])
if 'skip_mmseqs_db' in additional_flags:
build_mmseqs_db = False
if 'skip_database_structure' in additional_flags:
skip_database_structure = True

if not os.path.isfile(config_path):
print('ERROR: Need path to config file as second argument.')
sys.exit(1)

structman_target_folder = f'{target_folder}/structman'
lib_target_folder = f'{structman_target_folder}/lib'
rinerator_target_folder = f'{lib_target_folder}/rinerator'
database_target_file = f'{target_folder}/StructMAn_db/struct_man_db.sql.gz'

f = open(settings.STRUCTMAN_DB_SQL, 'r')
if not skip_database_structure:
database_target_file = f'{target_folder}/StructMAn_db/struct_man_db.sql.gz'

lines = f.readlines()
f.close()
new_lines = []
for pos, line in enumerate(lines):
#line = line.decode('ascii')
if line[:13] == '-- Datenbank:' or line[:4] == 'USE ':

new_lines.append(b'--\n')
new_lines.append(b'-- Database: `struct_man_db_1`\n')
new_lines.append(b'--\n')
new_lines.append(b'CREATE DATABASE IF NOT EXISTS `struct_man_db_1` DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci;\n')
new_lines.append(b'USE `struct_man_db_1`;\n')
else:
new_lines.append(line.encode())

f = gzip.open(database_target_file, 'wb')
f.write(b''.join(new_lines))
f.close()
f = open(settings.STRUCTMAN_DB_SQL, 'r')

lines = f.readlines()
f.close()
new_lines = []
for pos, line in enumerate(lines):
#line = line.decode('ascii')
if line[:13] == '-- Datenbank:' or line[:4] == 'USE ':

new_lines.append(b'--\n')
new_lines.append(b'-- Database: `struct_man_db_1`\n')
new_lines.append(b'--\n')
new_lines.append(b'CREATE DATABASE IF NOT EXISTS `struct_man_db_1` DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci;\n')
new_lines.append(b'USE `struct_man_db_1`;\n')
else:
new_lines.append(line.encode())

f = gzip.open(database_target_file, 'wb')
f.write(b''.join(new_lines))
f.close()

#p = subprocess.Popen(['split', '-b', '45M', 'struct_man_db.sql.gz', 'db_split'], cwd='%s/StructMAn_db/' % target_folder)
#p.wait()
Expand Down Expand Up @@ -131,11 +142,6 @@
with open(setup_target_path, 'w') as f:
f.write(''.join(new_lines))

build_mmseqs_db = True
if len(sys.argv) > 3:
if sys.argv[3] == 'skip_mmseqs_db':
build_mmseqs_db = False

if build_mmseqs_db:
config = Config(config_path, external_call=True)

Expand Down
44 changes: 28 additions & 16 deletions structman_source/structman/scripts/updateMappingDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,15 @@ def check_instance(config, fromScratch = False):

return fresh_instance

def retrieve_raw_data(config, fromScratch = False):
mapping_file_path = retrieve_data_from_uniprot(config, 'idmapping', 'idmapping.dat.gz', fromScratch = fromScratch)
def retrieve_raw_data(config, raw_files_folder_path, fromScratch = False):
mapping_file_path = retrieve_data_from_uniprot(config, raw_files_folder_path, 'idmapping', 'idmapping.dat.gz', fromScratch = fromScratch)
sequence_file_paths = []
for seq_file_name in ['uniprot_sprot_varsplic.fasta.gz','uniprot_sprot.fasta.gz','uniprot_trembl.fasta.gz']:
sequence_file_paths.append(retrieve_data_from_uniprot(config, 'complete', seq_file_name, fromScratch = fromScratch))
sequence_file_paths.append(retrieve_data_from_uniprot(config, raw_files_folder_path, 'complete', seq_file_name, fromScratch = fromScratch))
return mapping_file_path, sequence_file_paths

def retrieve_data_from_uniprot(config, uniprot_sub_folder, uniprot_file_name, fromScratch = False):
file_path = '%s/%s' % (config.tmp_folder, uniprot_file_name)
def retrieve_data_from_uniprot(config, raw_files_folder_path, uniprot_sub_folder, uniprot_file_name, fromScratch = False):
file_path = '%s/%s' % (raw_files_folder_path, uniprot_file_name)
if os.path.isfile(file_path) and fromScratch:
os.remove(file_path)
if not os.path.isfile(file_path):
Expand All @@ -86,24 +86,35 @@ def put_seqs_to_database(seq_map, config):

database.update(config, 'UNIPROT', ['Uniprot_Ac','Sequence'], values, mapping_db = True)

def main(config, fromScratch = False):
def main(config, fromScratch = False, update_mapping_db_keep_raw_files = False):
#Step 1: Check if mapping SQL DB instance is there, create if not. Recreate for fromScratch mode
fresh_instance = check_instance(config, fromScratch = fromScratch)

print('Mapping DB instance checked, fresh_instance:', fresh_instance)

if update_mapping_db_keep_raw_files:
if config.container_version:
raw_files_folder_path = '/structman/resources/'
else:
raw_files_folder_path = config.base_path
else:
raw_files_folder_path = config.tmp_folder

#Step 2: Check for raw files and download if necessary
mapping_file_path, seq_file_paths = retrieve_raw_data(config, fromScratch = fromScratch)
mapping_file_path, seq_file_paths = retrieve_raw_data(config, raw_files_folder_path, fromScratch = fromScratch)

print('\nDownloading all raw data files done.\n')
if update_mapping_db_keep_raw_files:
print(f'\nDownloading all raw data files done. The files are stored in: {raw_files_folder_path}\n')
else:
print(f'\nDownloading all raw data files done. The files are temporarily stored in: {raw_files_folder_path}\n')

#Step 3: Update the database

ac_id_values = []
ac_ref_values = []
ac_ref_nt_values = []

max_values_at_a_time = 1000000 * config.gigs_of_ram
max_values_at_a_time = int(1000000 * config.gigs_of_ram)

with gzip.open(mapping_file_path, 'rb') as f:
for line in f:
Expand Down Expand Up @@ -152,7 +163,7 @@ def main(config, fromScratch = False):
database.update(config, 'UNIPROT', ['Uniprot_Ac', 'RefSeq_NT'], ac_ref_nt_values, mapping_db = True)
print('\nDatabase update of RefSeq NTs done.\n')

max_seqs_at_a_time = 100000 * config.gigs_of_ram
max_seqs_at_a_time = int(70000 * config.gigs_of_ram)

for seq_file in seq_file_paths:
with gzip.open(seq_file, 'rb') as f:
Expand All @@ -174,12 +185,13 @@ def main(config, fromScratch = False):
seq_map[u_ac] += (line)
if len(seq_map) > 0:
put_seqs_to_database(seq_map, config)

seq_map = {}
print('\nDatabase update of sequences done.\n')

#Step 4: Remove the raw files
os.remove(mapping_file_path)
for seq_file in seq_file_paths:
os.remove(seq_file)
if not update_mapping_db_keep_raw_files:
#Step 4: Remove the raw files
os.remove(mapping_file_path)
for seq_file in seq_file_paths:
os.remove(seq_file)

print('\nRemoving raw data files done.\n')
print('\nRemoving raw data files done.\n')
12 changes: 11 additions & 1 deletion structman_source/structman/structman_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,7 @@ def structman_cli():
update_rindb_from_scratch = False
update_mapping_db = False
update_mapping_db_from_scratch = False
update_mapping_db_keep_raw_files = False

if len(argv) > 0:
if argv[0] == 'update':
Expand All @@ -596,6 +597,9 @@ def structman_cli():
if 'mapping_db_from_scratch' in argv:
update_mapping_db = True
update_mapping_db_from_scratch = True
if 'update_mapping_db_keep_raw_files' in argv:
update_mapping_db = True
update_mapping_db_keep_raw_files = True
if not (update_pdb or update_rindb or update_mapping_db):
print(update_util_disclaimer)
sys.exit(1)
Expand Down Expand Up @@ -1036,7 +1040,13 @@ def structman_cli():
f = open(config_path, 'w')
config.config_parser_obj.write(f)
f.close()
update.main(config, skipUpdatePDB=not update_pdb, skip_rindb=not update_rindb, rin_fromScratch=update_rindb_from_scratch, update_mapping_db = update_mapping_db, mapping_db_from_scratch = update_mapping_db_from_scratch)
update.main(config, skipUpdatePDB=not update_pdb,
skip_rindb=not update_rindb,
rin_fromScratch=update_rindb_from_scratch,
update_mapping_db = update_mapping_db,
mapping_db_from_scratch = update_mapping_db_from_scratch,
update_mapping_db_keep_raw_files = update_mapping_db_keep_raw_files
)

elif configure_mode:
if conf_update_pdb_path is not None:
Expand Down

0 comments on commit 8f5ceea

Please sign in to comment.