diff --git a/structman_source/MANIFEST.in b/structman_source/MANIFEST.in index 22881d7..7fc10fb 100644 --- a/structman_source/MANIFEST.in +++ b/structman_source/MANIFEST.in @@ -2,3 +2,5 @@ include structman/lib/rinerator/reduce include structman/lib/rinerator/probe include structman/lib/rinerator/reduce_wwPDB_het_dict.txt include structman/scripts/pdb-rsync.sh +include structman/scripts/struct_man_db_uniprot.sql +include structman/scripts/database_structure.sql diff --git a/structman_source/structman/base_utils/ray_utils.py b/structman_source/structman/base_utils/ray_utils.py index bb814d6..5819805 100644 --- a/structman_source/structman/base_utils/ray_utils.py +++ b/structman_source/structman/base_utils/ray_utils.py @@ -6,14 +6,15 @@ def ray_init(config): if ray.is_initialized(): return - + """ os.environ["PYTHONPATH"] = f'{settings.ROOT_DIR}:{os.environ.get("PYTHONPATH", "")}' os.environ["PYTHONPATH"] = f'{settings.LIB_DIR}:{os.environ.get("PYTHONPATH", "")}' os.environ["PYTHONPATH"] = f'{settings.RINERATOR_DIR}:{os.environ.get("PYTHONPATH", "")}' os.environ["PYTHONPATH"] = f'{settings.OUTPUT_DIR}:{os.environ.get("PYTHONPATH", "")}' + """ if config.iupred_path != '': os.environ["PYTHONPATH"] = f'{os.path.abspath(os.path.realpath(config.iupred_path))}:{os.environ.get("PYTHONPATH", "")}' - + logging_level = 20 if config.verbosity <= 1: logging_level = 0 diff --git a/structman_source/structman/lib/output/classification.py b/structman_source/structman/lib/output/classification.py index 4bb462d..093ce70 100644 --- a/structman_source/structman/lib/output/classification.py +++ b/structman_source/structman/lib/output/classification.py @@ -580,7 +580,7 @@ def classificationOutput(config, outfolder, session_name, session_id, ligand_fil print("Time for classificationOutput part 5: ", t5 - t4, main_loop_counter) if not already_unpacked: - from structman.utils import unpack + for row in results: m = row[0] position_number = row[1] diff --git a/structman_source/structman/lib/output/indel.py b/structman_source/structman/lib/output/indel.py index eec62bb..9d14c48 100644 --- a/structman_source/structman/lib/output/indel.py +++ b/structman_source/structman/lib/output/indel.py @@ -3,6 +3,7 @@ from structman.lib import rin from structman.lib.database import database from structman.lib.output import out_generator +from structman.base_utils import base_utils def add_aggregate_results(aggregates, indel_output, aggregate_type, raw_aggregate_header_base_names): if aggregates is None: @@ -124,14 +125,13 @@ def create_indel_results_table(config, output_path, session_name, session_id): protein_dict = database.getProteinDict(prot_id_list, session_id, config) - from structman.utils import unpack for row in results: indel_id = row[0] indel_output.add_value('Indel', row[1]) indel_output.add_value('Tags', tag_map[indel_id]) if row[2] is not None: - (size, ddC, wt_aggregates, mut_aggregates, left_flank_wt_aggregates, left_flank_mut_aggregates, right_flank_wt_aggregates, right_flank_mut_aggregates) = unpack(row[2]) + (size, ddC, wt_aggregates, mut_aggregates, left_flank_wt_aggregates, left_flank_mut_aggregates, right_flank_wt_aggregates, right_flank_mut_aggregates) = base_utils.unpack(row[2]) else: (size, ddC, wt_aggregates, mut_aggregates, left_flank_wt_aggregates, left_flank_mut_aggregates, right_flank_wt_aggregates, right_flank_mut_aggregates) = [None] * 8 diff --git a/structman_source/structman/lib/serializedPipeline.py b/structman_source/structman/lib/serializedPipeline.py index 0789095..d542487 100644 --- a/structman_source/structman/lib/serializedPipeline.py +++ b/structman_source/structman/lib/serializedPipeline.py @@ -2425,6 +2425,15 @@ def main(filename, config): t0 = time.time() + if config.iupred_path != '': + sys.path.append(f'{os.path.abspath(os.path.realpath(config.iupred_path))}') + #os.environ["PYTHONPATH"] = f'{os.path.abspath(os.path.realpath(config.iupred_path))}:{os.environ.get("PYTHONPATH", "")}' + try: + import iupred3_lib + except: + config.errorlog.add_error(f'IUpred path was given, but import failed: {config.iupred_path}') + config.iupred_path = '' + # need structman package path for ray ray_init(config) diff --git a/structman_source/structman/scripts/update.py b/structman_source/structman/scripts/update.py index 58a5a4a..e368467 100644 --- a/structman_source/structman/scripts/update.py +++ b/structman_source/structman/scripts/update.py @@ -11,7 +11,7 @@ from structman.scripts import createPdbBaDb, updateMappingDB -def main(config, skipUpdatePDB=False, skip_rindb=False, rin_fromScratch=False, update_mapping_db = False, mapping_db_from_scratch = False): +def main(config, skipUpdatePDB=False, skip_rindb=False, rin_fromScratch=False, update_mapping_db = False, mapping_db_from_scratch = False, update_mapping_db_keep_raw_files = False): mmseqs_fromScratch = False skipStructureDBs = False @@ -107,7 +107,7 @@ def main(config, skipUpdatePDB=False, skip_rindb=False, rin_fromScratch=False, u # update the mapping database if update_mapping_db: - updateMappingDB.main(config, fromScratch = mapping_db_from_scratch) + updateMappingDB.main(config, fromScratch = mapping_db_from_scratch, update_mapping_db_keep_raw_files = update_mapping_db_keep_raw_files) # update the human proteome mmseqs db, TODO if we want a simple mutation-calling for fasta inputs. diff --git a/structman_source/structman/scripts/updateDockerSource.py b/structman_source/structman/scripts/updateDockerSource.py index d985dcb..d6e68d9 100644 --- a/structman_source/structman/scripts/updateDockerSource.py +++ b/structman_source/structman/scripts/updateDockerSource.py @@ -15,6 +15,15 @@ target_folder = sys.argv[1] config_path = sys.argv[2] + build_mmseqs_db = True + skip_database_structure = False + if len(sys.argv) > 3: + additional_flags = set(sys.argv[3:]) + if 'skip_mmseqs_db' in additional_flags: + build_mmseqs_db = False + if 'skip_database_structure' in additional_flags: + skip_database_structure = True + if not os.path.isfile(config_path): print('ERROR: Need path to config file as second argument.') sys.exit(1) @@ -22,28 +31,30 @@ structman_target_folder = f'{target_folder}/structman' lib_target_folder = f'{structman_target_folder}/lib' rinerator_target_folder = f'{lib_target_folder}/rinerator' - database_target_file = f'{target_folder}/StructMAn_db/struct_man_db.sql.gz' - f = open(settings.STRUCTMAN_DB_SQL, 'r') + if not skip_database_structure: + database_target_file = f'{target_folder}/StructMAn_db/struct_man_db.sql.gz' - lines = f.readlines() - f.close() - new_lines = [] - for pos, line in enumerate(lines): - #line = line.decode('ascii') - if line[:13] == '-- Datenbank:' or line[:4] == 'USE ': - - new_lines.append(b'--\n') - new_lines.append(b'-- Database: `struct_man_db_1`\n') - new_lines.append(b'--\n') - new_lines.append(b'CREATE DATABASE IF NOT EXISTS `struct_man_db_1` DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci;\n') - new_lines.append(b'USE `struct_man_db_1`;\n') - else: - new_lines.append(line.encode()) - - f = gzip.open(database_target_file, 'wb') - f.write(b''.join(new_lines)) - f.close() + f = open(settings.STRUCTMAN_DB_SQL, 'r') + + lines = f.readlines() + f.close() + new_lines = [] + for pos, line in enumerate(lines): + #line = line.decode('ascii') + if line[:13] == '-- Datenbank:' or line[:4] == 'USE ': + + new_lines.append(b'--\n') + new_lines.append(b'-- Database: `struct_man_db_1`\n') + new_lines.append(b'--\n') + new_lines.append(b'CREATE DATABASE IF NOT EXISTS `struct_man_db_1` DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci;\n') + new_lines.append(b'USE `struct_man_db_1`;\n') + else: + new_lines.append(line.encode()) + + f = gzip.open(database_target_file, 'wb') + f.write(b''.join(new_lines)) + f.close() #p = subprocess.Popen(['split', '-b', '45M', 'struct_man_db.sql.gz', 'db_split'], cwd='%s/StructMAn_db/' % target_folder) #p.wait() @@ -131,11 +142,6 @@ with open(setup_target_path, 'w') as f: f.write(''.join(new_lines)) - build_mmseqs_db = True - if len(sys.argv) > 3: - if sys.argv[3] == 'skip_mmseqs_db': - build_mmseqs_db = False - if build_mmseqs_db: config = Config(config_path, external_call=True) diff --git a/structman_source/structman/scripts/updateMappingDB.py b/structman_source/structman/scripts/updateMappingDB.py index 986d6f5..1043b67 100644 --- a/structman_source/structman/scripts/updateMappingDB.py +++ b/structman_source/structman/scripts/updateMappingDB.py @@ -62,15 +62,15 @@ def check_instance(config, fromScratch = False): return fresh_instance -def retrieve_raw_data(config, fromScratch = False): - mapping_file_path = retrieve_data_from_uniprot(config, 'idmapping', 'idmapping.dat.gz', fromScratch = fromScratch) +def retrieve_raw_data(config, raw_files_folder_path, fromScratch = False): + mapping_file_path = retrieve_data_from_uniprot(config, raw_files_folder_path, 'idmapping', 'idmapping.dat.gz', fromScratch = fromScratch) sequence_file_paths = [] for seq_file_name in ['uniprot_sprot_varsplic.fasta.gz','uniprot_sprot.fasta.gz','uniprot_trembl.fasta.gz']: - sequence_file_paths.append(retrieve_data_from_uniprot(config, 'complete', seq_file_name, fromScratch = fromScratch)) + sequence_file_paths.append(retrieve_data_from_uniprot(config, raw_files_folder_path, 'complete', seq_file_name, fromScratch = fromScratch)) return mapping_file_path, sequence_file_paths -def retrieve_data_from_uniprot(config, uniprot_sub_folder, uniprot_file_name, fromScratch = False): - file_path = '%s/%s' % (config.tmp_folder, uniprot_file_name) +def retrieve_data_from_uniprot(config, raw_files_folder_path, uniprot_sub_folder, uniprot_file_name, fromScratch = False): + file_path = '%s/%s' % (raw_files_folder_path, uniprot_file_name) if os.path.isfile(file_path) and fromScratch: os.remove(file_path) if not os.path.isfile(file_path): @@ -86,16 +86,27 @@ def put_seqs_to_database(seq_map, config): database.update(config, 'UNIPROT', ['Uniprot_Ac','Sequence'], values, mapping_db = True) -def main(config, fromScratch = False): +def main(config, fromScratch = False, update_mapping_db_keep_raw_files = False): #Step 1: Check if mapping SQL DB instance is there, create if not. Recreate for fromScratch mode fresh_instance = check_instance(config, fromScratch = fromScratch) print('Mapping DB instance checked, fresh_instance:', fresh_instance) + if update_mapping_db_keep_raw_files: + if config.container_version: + raw_files_folder_path = '/structman/resources/' + else: + raw_files_folder_path = config.base_path + else: + raw_files_folder_path = config.tmp_folder + #Step 2: Check for raw files and download if necessary - mapping_file_path, seq_file_paths = retrieve_raw_data(config, fromScratch = fromScratch) + mapping_file_path, seq_file_paths = retrieve_raw_data(config, raw_files_folder_path, fromScratch = fromScratch) - print('\nDownloading all raw data files done.\n') + if update_mapping_db_keep_raw_files: + print(f'\nDownloading all raw data files done. The files are stored in: {raw_files_folder_path}\n') + else: + print(f'\nDownloading all raw data files done. The files are temporarily stored in: {raw_files_folder_path}\n') #Step 3: Update the database @@ -103,7 +114,7 @@ def main(config, fromScratch = False): ac_ref_values = [] ac_ref_nt_values = [] - max_values_at_a_time = 1000000 * config.gigs_of_ram + max_values_at_a_time = int(1000000 * config.gigs_of_ram) with gzip.open(mapping_file_path, 'rb') as f: for line in f: @@ -152,7 +163,7 @@ def main(config, fromScratch = False): database.update(config, 'UNIPROT', ['Uniprot_Ac', 'RefSeq_NT'], ac_ref_nt_values, mapping_db = True) print('\nDatabase update of RefSeq NTs done.\n') - max_seqs_at_a_time = 100000 * config.gigs_of_ram + max_seqs_at_a_time = int(70000 * config.gigs_of_ram) for seq_file in seq_file_paths: with gzip.open(seq_file, 'rb') as f: @@ -174,12 +185,13 @@ def main(config, fromScratch = False): seq_map[u_ac] += (line) if len(seq_map) > 0: put_seqs_to_database(seq_map, config) - + seq_map = {} print('\nDatabase update of sequences done.\n') - #Step 4: Remove the raw files - os.remove(mapping_file_path) - for seq_file in seq_file_paths: - os.remove(seq_file) + if not update_mapping_db_keep_raw_files: + #Step 4: Remove the raw files + os.remove(mapping_file_path) + for seq_file in seq_file_paths: + os.remove(seq_file) - print('\nRemoving raw data files done.\n') + print('\nRemoving raw data files done.\n') diff --git a/structman_source/structman/structman_main.py b/structman_source/structman/structman_main.py index f925b31..bcd6e13 100644 --- a/structman_source/structman/structman_main.py +++ b/structman_source/structman/structman_main.py @@ -575,6 +575,7 @@ def structman_cli(): update_rindb_from_scratch = False update_mapping_db = False update_mapping_db_from_scratch = False + update_mapping_db_keep_raw_files = False if len(argv) > 0: if argv[0] == 'update': @@ -596,6 +597,9 @@ def structman_cli(): if 'mapping_db_from_scratch' in argv: update_mapping_db = True update_mapping_db_from_scratch = True + if 'update_mapping_db_keep_raw_files' in argv: + update_mapping_db = True + update_mapping_db_keep_raw_files = True if not (update_pdb or update_rindb or update_mapping_db): print(update_util_disclaimer) sys.exit(1) @@ -1036,7 +1040,13 @@ def structman_cli(): f = open(config_path, 'w') config.config_parser_obj.write(f) f.close() - update.main(config, skipUpdatePDB=not update_pdb, skip_rindb=not update_rindb, rin_fromScratch=update_rindb_from_scratch, update_mapping_db = update_mapping_db, mapping_db_from_scratch = update_mapping_db_from_scratch) + update.main(config, skipUpdatePDB=not update_pdb, + skip_rindb=not update_rindb, + rin_fromScratch=update_rindb_from_scratch, + update_mapping_db = update_mapping_db, + mapping_db_from_scratch = update_mapping_db_from_scratch, + update_mapping_db_keep_raw_files = update_mapping_db_keep_raw_files + ) elif configure_mode: if conf_update_pdb_path is not None: