From 82d0079159c924a155a4817de7a3f40cfcc0f59b Mon Sep 17 00:00:00 2001 From: sean-bam <34106755+sean-bam@users.noreply.github.com> Date: Wed, 21 Feb 2024 15:22:05 -0800 Subject: [PATCH] blackify; rename to pyIsland --- PyIsland/Drivers/hhsuite.py | 3 +- PyIsland/Drivers/mmseqs.py | 33 +- PyIsland/FastaUtils/MSA.py | 354 ++++++++++--------- PyIsland/FastaUtils/__init__.py | 35 +- PyIsland/GCP/bigquery.py | 53 +-- PyIsland/GCP/storage.py | 7 +- PyIsland/Islands/__init__.py | 4 +- PyIsland/Parsers/blast.py | 12 +- PyIsland/Parsers/hhsuite.py | 3 - PyIsland/Parsers/mmseqs.py | 12 +- PyIsland/tests/__init__.py | 7 +- PyIsland/tests/fastautils/test_fastautils.py | 96 +++-- PyIsland/tests/parsers/test_parsers.py | 109 +++--- docs/conf.py | 46 ++- setup.py | 20 +- test_environment.py | 9 +- 16 files changed, 421 insertions(+), 382 deletions(-) diff --git a/PyIsland/Drivers/hhsuite.py b/PyIsland/Drivers/hhsuite.py index 8178080..3fddb65 100644 --- a/PyIsland/Drivers/hhsuite.py +++ b/PyIsland/Drivers/hhsuite.py @@ -67,6 +67,7 @@ def get_hhsuite_neff(msa): neff = float(p1.stdout.strip().split()[10]) return round(neff, 2) + def align_clusters_with_hhalign(input_dir, output_dir, df_clustering): """ accepts a table of profile, length and cluster assignment @@ -115,4 +116,4 @@ def align_clusters_with_hhalign(input_dir, output_dir, df_clustering): f"hhalign -i {query_profile} {templates} -M 50 -glob -id 100 -diff inf -oa3m {output_a3m}", shell=True, check=True, - ) \ No newline at end of file + ) diff --git a/PyIsland/Drivers/mmseqs.py b/PyIsland/Drivers/mmseqs.py index 846cdbe..1fb7705 100644 --- a/PyIsland/Drivers/mmseqs.py +++ b/PyIsland/Drivers/mmseqs.py @@ -3,7 +3,7 @@ import shutil import tempfile -from pyIsland import Parsers +from PyIsland import Parsers def run_mmseqs_cls( @@ -26,7 +26,7 @@ def run_mmseqs_cls( # seqdb = Path(f'{dbname}seqDB') # clsdb = Path(f'{dbname}clsDB') - tmp_dir = tempfile.TemporaryDirectory(dir='.') + tmp_dir = tempfile.TemporaryDirectory(dir=".") tmp_dir_path = Path(tmp_dir.name) tsv = Path(tsv) @@ -62,21 +62,21 @@ def run_mmseqs_cls( # file.unlink() # except IsADirectoryError: # pass - #shutil.rmtree("tmpmmseqsdb/") - + # shutil.rmtree("tmpmmseqsdb/") + + def get_reps_from_clustering(seqDB, clsDB, fasta): - - tmp_dir = tempfile.TemporaryDirectory(dir='.') + tmp_dir = tempfile.TemporaryDirectory(dir=".") tmp_dir_path = Path(tmp_dir.name) - repdb = tmp_dir_path / Path('repDB') - + repdb = tmp_dir_path / Path("repDB") + subprocess.run( f"mmseqs createsubdb {clsDB} {seqDB} {repdb}", shell=True, check=True, stdout=subprocess.DEVNULL, ) - + subprocess.run( f"mmseqs convert2fasta {repdb} {fasta}", shell=True, @@ -85,7 +85,6 @@ def get_reps_from_clustering(seqDB, clsDB, fasta): ) - def cls2pro( seqDB, clsDB, proDB, evalue=0.001, qid=0.0, qsc=-20, diff=1000, max_seq_id=0.9 ): @@ -94,11 +93,17 @@ def cls2pro( ) subprocess.run( - f"mmseqs createsubdb {clsDB} {seqDB} {clsDB}rep", shell=True, check=True, stdout=subprocess.DEVNULL + f"mmseqs createsubdb {clsDB} {seqDB} {clsDB}rep", + shell=True, + check=True, + stdout=subprocess.DEVNULL, ) subprocess.run( - f"mmseqs createsubdb {clsDB} {seqDB}_h {clsDB}rep_h", shell=True, check=True, stdout=subprocess.DEVNULL + f"mmseqs createsubdb {clsDB} {seqDB}_h {clsDB}rep_h", + shell=True, + check=True, + stdout=subprocess.DEVNULL, ) subprocess.run( @@ -298,7 +303,9 @@ def get_missing_seqs_as_fasta_using_tsv(db1, tsv, output): assert db1_lookup.is_file(), f"Can't find {db1_lookup}" # get headers in DB1 missing from DB2 - db1_dict = Parsers.mmseqs.get_mmseqs_internal_ids_as_dict(db1_lookup) # internal id : header + db1_dict = Parsers.mmseqs.get_mmseqs_internal_ids_as_dict( + db1_lookup + ) # internal id : header tsv_dict = Parsers.mmseqs.mmseqs_tsv_to_dict(tsv) # member : representative missing_ids = db1_dict.values() - tsv_dict.keys() diff --git a/PyIsland/FastaUtils/MSA.py b/PyIsland/FastaUtils/MSA.py index eabc178..6cb1fe5 100644 --- a/PyIsland/FastaUtils/MSA.py +++ b/PyIsland/FastaUtils/MSA.py @@ -11,82 +11,84 @@ from Bio.Align import substitution_matrices from Bio.SeqIO.FastaIO import SimpleFastaParser + def get_tables_for_msa_filtering(): - - #set a background frequency of AAs + # set a background frequency of AAs background_aa_freq = { - "A" : 0.07422, - "C" : 0.02469, - "D" : 0.05363, - "E" : 0.05431, - "F" : 0.04742, - "G" : 0.07415, - "H" : 0.02621, - "I" : 0.06792, - "K" : 0.05816, - "L" : 0.09891, - "M" : 0.02499, - "N" : 0.04465, - "P" : 0.03854, - "Q" : 0.03426, - "R" : 0.05161, - "S" : 0.05723, - "T" : 0.05089, - "V" : 0.07292, - "W" : 0.01303, - "Y" : 0.03228, - "X" : 0.00001, - } - - #get a blosum62 matrix + "A": 0.07422, + "C": 0.02469, + "D": 0.05363, + "E": 0.05431, + "F": 0.04742, + "G": 0.07415, + "H": 0.02621, + "I": 0.06792, + "K": 0.05816, + "L": 0.09891, + "M": 0.02499, + "N": 0.04465, + "P": 0.03854, + "Q": 0.03426, + "R": 0.05161, + "S": 0.05723, + "T": 0.05089, + "V": 0.07292, + "W": 0.01303, + "Y": 0.03228, + "X": 0.00001, + } + + # get a blosum62 matrix blosum62 = substitution_matrices.load("BLOSUM62") - #set a expectation of AAs - #expectation_aa = {} - #for aa1 in background_aa_freq.keys(): + # set a expectation of AAs + # expectation_aa = {} + # for aa1 in background_aa_freq.keys(): # expectation_aa[aa1] = 0 # for aa2 in background_aa_freq.keys(): # expectation_aa[aa1] = expectation_aa[aa1] + blosum62[aa1,aa2]*background_aa_freq[aa2] - + expectation_aa = aa_freq_to_score(background_aa_freq) return background_aa_freq, blosum62, expectation_aa + def aa_freq_to_score(col_aa_frq): """ expects a dictionary of AA : frequency returns a dictionary of AA : score - + score is sum of blosum62 scores of an index AA versus all other AAs """ - #set all AA weights as zero + # set all AA weights as zero weights = {} for aa in col_aa_frq.keys(): weights[aa] = 0 - - #get a blosum62 matrix + + # get a blosum62 matrix blosum62 = substitution_matrices.load("BLOSUM62") - - #assign weights to each AA + + # assign weights to each AA for aa1 in col_aa_frq.keys(): for aa2 in col_aa_frq.keys(): - weights[aa1] = weights[aa1] + (blosum62[aa1,aa2] * col_aa_frq[aa2]) - #print(f"AA1 is {aa1}, AA2 is {aa2}, frequency of AA2 is {col_aa_frq[aa2]}, blosum62 of {aa1}{aa2} is {blosum62[aa1,aa2]}, updating weight of AA1 to {weights[aa1]}") - + weights[aa1] = weights[aa1] + (blosum62[aa1, aa2] * col_aa_frq[aa2]) + # print(f"AA1 is {aa1}, AA2 is {aa2}, frequency of AA2 is {col_aa_frq[aa2]}, blosum62 of {aa1}{aa2} is {blosum62[aa1,aa2]}, updating weight of AA1 to {weights[aa1]}") + return weights -def aln_to_df(aln, msa_format='fasta'): +def aln_to_df(aln, msa_format="fasta"): """ #deprecated convertsa biopython alignment object into a pandas dataframe """ - + headers = [rec.id for rec in aln] seqs = [list(rec) for rec in aln] - df = pd.DataFrame.from_records(seqs, index = headers).astype('str') + df = pd.DataFrame.from_records(seqs, index=headers).astype("str") return df + def fa_to_df(fa, is_msa=False): """ Converts a fasta-formatted file to a pandas dataframe @@ -98,173 +100,178 @@ def fa_to_df(fa, is_msa=False): headers.append(title) seqs.append(list(seq.upper())) - - df = pd.DataFrame.from_records(seqs, index = headers) - - #check to make sure the MSA is aligned properly + df = pd.DataFrame.from_records(seqs, index=headers) + + # check to make sure the MSA is aligned properly if is_msa: assert df.isna().sum().sum() == 0, f"All sequences must be the same length" - + return df + def df_to_fa(df, output): """ converts a pandas dataframe of sequences to a fasta-formatted file """ - - #combine the columns into a single string + + # combine the columns into a single string series = df.T.apply("".join) - - #write the output - with open(output, 'w') as o: + + # write the output + with open(output, "w") as o: for header, seq in series.items(): - print(f">{header}", seq, sep = '\n', file = o) - + print(f">{header}", seq, sep="\n", file=o) + + def get_gap_cols_from_msa(df, grcut=1): """ -grcut = no more than r fraction of gaps - + returns a list of columns that are more than r fraction of gaps """ - #df = msa_to_df(msa) + # df = msa_to_df(msa) num_seqs, num_columns = df.shape EPSILON = 1e-6 - #calculate the minimum number of non-NA values per column + # calculate the minimum number of non-NA values per column max_gaps_in_column = round((num_seqs * grcut) + EPSILON, 3) - #convert gaps to NA, count how many NAs per column + # convert gaps to NA, count how many NAs per column gap_count_series = df.replace("-", np.nan).isnull().sum() - #get columns where number of NAs is >= max_gaps - gap_col_list = gap_count_series.loc[lambda s: s >= max_gaps_in_column].index.tolist() - #df2 = df.loc[:, good_cols] - + # get columns where number of NAs is >= max_gaps + gap_col_list = gap_count_series.loc[ + lambda s: s >= max_gaps_in_column + ].index.tolist() + # df2 = df.loc[:, good_cols] + ##RO - + return gap_col_list + def calc_HH94(df, grswe=0.51): """ - sequences have an initial weight of 1/nseq - Calculates the score of a position in an MSA as 1/rd r = # of dif characters (excluding gaps) in a column - d = number of times the character of the considered sequence + d = number of times the character of the considered sequence and position appears in the considered alignment column - + - columns in MSA that are >grswe fraction of gap positions are ignored - + - the scores of a sequence's aligned positions are added to its weight - + Returns a series object containing the sequence name and HH94 weight """ - #df = msa_to_df(msa) - - #convert gaps to NA so they are not counted in the weights + # df = msa_to_df(msa) + + # convert gaps to NA so they are not counted in the weights df2 = df.replace("-", np.nan) - - #get a list of gap columns that are more than grswe fraction of gaps + + # get a list of gap columns that are more than grswe fraction of gaps gap_cols = get_gap_cols_from_msa(df, grswe) - - #initial sequence weight is just 1 / nseq + + # initial sequence weight is just 1 / nseq initial_weight = 1 / len(df) - colaa_weights_nest_dict = {} for column in df2.columns: if column not in gap_cols: - - #get a count of the frequency of each AA in the column - #by default, NAs (gaps) are not counted + # get a count of the frequency of each AA in the column + # by default, NAs (gaps) are not counted col_series = df2[[column]].value_counts() - #calculate the AA weights in the column - #the length of the series is the number of unique AAs + # calculate the AA weights in the column + # the length of the series is the number of unique AAs colaa_weights = round(1 / (col_series * len(col_series)), 3) - #store the column-specific results in a dict of dict + # store the column-specific results in a dict of dict colaa_weights_dict = colaa_weights.to_dict() colaa_weights_nest_dict[column] = colaa_weights_dict - #replace the AAs in the MSA with their column-specific weights + # replace the AAs in the MSA with their column-specific weights df3 = df2.replace(colaa_weights_nest_dict) - - #for each sequence, sum all the values and add the initial wieght - hh94_series = df3.sum(axis = 1).round(3) + initial_weight - - #normalize + + # for each sequence, sum all the values and add the initial wieght + hh94_series = df3.sum(axis=1).round(3) + initial_weight + + # normalize hh94_mean = hh94_series.mean() if hh94_mean < 0: hh94_mean = 1 hh94_norm = round(hh94_series / hh94_series.mean(), 4) - - #return a dictionary containing the sequence and its normalized weight + + # return a dictionary containing the sequence and its normalized weight return hh94_norm.to_dict() - + + def get_col_aa_frequencies(column, hh94_scores, background_aa_freq): """ Accepts a dictionary or pandas series containing seq : aligned char - + calculates the column=specific AA weights, given HH94 scores of sequences - + returns a dictionary of AA : weight """ - - #set all AA frequencies as zero + + # set all AA frequencies as zero aafreq = {} for aa in background_aa_freq.keys(): aafreq[aa] = 0 - - #set gap score as zero + + # set gap score as zero ngap = 0 - - for seq,char in column.items(): + + for seq, char in column.items(): if char == "-": - - #increment the background frequencies of all AAs + # increment the background frequencies of all AAs for aa, score in aafreq.items(): - aafreq[aa] = aafreq[aa] + background_aa_freq[aa]*hh94_scores[seq] - - #increment the gap frequency + aafreq[aa] = aafreq[aa] + background_aa_freq[aa] * hh94_scores[seq] + + # increment the gap frequency ngap += hh94_scores[seq] else: try: aafreq[char] = aafreq[char] + hh94_scores[seq] except KeyError: pass - - #add in ngaps to the dict + + # add in ngaps to the dict aafreq["-"] = ngap - + return aafreq + def get_best_aa_and_score(aascores): """ Expects a dictionary of AA : score Selects the AA with the highest score - + Returns AA and score """ - - #get top AA and score, as a tuple - bestaa_tuple = max(aascores.items(), key = lambda k : k[1]) - - return bestaa_tuple[0],bestaa_tuple[1] - + + # get top AA and score, as a tuple + bestaa_tuple = max(aascores.items(), key=lambda k: k[1]) + + return bestaa_tuple[0], bestaa_tuple[1] + + def calc_homogeneity(aa, score, nseqs, expectaascores, blosum62): """ calculates the homogeneity of a column in an MSA homogeneity ranges from 0-1 """ - - homogeneity = (score/nseqs - expectaascores[aa]) / (blosum62[aa, aa] - expectaascores[aa]) + + homogeneity = (score / nseqs - expectaascores[aa]) / ( + blosum62[aa, aa] - expectaascores[aa] + ) return round(homogeneity, 4) + def assign_consensus(bestaa, grcon, homo, hocon, ngaps, nseqs, ambiguousAA="X"): - """ - - """ - #Default consensus is gap + """ """ + # Default consensus is gap caa = "-" if ngaps <= nseqs * grcon: @@ -272,13 +279,23 @@ def assign_consensus(bestaa, grcon, homo, hocon, ngaps, nseqs, ambiguousAA="X"): caa = bestaa else: caa = ambiguousAA - #caa = bestaa + # caa = bestaa return caa -def filter_msa(msa, msa_out, grcut=1.1, hocut=-100, gcon=0.499, hcon=0.01, conplus=False, grswe=0.51): + +def filter_msa( + msa, + msa_out, + grcut=1.1, + hocut=-100, + gcon=0.499, + hcon=0.01, + conplus=False, + grswe=0.51, +): """ - MSA filtering + trimming, as described in + MSA filtering + trimming, as described in Esterman et. al (2021) https://doi.org/10.1093/ve/veab015 """ @@ -287,73 +304,72 @@ def filter_msa(msa, msa_out, grcut=1.1, hocut=-100, gcon=0.499, hcon=0.01, conpl df = fa_to_df(msa, is_msa=True) if grcut <= 1: - #print(f'getting a list of positions that are > {grcut * 100}% gaps') - - #get a list of gap positions + # print(f'getting a list of positions that are > {grcut * 100}% gaps') + + # get a list of gap positions gapcols = get_gap_cols_from_msa(df, grcut) - + bad_cols += gapcols - + if hocut >= 0 or conplus: - - #then we need to iterate through each position in the MSA - #print(f'iterating through the alignment') - - #consensus seq is empty + # then we need to iterate through each position in the MSA + # print(f'iterating through the alignment') + + # consensus seq is empty conseq = [] - - #get the necessary tables - background_aa_freq,blosum62,expectedAAscores = get_tables_for_msa_filtering() - - #convert the msa to a dataframe + + # get the necessary tables + background_aa_freq, blosum62, expectedAAscores = get_tables_for_msa_filtering() + + # convert the msa to a dataframe nseqs, ncols = df.shape - - #get the HH94 scores of the sequences + + # get the HH94 scores of the sequences hh94_scores = calc_HH94(df, grswe) - - for col in range(0,ncols): - - #get the column + + for col in range(0, ncols): + # get the column column = df[col] - #calculate the AA frequencies + # calculate the AA frequencies col_aa_frq = get_col_aa_frequencies(column, hh94_scores, background_aa_freq) - #the last item in the dictionary is the gap frequencies - #record it and remove from the dictionary, - #otherwise the next fxn throws an error + # the last item in the dictionary is the gap frequencies + # record it and remove from the dictionary, + # otherwise the next fxn throws an error ngaps = col_aa_frq.popitem()[1] - #convert the frequencies to scores + # convert the frequencies to scores col_aa_scores = aa_freq_to_score(col_aa_frq) - #get the top scoring AA + # get the top scoring AA bestaa, bestaascore = get_best_aa_and_score(col_aa_scores) - - #assess the homogeneity of the AA, if the bestscoring AA is above expectation - #if bestaascore/nseqs > expectedAAscores[bestaa]: - homo = calc_homogeneity(bestaa, bestaascore, nseqs, expectedAAscores, blosum62) - - #record columns that are below the homogeneity threshold + + # assess the homogeneity of the AA, if the bestscoring AA is above expectation + # if bestaascore/nseqs > expectedAAscores[bestaa]: + homo = calc_homogeneity( + bestaa, bestaascore, nseqs, expectedAAscores, blosum62 + ) + + # record columns that are below the homogeneity threshold if homo < hocut: bad_cols.append(col) - #assign the consensus + # assign the consensus con = assign_consensus(bestaa, gcon, homo, hcon, ngaps, nseqs) conseq.append(con) - + if conplus: - - #make a dataframe from the consensus - df_c = pd.DataFrame.from_records([conseq], index = ["CONSENSUS"]) - - #update the original MSA datafarme - df = pd.concat([df_c,df]) - - #remove gap/nonhomogenous columns + # make a dataframe from the consensus + df_c = pd.DataFrame.from_records([conseq], index=["CONSENSUS"]) + + # update the original MSA datafarme + df = pd.concat([df_c, df]) + + # remove gap/nonhomogenous columns if len(bad_cols) > 0: - df = df.drop(columns = bad_cols) + df = df.drop(columns=bad_cols) - #write the output - with open(msa_out, 'w') as f: - df_to_fa(df, msa_out) \ No newline at end of file + # write the output + with open(msa_out, "w") as f: + df_to_fa(df, msa_out) diff --git a/PyIsland/FastaUtils/__init__.py b/PyIsland/FastaUtils/__init__.py index 97057c9..f646ca5 100644 --- a/PyIsland/FastaUtils/__init__.py +++ b/PyIsland/FastaUtils/__init__.py @@ -104,42 +104,41 @@ def fa_strict(fasta_in, fasta_out, min_len=0): with open(fasta_out, "w") as o: SeqIO.write(seq_records, fasta_out, "fasta") print(f"wrote {len(seq_records)}/{i} sequences") - -def get_subseq_coords(query,seq_record): + + +def get_subseq_coords(query, seq_record): """ reports 1-based start/stop/strand of a query subsequence in a subject seq query subsequence is a string subject is BioPython SeqRecord - + Returns a tuple of start,stop, strand Start = -1 if the subsequence is not found """ - + qlen = len(query) - - - #search the positive strand + + # search the positive strand start = seq_record.seq.find(query) stop = start + qlen strand = "1" - - #if no hit, search the RC + + # if no hit, search the RC if start == -1: - - #reverse complement the query + # reverse complement the query query = str(Seq(query).reverse_complement()) - #search + # search start = seq_record.seq.find(query) stop = start + qlen strand = "-1" - - #no hits on either strand + + # no hits on either strand if start == -1: start = -2 stop = -1 print(f"could not find the qseq on either + or - strand") - - #update to 1-based - start +=1 - return start,stop,strand + + # update to 1-based + start += 1 + return start, stop, strand diff --git a/PyIsland/GCP/bigquery.py b/PyIsland/GCP/bigquery.py index 9b70b0f..c0daee7 100644 --- a/PyIsland/GCP/bigquery.py +++ b/PyIsland/GCP/bigquery.py @@ -5,6 +5,7 @@ from google.cloud import bigquery from atxlib import GCP + def upload_gff_to_bq(gff, table_id): """ Accepts a GFF file and a BQ location @@ -126,10 +127,11 @@ def csv2fasta(csv, output): i += 1 return i -def write_table_to_gcs(input_table:str, uri:str, location='us-central1'): + +def write_table_to_gcs(input_table: str, uri: str, location="us-central1"): client = bigquery.Client() job_config = bigquery.ExtractJobConfig() - #job_config.destination_format = bigquery.DestinationFormat.CSV + # job_config.destination_format = bigquery.DestinationFormat.CSV job_config.print_header = False extract_job = client.extract_table( @@ -232,18 +234,21 @@ def get_seqs_from_bq( print(f"Deleting local output {output_blob_name}") Path(output_blob_name).unlink() -def blast_rep_to_member(seq_table:str, clustering_table:str, blast_table:str, output_table:str): + +def blast_rep_to_member( + seq_table: str, clustering_table: str, blast_table: str, output_table: str +): """ Accepts a bigquery table of sequences, a bigquery table of clustering information, and a bigquery table of blast hits and returns a table of sequences that have a blast hit to a representative sequence """ client = bigquery.Client() - #extract seqs to table + # extract seqs to table job_config = bigquery.QueryJobConfig( - dry_run = False, - destination = output_table, - write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE, + dry_run=False, + destination=output_table, + write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, ) sql = f""" @@ -265,19 +270,25 @@ def blast_rep_to_member(seq_table:str, clustering_table:str, blast_table:str, ou destination_table = client.get_table(output_table) print("Loaded {} rows.".format(destination_table.num_rows)) -def get_seqs_from_bq_as_fasta(seq_table:str, output_uri:str, keep_local=True): + +def get_seqs_from_bq_as_fasta(seq_table: str, output_uri: str, keep_local=True): """ - Accepts a bigquery table with two rows "protein_id" and "sequence" and returns a fasta file + Accepts a bigquery table with two rows "protein_id" and "sequence" and returns a fasta file """ - #set paths - bucket_name, output_blob_path = GCP.storage.extract_bucketname_and_blob_path_from_uri( - output_uri - ) - output_blob_name = output_blob_path.split("/")[-1] # e.g. 'output.faa' - output_blob_dir = "/".join(output_blob_path.split("/")[:-1]) + "/" # e.g. 'path/to/folder/' - tmp_blob_dir = output_blob_dir + "tmp_blobs/" # e.g. 'path/to/folder/tmp_blobs/' - tmp_blobs = "gs://" + bucket_name + "/" + tmp_blob_dir + "split*" #e.g. 'gs://bucket/path/to/folder/tmp_blobs/split*' + # set paths + ( + bucket_name, + output_blob_path, + ) = GCP.storage.extract_bucketname_and_blob_path_from_uri(output_uri) + output_blob_name = output_blob_path.split("/")[-1] # e.g. 'output.faa' + output_blob_dir = ( + "/".join(output_blob_path.split("/")[:-1]) + "/" + ) # e.g. 'path/to/folder/' + tmp_blob_dir = output_blob_dir + "tmp_blobs/" # e.g. 'path/to/folder/tmp_blobs/' + tmp_blobs = ( + "gs://" + bucket_name + "/" + tmp_blob_dir + "split*" + ) # e.g. 'gs://bucket/path/to/folder/tmp_blobs/split*' print(f"Writing {seq_table} to {tmp_blobs}") write_table_to_gcs(seq_table, tmp_blobs) @@ -294,7 +305,7 @@ def get_seqs_from_bq_as_fasta(seq_table:str, output_uri:str, keep_local=True): num_seqs = 0 with open(output_blob_name, "w") as outfile: for split in Path(".").glob("split*"): - #print(f"Converting {split} to {split}.faa") + # print(f"Converting {split} to {split}.faa") i = csv2fasta(split, split.with_suffix(".faa")) num_seqs += i with open(split.with_suffix(".faa")) as infile: @@ -313,9 +324,9 @@ def get_seqs_from_bq_as_fasta(seq_table:str, output_uri:str, keep_local=True): if not keep_local: Path(output_blob_name).unlink() - - #remove temporary CSV files from bucket + + # remove temporary CSV files from bucket print(f"Removing {tmp_blobs} from bucket") blobs = GCP.storage.list_blobs_with_prefix(bucket_name, tmp_blob_dir) for blob in blobs: - GCP.storage.delete_blob(bucket_name, blob.name) \ No newline at end of file + GCP.storage.delete_blob(bucket_name, blob.name) diff --git a/PyIsland/GCP/storage.py b/PyIsland/GCP/storage.py index 0fc56ac..ca73e51 100644 --- a/PyIsland/GCP/storage.py +++ b/PyIsland/GCP/storage.py @@ -12,7 +12,7 @@ def get_blobs_in_bucket_as_set(bucket, prefix): Accepts strings pointing to a bucket name and a "prefix" (i.e., a "folder") The prefix should end with a "/" Returns a python set of "filenames" - + """ client = storage.Client() bucket = client.get_bucket(bucket) @@ -21,8 +21,8 @@ def get_blobs_in_bucket_as_set(bucket, prefix): blob_set = set() for blob in blobs: - f = Path(blob.name) #.e.g, path/to/file.txt - blob_set.add(f.name) #e.g., file.txt + f = Path(blob.name) # .e.g, path/to/file.txt + blob_set.add(f.name) # e.g., file.txt return blob_set @@ -75,6 +75,7 @@ def delete_blob(bucket_name, blob_name): blob = bucket.blob(blob_name) blob.delete() + def list_blobs_with_prefix(bucket_name, prefix, delimiter="/"): """Lists all the blobs in the bucket.""" # bucket_name = "your-bucket-name" diff --git a/PyIsland/Islands/__init__.py b/PyIsland/Islands/__init__.py index 27df4bf..8229f38 100644 --- a/PyIsland/Islands/__init__.py +++ b/PyIsland/Islands/__init__.py @@ -244,8 +244,8 @@ def select_neighborhood(gff: str, seedgff: str, window: int, check=True): df_neighborhood = df.query( "contig == @contig and start <= @stop and stop >= @start" ) - - #label island + + # label island isl_start = str(df_neighborhood.start.min()) isl_stop = str(df_neighborhood.stop.max()) df_neighborhood["island"] = str(contig) + ":" + isl_start + "-" + isl_stop diff --git a/PyIsland/Parsers/blast.py b/PyIsland/Parsers/blast.py index a9f95a8..a7da080 100644 --- a/PyIsland/Parsers/blast.py +++ b/PyIsland/Parsers/blast.py @@ -129,12 +129,12 @@ def remove_overlapping_hits(df, threshold): def select_nonoverlapping_hits(df, threshold=0.33): df_list = [] - - #qstart must be > qend - #otherwise, overlap can be falsely calculated - df[["qstart","qend"]] = (df.loc[:,["qstart","qend"]] - .mask(df.qstart > df.qend, df[["qend","qstart"]], axis = 'index') - ) + + # qstart must be > qend + # otherwise, overlap can be falsely calculated + df[["qstart", "qend"]] = df.loc[:, ["qstart", "qend"]].mask( + df.qstart > df.qend, df[["qend", "qstart"]], axis="index" + ) for name, group in df.groupby("qaccver"): df2 = overlap_filtering.remove_overlapping_hits(group, threshold) diff --git a/PyIsland/Parsers/hhsuite.py b/PyIsland/Parsers/hhsuite.py index 79af895..67a2cdd 100644 --- a/PyIsland/Parsers/hhsuite.py +++ b/PyIsland/Parsers/hhsuite.py @@ -89,6 +89,3 @@ def normalize_scores_by_selfhit(df): df["score_norm"] = -np.log(df["score"].astype("float") / selfscore) return df - - - diff --git a/PyIsland/Parsers/mmseqs.py b/PyIsland/Parsers/mmseqs.py index 45bb843..d5423b9 100644 --- a/PyIsland/Parsers/mmseqs.py +++ b/PyIsland/Parsers/mmseqs.py @@ -146,21 +146,22 @@ def mmseqs_lookup_to_dict(lookup): def mmseqs_linear_tsv_to_df(tsv): - #df_cls = pd.read_csv(tsv, sep="\t", names=["cluster", "members"]) + # df_cls = pd.read_csv(tsv, sep="\t", names=["cluster", "members"]) df_cls = mmseqs_tsv_to_df(tsv) - + # Explode the members column df_cls["members"] = df_cls.members.str.split(" ").to_list() df_cls = df_cls.explode("members") return df_cls + def mmseqs_tsv_to_df(tsv): df_cls = pd.read_csv(tsv, sep="\t", names=["cluster", "members"]) ## Explode the members column - #df_cls["members"] = df_cls.members.str.split(" ").to_list() - #df_cls = df_cls.explode("members") + # df_cls["members"] = df_cls.members.str.split(" ").to_list() + # df_cls = df_cls.explode("members") return df_cls @@ -178,7 +179,8 @@ def cls2fa(fasta, member_to_cluster, outfolder): with open(cluster_fasta_name, "a") as fasta_file: fasta_file.write(seq.format("fasta")) - + + def get_mmseqs_internal_ids_as_set(index): mmseqs_internal_ids = set() with open(index) as infile: diff --git a/PyIsland/tests/__init__.py b/PyIsland/tests/__init__.py index f585fa6..eb5c095 100644 --- a/PyIsland/tests/__init__.py +++ b/PyIsland/tests/__init__.py @@ -1,9 +1,10 @@ import hashlib from functools import partial + def md5sum(filename): - with open(filename, mode='rb') as f: + with open(filename, mode="rb") as f: d = hashlib.md5() - for buf in iter(partial(f.read, 128), b''): + for buf in iter(partial(f.read, 128), b""): d.update(buf) - return d.hexdigest() \ No newline at end of file + return d.hexdigest() diff --git a/PyIsland/tests/fastautils/test_fastautils.py b/PyIsland/tests/fastautils/test_fastautils.py index cbeddb0..c152589 100644 --- a/PyIsland/tests/fastautils/test_fastautils.py +++ b/PyIsland/tests/fastautils/test_fastautils.py @@ -3,79 +3,77 @@ from PyIsland import tests import pytest + @pytest.fixture def get_test_fasta(): - return Path('PyIsland/tests/example.fasta') + return Path("PyIsland/tests/example.fasta") + def test_subseq_fasta_biopython(get_test_fasta): - seqs_to_get = ['seq1'] - - outfasta = Path('PyIsland/tests/fastautils/subseq_fasta_biopython.fasta') - expected_output = Path('PyIsland/tests/fastautils/subseq_fasta_expected_output.fasta') - - FastaUtils.subseq_fasta_biopython(seqs_to_get, - get_test_fasta, - outfasta) - + seqs_to_get = ["seq1"] + + outfasta = Path("PyIsland/tests/fastautils/subseq_fasta_biopython.fasta") + expected_output = Path( + "PyIsland/tests/fastautils/subseq_fasta_expected_output.fasta" + ) + + FastaUtils.subseq_fasta_biopython(seqs_to_get, get_test_fasta, outfasta) + output_hash = tests.md5sum(outfasta) expected_output_hash = tests.md5sum(expected_output) - + assert output_hash == expected_output_hash outfasta.unlink() - + + def test_slice_fasta_biopython(get_test_fasta): - seqs_to_get = 'seq2' + seqs_to_get = "seq2" start = 4 stop = 9 - outfasta = Path('PyIsland/tests/fastautils/slice_fasta_biopython.fasta') - expected_output = Path('PyIsland/tests/fastautils/slice_fasta_expected_output.fasta') - - FastaUtils.slice_fasta_biopython(seqs_to_get, - start, - stop, - get_test_fasta, - outfasta - ) - + outfasta = Path("PyIsland/tests/fastautils/slice_fasta_biopython.fasta") + expected_output = Path( + "PyIsland/tests/fastautils/slice_fasta_expected_output.fasta" + ) + + FastaUtils.slice_fasta_biopython(seqs_to_get, start, stop, get_test_fasta, outfasta) + output_hash = tests.md5sum(outfasta) expected_output_hash = tests.md5sum(expected_output) - + assert output_hash == expected_output_hash outfasta.unlink() - + + def test_fa_strict(get_test_fasta): - - outfasta = Path('PyIsland/tests/fastautils/fa_strict.fasta') - expected_output = Path('PyIsland/tests/fastautils/fa_strict_expected_output.fasta') - - FastaUtils.fa_strict(get_test_fasta, - outfasta, - min_len=5 - ) - + outfasta = Path("PyIsland/tests/fastautils/fa_strict.fasta") + expected_output = Path("PyIsland/tests/fastautils/fa_strict_expected_output.fasta") + + FastaUtils.fa_strict(get_test_fasta, outfasta, min_len=5) + output_hash = tests.md5sum(outfasta) expected_output_hash = tests.md5sum(expected_output) - + assert output_hash == expected_output_hash outfasta.unlink() - + + def test_count_seqs_in_fasta(get_test_fasta): - num_seqs = FastaUtils.count_seqs_in_fasta(get_test_fasta) - + assert num_seqs == 6 - + + def test_deduplicate_fasta(get_test_fasta): - - outfasta = Path('PyIsland/tests/fastautils/dedup.fasta') - expected_output = Path('PyIsland/tests/fastautils/dedup_expected_output.fasta') - - FastaUtils.deduplicate_fasta(get_test_fasta, - outfasta, - ) - + outfasta = Path("PyIsland/tests/fastautils/dedup.fasta") + expected_output = Path("PyIsland/tests/fastautils/dedup_expected_output.fasta") + + FastaUtils.deduplicate_fasta( + get_test_fasta, + outfasta, + ) + output_hash = tests.md5sum(outfasta) expected_output_hash = tests.md5sum(expected_output) - + assert output_hash == expected_output_hash - outfasta.unlink() \ No newline at end of file + outfasta.unlink() diff --git a/PyIsland/tests/parsers/test_parsers.py b/PyIsland/tests/parsers/test_parsers.py index eb6c57d..de576ca 100644 --- a/PyIsland/tests/parsers/test_parsers.py +++ b/PyIsland/tests/parsers/test_parsers.py @@ -3,87 +3,90 @@ from PyIsland import tests import pytest + @pytest.fixture def get_test_gff(): - return Path('PyIsland/tests/lambda.gff') + return Path("PyIsland/tests/lambda.gff") + @pytest.fixture def get_test_gbk(): - return Path('PyIsland/tests/lambda.gbk') + return Path("PyIsland/tests/lambda.gbk") + @pytest.fixture def get_test_fasta(): - return Path('PyIsland/tests/lambda.fna') + return Path("PyIsland/tests/lambda.fna") + @pytest.fixture def get_test_orfs(): - return Path('PyIsland/tests/lambda.faa') + return Path("PyIsland/tests/lambda.faa") + def test_gff_to_df(get_test_gff): - df = Parsers.gff_to_df(get_test_gff) - - #check I can get a contig using the .query method and a string + + # check I can get a contig using the .query method and a string assert not df.query('contig == "NC_001416.1"').empty - -#def test_gff_to_df_with_prodigal_gff(): - + + +# def test_gff_to_df_with_prodigal_gff(): + # df = Parsers.gff_to_df('PyIsland/tests/example_prodigal.gff') - - #check I can get a contig using the .query method and a string + +# check I can get a contig using the .query method and a string # assert not df.query('contig == "10007077.1"').empty - + + def test_gbk_to_df(get_test_gbk): - df = Parsers.genbank_to_df(get_test_gbk) - + features = df.feature_type.unique().tolist() - expected_features = ['source', - 'gene', - 'CDS', - 'sig_peptide', - 'mat_peptide', - 'mRNA', - 'variation', - 'misc_recomb', - 'misc_binding', - 'regulatory', - 'misc_feature', - 'unsure' - ] - - - #check I get all features + expected_features = [ + "source", + "gene", + "CDS", + "sig_peptide", + "mat_peptide", + "mRNA", + "variation", + "misc_recomb", + "misc_binding", + "regulatory", + "misc_feature", + "unsure", + ] + + # check I get all features assert features == expected_features - - #check I labelled protein_ids + + # check I labelled protein_ids assert df.protein_id.notna().sum() == 73 - - #check products are labelled - assert len(df.query('product.notna()')) == 73 - - + # check products are labelled + assert len(df.query("product.notna()")) == 73 + + def _test_gff_to_df(get_test_gff): - seqs_to_get = ['seq1'] - - outfasta = Path('PyIsland/tests/fastautils/subseq_fasta_biopython.fasta') - + seqs_to_get = ["seq1"] + + outfasta = Path("PyIsland/tests/fastautils/subseq_fasta_biopython.fasta") + df = Parsers.gff_to_df(get_test_gff) - + output_hash = tests.md5sum(outfasta) expected_output_hash = tests.md5sum(expected_output) - + assert output_hash == expected_output_hash outfasta.unlink() - - + + def test_prodigal_gff_to_df(get_test_gff): - - df = Parsers.prodigal.prodigal_gff_to_df('PyIsland/tests/example_prodigal.gff') - - #check I can get a contig using the .query method and a string + df = Parsers.prodigal.prodigal_gff_to_df("PyIsland/tests/example_prodigal.gff") + + # check I can get a contig using the .query method and a string assert not df.query('contig == "10007077.1"').empty - - #check the protein_id was parsed - assert df.protein_id.isna().sum() == 0 \ No newline at end of file + + # check the protein_id was parsed + assert df.protein_id.isna().sum() == 0 diff --git a/docs/conf.py b/docs/conf.py index 210b332..22c4ded 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,28 +29,28 @@ extensions = [] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'iscb_screen' +project = "iscb_screen" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.1' +version = "0.1" # The full version, including alpha/beta/rc tags. -release = '0.1' +release = "0.1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -64,7 +64,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all documents. # default_role = None @@ -81,7 +81,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -91,7 +91,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = "default" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -120,7 +120,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. @@ -164,7 +164,7 @@ # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'iscb_screendoc' +htmlhelp_basename = "iscb_screendoc" # -- Options for LaTeX output -------------------------------------------------- @@ -172,10 +172,8 @@ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # 'preamble': '', } @@ -183,10 +181,7 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', - 'iscb_screen.tex', - u'iscb_screen Documentation', - u"sean benler", 'manual'), + ("index", "iscb_screen.tex", "iscb_screen Documentation", "sean benler", "manual"), ] # The name of an image file (relative to this directory) to place at the top of @@ -214,10 +209,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'iscb_screen', u'iscb_screen Documentation', - [u"sean benler"], 1) -] +man_pages = [("index", "iscb_screen", "iscb_screen Documentation", ["sean benler"], 1)] # If true, show URL addresses after external links. # man_show_urls = False @@ -229,9 +221,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'iscb_screen', u'iscb_screen Documentation', - u"sean benler", 'iscb_screen', - 'Finding IscB/Cas9 in genomic databases', 'Miscellaneous'), + ( + "index", + "iscb_screen", + "iscb_screen Documentation", + "sean benler", + "iscb_screen", + "Finding IscB/Cas9 in genomic databases", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. diff --git a/setup.py b/setup.py index 729836b..fdcbe91 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,19 @@ import glob from setuptools import find_packages, setup -requirements = [req.strip() for req in open('requirements.txt', 'r').readlines() if not req.startswith('#')] +requirements = [ + req.strip() + for req in open("requirements.txt", "r").readlines() + if not req.startswith("#") +] setup( - name='PyIsland', - scripts = [script for script in glob.glob('bin/*')], + name="PyIsland", + scripts=[script for script in glob.glob("bin/*")], packages=find_packages(), - install_requires = requirements, - version='0.0.1', - description='Libary for comparative genomics', - author='SMB', - license='', + install_requires=requirements, + version="0.0.1", + description="Libary for comparative genomics", + author="SMB", + license="", ) diff --git a/test_environment.py b/test_environment.py index d0ac4a7..5381850 100644 --- a/test_environment.py +++ b/test_environment.py @@ -10,16 +10,17 @@ def main(): elif REQUIRED_PYTHON == "python3": required_major = 3 else: - raise ValueError("Unrecognized python interpreter: {}".format( - REQUIRED_PYTHON)) + raise ValueError("Unrecognized python interpreter: {}".format(REQUIRED_PYTHON)) if system_major != required_major: raise TypeError( "This project requires Python {}. Found: Python {}".format( - required_major, sys.version)) + required_major, sys.version + ) + ) else: print(">>> Development environment passes all tests!") -if __name__ == '__main__': +if __name__ == "__main__": main()