From 82d0079159c924a155a4817de7a3f40cfcc0f59b Mon Sep 17 00:00:00 2001
From: sean-bam <34106755+sean-bam@users.noreply.github.com>
Date: Wed, 21 Feb 2024 15:22:05 -0800
Subject: [PATCH] blackify; rename to pyIsland

---
 PyIsland/Drivers/hhsuite.py                  |   3 +-
 PyIsland/Drivers/mmseqs.py                   |  33 +-
 PyIsland/FastaUtils/MSA.py                   | 354 ++++++++++---------
 PyIsland/FastaUtils/__init__.py              |  35 +-
 PyIsland/GCP/bigquery.py                     |  53 +--
 PyIsland/GCP/storage.py                      |   7 +-
 PyIsland/Islands/__init__.py                 |   4 +-
 PyIsland/Parsers/blast.py                    |  12 +-
 PyIsland/Parsers/hhsuite.py                  |   3 -
 PyIsland/Parsers/mmseqs.py                   |  12 +-
 PyIsland/tests/__init__.py                   |   7 +-
 PyIsland/tests/fastautils/test_fastautils.py |  96 +++--
 PyIsland/tests/parsers/test_parsers.py       | 109 +++---
 docs/conf.py                                 |  46 ++-
 setup.py                                     |  20 +-
 test_environment.py                          |   9 +-
 16 files changed, 421 insertions(+), 382 deletions(-)

diff --git a/PyIsland/Drivers/hhsuite.py b/PyIsland/Drivers/hhsuite.py
index 8178080..3fddb65 100644
--- a/PyIsland/Drivers/hhsuite.py
+++ b/PyIsland/Drivers/hhsuite.py
@@ -67,6 +67,7 @@ def get_hhsuite_neff(msa):
     neff = float(p1.stdout.strip().split()[10])
     return round(neff, 2)
 
+
 def align_clusters_with_hhalign(input_dir, output_dir, df_clustering):
     """
     accepts a table of profile, length and cluster assignment
@@ -115,4 +116,4 @@ def align_clusters_with_hhalign(input_dir, output_dir, df_clustering):
                 f"hhalign -i {query_profile} {templates} -M 50 -glob -id 100 -diff inf -oa3m {output_a3m}",
                 shell=True,
                 check=True,
-            )
\ No newline at end of file
+            )
diff --git a/PyIsland/Drivers/mmseqs.py b/PyIsland/Drivers/mmseqs.py
index 846cdbe..1fb7705 100644
--- a/PyIsland/Drivers/mmseqs.py
+++ b/PyIsland/Drivers/mmseqs.py
@@ -3,7 +3,7 @@
 import shutil
 import tempfile
 
-from pyIsland import Parsers
+from PyIsland import Parsers
 
 
 def run_mmseqs_cls(
@@ -26,7 +26,7 @@ def run_mmseqs_cls(
 
     # seqdb = Path(f'{dbname}seqDB')
     # clsdb = Path(f'{dbname}clsDB')
-    tmp_dir = tempfile.TemporaryDirectory(dir='.')
+    tmp_dir = tempfile.TemporaryDirectory(dir=".")
     tmp_dir_path = Path(tmp_dir.name)
     tsv = Path(tsv)
 
@@ -62,21 +62,21 @@ def run_mmseqs_cls(
     #        file.unlink()
     #    except IsADirectoryError:
     #        pass
-    #shutil.rmtree("tmpmmseqsdb/")
-    
+    # shutil.rmtree("tmpmmseqsdb/")
+
+
 def get_reps_from_clustering(seqDB, clsDB, fasta):
-    
-    tmp_dir = tempfile.TemporaryDirectory(dir='.')
+    tmp_dir = tempfile.TemporaryDirectory(dir=".")
     tmp_dir_path = Path(tmp_dir.name)
-    repdb = tmp_dir_path / Path('repDB')
-    
+    repdb = tmp_dir_path / Path("repDB")
+
     subprocess.run(
         f"mmseqs createsubdb {clsDB} {seqDB} {repdb}",
         shell=True,
         check=True,
         stdout=subprocess.DEVNULL,
     )
-    
+
     subprocess.run(
         f"mmseqs convert2fasta {repdb} {fasta}",
         shell=True,
@@ -85,7 +85,6 @@ def get_reps_from_clustering(seqDB, clsDB, fasta):
     )
 
 
-
 def cls2pro(
     seqDB, clsDB, proDB, evalue=0.001, qid=0.0, qsc=-20, diff=1000, max_seq_id=0.9
 ):
@@ -94,11 +93,17 @@ def cls2pro(
     )
 
     subprocess.run(
-        f"mmseqs createsubdb {clsDB} {seqDB} {clsDB}rep", shell=True, check=True, stdout=subprocess.DEVNULL
+        f"mmseqs createsubdb {clsDB} {seqDB} {clsDB}rep",
+        shell=True,
+        check=True,
+        stdout=subprocess.DEVNULL,
     )
 
     subprocess.run(
-        f"mmseqs createsubdb {clsDB} {seqDB}_h {clsDB}rep_h", shell=True, check=True, stdout=subprocess.DEVNULL
+        f"mmseqs createsubdb {clsDB} {seqDB}_h {clsDB}rep_h",
+        shell=True,
+        check=True,
+        stdout=subprocess.DEVNULL,
     )
 
     subprocess.run(
@@ -298,7 +303,9 @@ def get_missing_seqs_as_fasta_using_tsv(db1, tsv, output):
     assert db1_lookup.is_file(), f"Can't find {db1_lookup}"
 
     # get headers in DB1 missing from DB2
-    db1_dict = Parsers.mmseqs.get_mmseqs_internal_ids_as_dict(db1_lookup)  # internal id : header
+    db1_dict = Parsers.mmseqs.get_mmseqs_internal_ids_as_dict(
+        db1_lookup
+    )  # internal id : header
     tsv_dict = Parsers.mmseqs.mmseqs_tsv_to_dict(tsv)  # member : representative
 
     missing_ids = db1_dict.values() - tsv_dict.keys()
diff --git a/PyIsland/FastaUtils/MSA.py b/PyIsland/FastaUtils/MSA.py
index eabc178..6cb1fe5 100644
--- a/PyIsland/FastaUtils/MSA.py
+++ b/PyIsland/FastaUtils/MSA.py
@@ -11,82 +11,84 @@
 from Bio.Align import substitution_matrices
 from Bio.SeqIO.FastaIO import SimpleFastaParser
 
+
 def get_tables_for_msa_filtering():
-    
-    #set a background frequency of AAs
+    # set a background frequency of AAs
     background_aa_freq = {
-                            "A" : 0.07422,
-                            "C" : 0.02469,
-                            "D" : 0.05363,
-                            "E" : 0.05431,
-                            "F" : 0.04742,
-                            "G" : 0.07415,
-                            "H" : 0.02621,
-                            "I" : 0.06792,
-                            "K" : 0.05816,
-                            "L" : 0.09891,
-                            "M" : 0.02499,
-                            "N" : 0.04465,
-                            "P" : 0.03854,
-                            "Q" : 0.03426,
-                            "R" : 0.05161,
-                            "S" : 0.05723,
-                            "T" : 0.05089,
-                            "V" : 0.07292,
-                            "W" : 0.01303,
-                            "Y" : 0.03228,
-                            "X" : 0.00001,
-                         }
-        
-    #get a blosum62 matrix
+        "A": 0.07422,
+        "C": 0.02469,
+        "D": 0.05363,
+        "E": 0.05431,
+        "F": 0.04742,
+        "G": 0.07415,
+        "H": 0.02621,
+        "I": 0.06792,
+        "K": 0.05816,
+        "L": 0.09891,
+        "M": 0.02499,
+        "N": 0.04465,
+        "P": 0.03854,
+        "Q": 0.03426,
+        "R": 0.05161,
+        "S": 0.05723,
+        "T": 0.05089,
+        "V": 0.07292,
+        "W": 0.01303,
+        "Y": 0.03228,
+        "X": 0.00001,
+    }
+
+    # get a blosum62 matrix
     blosum62 = substitution_matrices.load("BLOSUM62")
 
-    #set a expectation of AAs
-    #expectation_aa = {}
-    #for aa1 in background_aa_freq.keys():
+    # set a expectation of AAs
+    # expectation_aa = {}
+    # for aa1 in background_aa_freq.keys():
     #    expectation_aa[aa1] = 0
     #    for aa2 in background_aa_freq.keys():
     #        expectation_aa[aa1] = expectation_aa[aa1] + blosum62[aa1,aa2]*background_aa_freq[aa2]
-    
+
     expectation_aa = aa_freq_to_score(background_aa_freq)
 
     return background_aa_freq, blosum62, expectation_aa
 
+
 def aa_freq_to_score(col_aa_frq):
     """
     expects a dictionary of AA : frequency
     returns a dictionary of AA : score
-    
+
     score is sum of blosum62 scores of an index AA versus all other AAs
     """
-    #set all AA weights as zero
+    # set all AA weights as zero
     weights = {}
     for aa in col_aa_frq.keys():
         weights[aa] = 0
-        
-    #get a blosum62 matrix
+
+    # get a blosum62 matrix
     blosum62 = substitution_matrices.load("BLOSUM62")
-    
-    #assign weights to each AA
+
+    # assign weights to each AA
     for aa1 in col_aa_frq.keys():
         for aa2 in col_aa_frq.keys():
-            weights[aa1] = weights[aa1] + (blosum62[aa1,aa2] * col_aa_frq[aa2])
-            #print(f"AA1 is {aa1}, AA2 is {aa2}, frequency of AA2 is {col_aa_frq[aa2]}, blosum62 of {aa1}{aa2} is {blosum62[aa1,aa2]}, updating weight of AA1 to {weights[aa1]}")
-    
+            weights[aa1] = weights[aa1] + (blosum62[aa1, aa2] * col_aa_frq[aa2])
+            # print(f"AA1 is {aa1}, AA2 is {aa2}, frequency of AA2 is {col_aa_frq[aa2]}, blosum62 of {aa1}{aa2} is {blosum62[aa1,aa2]}, updating weight of AA1 to {weights[aa1]}")
+
     return weights
 
 
-def aln_to_df(aln, msa_format='fasta'):
+def aln_to_df(aln, msa_format="fasta"):
     """
     #deprecated
     convertsa  biopython alignment object into a pandas dataframe
     """
-    
+
     headers = [rec.id for rec in aln]
     seqs = [list(rec) for rec in aln]
-    df = pd.DataFrame.from_records(seqs, index = headers).astype('str')
+    df = pd.DataFrame.from_records(seqs, index=headers).astype("str")
     return df
 
+
 def fa_to_df(fa, is_msa=False):
     """
     Converts a fasta-formatted file to a pandas dataframe
@@ -98,173 +100,178 @@ def fa_to_df(fa, is_msa=False):
             headers.append(title)
             seqs.append(list(seq.upper()))
 
-        
-    df = pd.DataFrame.from_records(seqs, index = headers)
-    
-    #check to make sure the MSA is aligned properly
+    df = pd.DataFrame.from_records(seqs, index=headers)
+
+    # check to make sure the MSA is aligned properly
     if is_msa:
         assert df.isna().sum().sum() == 0, f"All sequences must be the same length"
-   
+
     return df
 
+
 def df_to_fa(df, output):
     """
     converts a pandas dataframe of sequences to a fasta-formatted file
     """
-    
-    #combine the columns into a single string
+
+    # combine the columns into a single string
     series = df.T.apply("".join)
-    
-    #write the output
-    with open(output, 'w') as o:
+
+    # write the output
+    with open(output, "w") as o:
         for header, seq in series.items():
-            print(f">{header}", seq, sep = '\n', file = o)
-            
+            print(f">{header}", seq, sep="\n", file=o)
+
+
 def get_gap_cols_from_msa(df, grcut=1):
     """
     -grcut = no more than r fraction of gaps
-    
+
     returns a list of columns that are more than r fraction of gaps
     """
-    #df = msa_to_df(msa)
+    # df = msa_to_df(msa)
     num_seqs, num_columns = df.shape
     EPSILON = 1e-6
 
-    #calculate the minimum number of non-NA values per column
+    # calculate the minimum number of non-NA values per column
     max_gaps_in_column = round((num_seqs * grcut) + EPSILON, 3)
 
-    #convert gaps to NA, count how many NAs per column
+    # convert gaps to NA, count how many NAs per column
     gap_count_series = df.replace("-", np.nan).isnull().sum()
 
-    #get columns where number of NAs is >= max_gaps
-    gap_col_list = gap_count_series.loc[lambda s: s >= max_gaps_in_column].index.tolist()
-    #df2 = df.loc[:, good_cols]
-    
+    # get columns where number of NAs is >= max_gaps
+    gap_col_list = gap_count_series.loc[
+        lambda s: s >= max_gaps_in_column
+    ].index.tolist()
+    # df2 = df.loc[:, good_cols]
+
     ##RO
-    
+
     return gap_col_list
 
+
 def calc_HH94(df, grswe=0.51):
     """
     - sequences have an initial weight of 1/nseq
     - Calculates the score of a position in an MSA as 1/rd
         r = # of dif characters (excluding gaps) in a column
-        d = number of times the character of the considered sequence 
+        d = number of times the character of the considered sequence
         and position appears in the considered alignment column
-    
+
     - columns in MSA that are >grswe fraction of gap positions are ignored
-    
+
     - the scores of a sequence's aligned positions are added to its weight
-    
+
     Returns a series object containing the sequence name and HH94 weight
     """
-    #df = msa_to_df(msa)
-    
-    #convert gaps to NA so they are not counted in the weights 
+    # df = msa_to_df(msa)
+
+    # convert gaps to NA so they are not counted in the weights
     df2 = df.replace("-", np.nan)
-    
-    #get a list of gap columns that are more than grswe fraction of gaps
+
+    # get a list of gap columns that are more than grswe fraction of gaps
     gap_cols = get_gap_cols_from_msa(df, grswe)
-    
-    #initial sequence weight is just 1 / nseq
+
+    # initial sequence weight is just 1 / nseq
     initial_weight = 1 / len(df)
 
-    
     colaa_weights_nest_dict = {}
     for column in df2.columns:
         if column not in gap_cols:
-        
-            #get a count of the frequency of each AA in the column
-            #by default, NAs (gaps) are not counted
+            # get a count of the frequency of each AA in the column
+            # by default, NAs (gaps) are not counted
             col_series = df2[[column]].value_counts()
 
-            #calculate the AA weights in the column
-            #the length of the series is the number of unique AAs
+            # calculate the AA weights in the column
+            # the length of the series is the number of unique AAs
             colaa_weights = round(1 / (col_series * len(col_series)), 3)
 
-            #store the column-specific results in a dict of dict
+            # store the column-specific results in a dict of dict
             colaa_weights_dict = colaa_weights.to_dict()
             colaa_weights_nest_dict[column] = colaa_weights_dict
 
-    #replace the AAs in the MSA with their column-specific weights
+    # replace the AAs in the MSA with their column-specific weights
     df3 = df2.replace(colaa_weights_nest_dict)
-    
-    #for each sequence, sum all the values and add the initial wieght
-    hh94_series = df3.sum(axis = 1).round(3) + initial_weight
-    
-    #normalize
+
+    # for each sequence, sum all the values and add the initial wieght
+    hh94_series = df3.sum(axis=1).round(3) + initial_weight
+
+    # normalize
     hh94_mean = hh94_series.mean()
     if hh94_mean < 0:
         hh94_mean = 1
     hh94_norm = round(hh94_series / hh94_series.mean(), 4)
-    
-    #return a dictionary containing the sequence and its normalized weight
+
+    # return a dictionary containing the sequence and its normalized weight
     return hh94_norm.to_dict()
-    
+
+
 def get_col_aa_frequencies(column, hh94_scores, background_aa_freq):
     """
     Accepts a dictionary or pandas series containing seq : aligned char
-    
+
     calculates the column=specific AA weights, given HH94 scores of sequences
-    
+
     returns a dictionary of AA : weight
     """
-    
-    #set all AA frequencies as zero
+
+    # set all AA frequencies as zero
     aafreq = {}
     for aa in background_aa_freq.keys():
         aafreq[aa] = 0
-    
-    #set gap score as zero
+
+    # set gap score as zero
     ngap = 0
-    
-    for seq,char in column.items():
+
+    for seq, char in column.items():
         if char == "-":
-            
-            #increment the background frequencies of all AAs
+            # increment the background frequencies of all AAs
             for aa, score in aafreq.items():
-                aafreq[aa] = aafreq[aa] + background_aa_freq[aa]*hh94_scores[seq]
-            
-            #increment the gap frequency 
+                aafreq[aa] = aafreq[aa] + background_aa_freq[aa] * hh94_scores[seq]
+
+            # increment the gap frequency
             ngap += hh94_scores[seq]
         else:
             try:
                 aafreq[char] = aafreq[char] + hh94_scores[seq]
             except KeyError:
                 pass
-    
-    #add in ngaps to the dict
+
+    # add in ngaps to the dict
     aafreq["-"] = ngap
-    
+
     return aafreq
 
+
 def get_best_aa_and_score(aascores):
     """
     Expects a dictionary of AA : score
     Selects the AA with the highest score
-    
+
     Returns AA and score
     """
-    
-    #get top AA and score, as a tuple
-    bestaa_tuple = max(aascores.items(), key = lambda k : k[1])
-    
-    return bestaa_tuple[0],bestaa_tuple[1]
-    
+
+    # get top AA and score, as a tuple
+    bestaa_tuple = max(aascores.items(), key=lambda k: k[1])
+
+    return bestaa_tuple[0], bestaa_tuple[1]
+
+
 def calc_homogeneity(aa, score, nseqs, expectaascores, blosum62):
     """
     calculates the homogeneity of a column in an MSA
     homogeneity ranges from 0-1
     """
-    
-    homogeneity = (score/nseqs - expectaascores[aa]) / (blosum62[aa, aa] - expectaascores[aa])
+
+    homogeneity = (score / nseqs - expectaascores[aa]) / (
+        blosum62[aa, aa] - expectaascores[aa]
+    )
     return round(homogeneity, 4)
 
+
 def assign_consensus(bestaa, grcon, homo, hocon, ngaps, nseqs, ambiguousAA="X"):
-    """
-    
-    """
-    #Default consensus is gap
+    """ """
+    # Default consensus is gap
     caa = "-"
 
     if ngaps <= nseqs * grcon:
@@ -272,13 +279,23 @@ def assign_consensus(bestaa, grcon, homo, hocon, ngaps, nseqs, ambiguousAA="X"):
             caa = bestaa
         else:
             caa = ambiguousAA
-            #caa = bestaa
+            # caa = bestaa
 
     return caa
 
-def filter_msa(msa, msa_out, grcut=1.1, hocut=-100, gcon=0.499, hcon=0.01, conplus=False, grswe=0.51):
+
+def filter_msa(
+    msa,
+    msa_out,
+    grcut=1.1,
+    hocut=-100,
+    gcon=0.499,
+    hcon=0.01,
+    conplus=False,
+    grswe=0.51,
+):
     """
-    MSA filtering + trimming, as described in 
+    MSA filtering + trimming, as described in
     Esterman et. al (2021) https://doi.org/10.1093/ve/veab015
     """
 
@@ -287,73 +304,72 @@ def filter_msa(msa, msa_out, grcut=1.1, hocut=-100, gcon=0.499, hcon=0.01, conpl
     df = fa_to_df(msa, is_msa=True)
 
     if grcut <= 1:
-        #print(f'getting a list of positions that are > {grcut * 100}% gaps')
-        
-        #get a list of gap positions
+        # print(f'getting a list of positions that are > {grcut * 100}% gaps')
+
+        # get a list of gap positions
         gapcols = get_gap_cols_from_msa(df, grcut)
-        
+
         bad_cols += gapcols
-        
+
     if hocut >= 0 or conplus:
-        
-        #then we need to iterate through each position in the MSA
-        #print(f'iterating through the alignment')
-        
-        #consensus seq is empty
+        # then we need to iterate through each position in the MSA
+        # print(f'iterating through the alignment')
+
+        # consensus seq is empty
         conseq = []
-        
-        #get the necessary tables
-        background_aa_freq,blosum62,expectedAAscores = get_tables_for_msa_filtering()
-        
-        #convert the msa to a dataframe 
+
+        # get the necessary tables
+        background_aa_freq, blosum62, expectedAAscores = get_tables_for_msa_filtering()
+
+        # convert the msa to a dataframe
         nseqs, ncols = df.shape
-        
-        #get the HH94 scores of the sequences
+
+        # get the HH94 scores of the sequences
         hh94_scores = calc_HH94(df, grswe)
-        
-        for col in range(0,ncols):
-            
-            #get the column
+
+        for col in range(0, ncols):
+            # get the column
             column = df[col]
 
-            #calculate the AA frequencies
+            # calculate the AA frequencies
             col_aa_frq = get_col_aa_frequencies(column, hh94_scores, background_aa_freq)
 
-            #the last item in the dictionary is the gap frequencies
-            #record it and remove from the dictionary,
-            #otherwise the next fxn throws an error
+            # the last item in the dictionary is the gap frequencies
+            # record it and remove from the dictionary,
+            # otherwise the next fxn throws an error
             ngaps = col_aa_frq.popitem()[1]
 
-            #convert the frequencies to scores
+            # convert the frequencies to scores
             col_aa_scores = aa_freq_to_score(col_aa_frq)
 
-            #get the top scoring AA
+            # get the top scoring AA
             bestaa, bestaascore = get_best_aa_and_score(col_aa_scores)
-                
-            #assess the homogeneity of the AA, if the bestscoring AA is above expectation
-            #if bestaascore/nseqs > expectedAAscores[bestaa]:
-            homo = calc_homogeneity(bestaa, bestaascore, nseqs, expectedAAscores, blosum62)
-            
-            #record columns that are below the homogeneity threshold
+
+            # assess the homogeneity of the AA, if the bestscoring AA is above expectation
+            # if bestaascore/nseqs > expectedAAscores[bestaa]:
+            homo = calc_homogeneity(
+                bestaa, bestaascore, nseqs, expectedAAscores, blosum62
+            )
+
+            # record columns that are below the homogeneity threshold
             if homo < hocut:
                 bad_cols.append(col)
 
-            #assign the consensus
+            # assign the consensus
             con = assign_consensus(bestaa, gcon, homo, hcon, ngaps, nseqs)
             conseq.append(con)
-        
+
         if conplus:
-            
-            #make a dataframe from the consensus
-            df_c = pd.DataFrame.from_records([conseq], index = ["CONSENSUS"])
-            
-            #update the original MSA datafarme
-            df = pd.concat([df_c,df])
-        
-    #remove gap/nonhomogenous columns
+            # make a dataframe from the consensus
+            df_c = pd.DataFrame.from_records([conseq], index=["CONSENSUS"])
+
+            # update the original MSA datafarme
+            df = pd.concat([df_c, df])
+
+    # remove gap/nonhomogenous columns
     if len(bad_cols) > 0:
-        df = df.drop(columns = bad_cols)
+        df = df.drop(columns=bad_cols)
 
-    #write the output
-    with open(msa_out, 'w') as f:
-        df_to_fa(df, msa_out)
\ No newline at end of file
+    # write the output
+    with open(msa_out, "w") as f:
+        df_to_fa(df, msa_out)
diff --git a/PyIsland/FastaUtils/__init__.py b/PyIsland/FastaUtils/__init__.py
index 97057c9..f646ca5 100644
--- a/PyIsland/FastaUtils/__init__.py
+++ b/PyIsland/FastaUtils/__init__.py
@@ -104,42 +104,41 @@ def fa_strict(fasta_in, fasta_out, min_len=0):
     with open(fasta_out, "w") as o:
         SeqIO.write(seq_records, fasta_out, "fasta")
         print(f"wrote {len(seq_records)}/{i} sequences")
-        
-def get_subseq_coords(query,seq_record):
+
+
+def get_subseq_coords(query, seq_record):
     """
     reports 1-based start/stop/strand of a query subsequence in a subject seq
     query subsequence is a string
     subject is BioPython SeqRecord
-    
+
     Returns a tuple of start,stop, strand
     Start = -1 if the subsequence is not found
     """
-    
+
     qlen = len(query)
-    
-    
-    #search the positive strand
+
+    # search the positive strand
     start = seq_record.seq.find(query)
     stop = start + qlen
     strand = "1"
-    
-    #if no hit, search the RC
+
+    # if no hit, search the RC
     if start == -1:
-    
-        #reverse complement the query
+        # reverse complement the query
         query = str(Seq(query).reverse_complement())
 
-        #search
+        # search
         start = seq_record.seq.find(query)
         stop = start + qlen
         strand = "-1"
-    
-    #no hits on either strand
+
+    # no hits on either strand
     if start == -1:
         start = -2
         stop = -1
         print(f"could not find the qseq on either + or - strand")
-    
-    #update to 1-based
-    start +=1
-    return start,stop,strand
+
+    # update to 1-based
+    start += 1
+    return start, stop, strand
diff --git a/PyIsland/GCP/bigquery.py b/PyIsland/GCP/bigquery.py
index 9b70b0f..c0daee7 100644
--- a/PyIsland/GCP/bigquery.py
+++ b/PyIsland/GCP/bigquery.py
@@ -5,6 +5,7 @@
 from google.cloud import bigquery
 from atxlib import GCP
 
+
 def upload_gff_to_bq(gff, table_id):
     """
     Accepts a GFF file and a BQ location
@@ -126,10 +127,11 @@ def csv2fasta(csv, output):
                 i += 1
     return i
 
-def write_table_to_gcs(input_table:str, uri:str, location='us-central1'):
+
+def write_table_to_gcs(input_table: str, uri: str, location="us-central1"):
     client = bigquery.Client()
     job_config = bigquery.ExtractJobConfig()
-    #job_config.destination_format = bigquery.DestinationFormat.CSV
+    # job_config.destination_format = bigquery.DestinationFormat.CSV
     job_config.print_header = False
 
     extract_job = client.extract_table(
@@ -232,18 +234,21 @@ def get_seqs_from_bq(
         print(f"Deleting local output {output_blob_name}")
         Path(output_blob_name).unlink()
 
-def blast_rep_to_member(seq_table:str, clustering_table:str, blast_table:str, output_table:str):
+
+def blast_rep_to_member(
+    seq_table: str, clustering_table: str, blast_table: str, output_table: str
+):
     """
     Accepts a bigquery table of sequences, a bigquery table of clustering information, and a bigquery table of blast hits
     and returns a table of sequences that have a blast hit to a representative sequence
     """
     client = bigquery.Client()
 
-    #extract seqs to table
+    # extract seqs to table
     job_config = bigquery.QueryJobConfig(
-        dry_run = False,
-        destination = output_table,
-        write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE,
+        dry_run=False,
+        destination=output_table,
+        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
     )
 
     sql = f"""
@@ -265,19 +270,25 @@ def blast_rep_to_member(seq_table:str, clustering_table:str, blast_table:str, ou
     destination_table = client.get_table(output_table)
     print("Loaded {} rows.".format(destination_table.num_rows))
 
-def get_seqs_from_bq_as_fasta(seq_table:str, output_uri:str, keep_local=True):
+
+def get_seqs_from_bq_as_fasta(seq_table: str, output_uri: str, keep_local=True):
     """
-    Accepts a bigquery table with two rows "protein_id" and "sequence" and returns a fasta file 
+    Accepts a bigquery table with two rows "protein_id" and "sequence" and returns a fasta file
     """
 
-    #set paths
-    bucket_name, output_blob_path = GCP.storage.extract_bucketname_and_blob_path_from_uri(
-        output_uri
-    )
-    output_blob_name = output_blob_path.split("/")[-1]                      # e.g. 'output.faa'
-    output_blob_dir = "/".join(output_blob_path.split("/")[:-1]) + "/"      # e.g. 'path/to/folder/'
-    tmp_blob_dir = output_blob_dir + "tmp_blobs/"                           # e.g. 'path/to/folder/tmp_blobs/'
-    tmp_blobs = "gs://" + bucket_name + "/" + tmp_blob_dir + "split*"       #e.g. 'gs://bucket/path/to/folder/tmp_blobs/split*' 
+    # set paths
+    (
+        bucket_name,
+        output_blob_path,
+    ) = GCP.storage.extract_bucketname_and_blob_path_from_uri(output_uri)
+    output_blob_name = output_blob_path.split("/")[-1]  # e.g. 'output.faa'
+    output_blob_dir = (
+        "/".join(output_blob_path.split("/")[:-1]) + "/"
+    )  # e.g. 'path/to/folder/'
+    tmp_blob_dir = output_blob_dir + "tmp_blobs/"  # e.g. 'path/to/folder/tmp_blobs/'
+    tmp_blobs = (
+        "gs://" + bucket_name + "/" + tmp_blob_dir + "split*"
+    )  # e.g. 'gs://bucket/path/to/folder/tmp_blobs/split*'
 
     print(f"Writing {seq_table} to {tmp_blobs}")
     write_table_to_gcs(seq_table, tmp_blobs)
@@ -294,7 +305,7 @@ def get_seqs_from_bq_as_fasta(seq_table:str, output_uri:str, keep_local=True):
     num_seqs = 0
     with open(output_blob_name, "w") as outfile:
         for split in Path(".").glob("split*"):
-            #print(f"Converting {split} to {split}.faa")
+            # print(f"Converting {split} to {split}.faa")
             i = csv2fasta(split, split.with_suffix(".faa"))
             num_seqs += i
             with open(split.with_suffix(".faa")) as infile:
@@ -313,9 +324,9 @@ def get_seqs_from_bq_as_fasta(seq_table:str, output_uri:str, keep_local=True):
 
     if not keep_local:
         Path(output_blob_name).unlink()
-    
-    #remove temporary CSV files from bucket
+
+    # remove temporary CSV files from bucket
     print(f"Removing {tmp_blobs} from bucket")
     blobs = GCP.storage.list_blobs_with_prefix(bucket_name, tmp_blob_dir)
     for blob in blobs:
-        GCP.storage.delete_blob(bucket_name, blob.name)
\ No newline at end of file
+        GCP.storage.delete_blob(bucket_name, blob.name)
diff --git a/PyIsland/GCP/storage.py b/PyIsland/GCP/storage.py
index 0fc56ac..ca73e51 100644
--- a/PyIsland/GCP/storage.py
+++ b/PyIsland/GCP/storage.py
@@ -12,7 +12,7 @@ def get_blobs_in_bucket_as_set(bucket, prefix):
     Accepts strings pointing to a bucket name and a "prefix" (i.e., a "folder")
     The prefix should end with a "/"
     Returns a python set of "filenames"
-    
+
     """
     client = storage.Client()
     bucket = client.get_bucket(bucket)
@@ -21,8 +21,8 @@ def get_blobs_in_bucket_as_set(bucket, prefix):
 
     blob_set = set()
     for blob in blobs:
-        f = Path(blob.name)  #.e.g, path/to/file.txt
-        blob_set.add(f.name) #e.g., file.txt
+        f = Path(blob.name)  # .e.g, path/to/file.txt
+        blob_set.add(f.name)  # e.g., file.txt
 
     return blob_set
 
@@ -75,6 +75,7 @@ def delete_blob(bucket_name, blob_name):
     blob = bucket.blob(blob_name)
     blob.delete()
 
+
 def list_blobs_with_prefix(bucket_name, prefix, delimiter="/"):
     """Lists all the blobs in the bucket."""
     # bucket_name = "your-bucket-name"
diff --git a/PyIsland/Islands/__init__.py b/PyIsland/Islands/__init__.py
index 27df4bf..8229f38 100644
--- a/PyIsland/Islands/__init__.py
+++ b/PyIsland/Islands/__init__.py
@@ -244,8 +244,8 @@ def select_neighborhood(gff: str, seedgff: str, window: int, check=True):
             df_neighborhood = df.query(
                 "contig == @contig and start <= @stop and stop >= @start"
             )
-            
-            #label island
+
+            # label island
             isl_start = str(df_neighborhood.start.min())
             isl_stop = str(df_neighborhood.stop.max())
             df_neighborhood["island"] = str(contig) + ":" + isl_start + "-" + isl_stop
diff --git a/PyIsland/Parsers/blast.py b/PyIsland/Parsers/blast.py
index a9f95a8..a7da080 100644
--- a/PyIsland/Parsers/blast.py
+++ b/PyIsland/Parsers/blast.py
@@ -129,12 +129,12 @@ def remove_overlapping_hits(df, threshold):
 
     def select_nonoverlapping_hits(df, threshold=0.33):
         df_list = []
-        
-        #qstart must be > qend
-        #otherwise, overlap can be falsely calculated
-        df[["qstart","qend"]] = (df.loc[:,["qstart","qend"]]
-                                   .mask(df.qstart > df.qend, df[["qend","qstart"]], axis = 'index')
-                                )
+
+        # qstart must be > qend
+        # otherwise, overlap can be falsely calculated
+        df[["qstart", "qend"]] = df.loc[:, ["qstart", "qend"]].mask(
+            df.qstart > df.qend, df[["qend", "qstart"]], axis="index"
+        )
 
         for name, group in df.groupby("qaccver"):
             df2 = overlap_filtering.remove_overlapping_hits(group, threshold)
diff --git a/PyIsland/Parsers/hhsuite.py b/PyIsland/Parsers/hhsuite.py
index 79af895..67a2cdd 100644
--- a/PyIsland/Parsers/hhsuite.py
+++ b/PyIsland/Parsers/hhsuite.py
@@ -89,6 +89,3 @@ def normalize_scores_by_selfhit(df):
     df["score_norm"] = -np.log(df["score"].astype("float") / selfscore)
 
     return df
-
-
-
diff --git a/PyIsland/Parsers/mmseqs.py b/PyIsland/Parsers/mmseqs.py
index 45bb843..d5423b9 100644
--- a/PyIsland/Parsers/mmseqs.py
+++ b/PyIsland/Parsers/mmseqs.py
@@ -146,21 +146,22 @@ def mmseqs_lookup_to_dict(lookup):
 
 
 def mmseqs_linear_tsv_to_df(tsv):
-    #df_cls = pd.read_csv(tsv, sep="\t", names=["cluster", "members"])
+    # df_cls = pd.read_csv(tsv, sep="\t", names=["cluster", "members"])
     df_cls = mmseqs_tsv_to_df(tsv)
-    
+
     # Explode the members column
     df_cls["members"] = df_cls.members.str.split(" ").to_list()
     df_cls = df_cls.explode("members")
 
     return df_cls
 
+
 def mmseqs_tsv_to_df(tsv):
     df_cls = pd.read_csv(tsv, sep="\t", names=["cluster", "members"])
 
     ## Explode the members column
-    #df_cls["members"] = df_cls.members.str.split(" ").to_list()
-    #df_cls = df_cls.explode("members")
+    # df_cls["members"] = df_cls.members.str.split(" ").to_list()
+    # df_cls = df_cls.explode("members")
 
     return df_cls
 
@@ -178,7 +179,8 @@ def cls2fa(fasta, member_to_cluster, outfolder):
 
                 with open(cluster_fasta_name, "a") as fasta_file:
                     fasta_file.write(seq.format("fasta"))
-                    
+
+
 def get_mmseqs_internal_ids_as_set(index):
     mmseqs_internal_ids = set()
     with open(index) as infile:
diff --git a/PyIsland/tests/__init__.py b/PyIsland/tests/__init__.py
index f585fa6..eb5c095 100644
--- a/PyIsland/tests/__init__.py
+++ b/PyIsland/tests/__init__.py
@@ -1,9 +1,10 @@
 import hashlib
 from functools import partial
 
+
 def md5sum(filename):
-    with open(filename, mode='rb') as f:
+    with open(filename, mode="rb") as f:
         d = hashlib.md5()
-        for buf in iter(partial(f.read, 128), b''):
+        for buf in iter(partial(f.read, 128), b""):
             d.update(buf)
-    return d.hexdigest()
\ No newline at end of file
+    return d.hexdigest()
diff --git a/PyIsland/tests/fastautils/test_fastautils.py b/PyIsland/tests/fastautils/test_fastautils.py
index cbeddb0..c152589 100644
--- a/PyIsland/tests/fastautils/test_fastautils.py
+++ b/PyIsland/tests/fastautils/test_fastautils.py
@@ -3,79 +3,77 @@
 from PyIsland import tests
 import pytest
 
+
 @pytest.fixture
 def get_test_fasta():
-    return Path('PyIsland/tests/example.fasta')
+    return Path("PyIsland/tests/example.fasta")
+
 
 def test_subseq_fasta_biopython(get_test_fasta):
-    seqs_to_get = ['seq1']
-    
-    outfasta = Path('PyIsland/tests/fastautils/subseq_fasta_biopython.fasta')
-    expected_output = Path('PyIsland/tests/fastautils/subseq_fasta_expected_output.fasta')
-    
-    FastaUtils.subseq_fasta_biopython(seqs_to_get,
-                                      get_test_fasta,
-                                      outfasta)
-    
+    seqs_to_get = ["seq1"]
+
+    outfasta = Path("PyIsland/tests/fastautils/subseq_fasta_biopython.fasta")
+    expected_output = Path(
+        "PyIsland/tests/fastautils/subseq_fasta_expected_output.fasta"
+    )
+
+    FastaUtils.subseq_fasta_biopython(seqs_to_get, get_test_fasta, outfasta)
+
     output_hash = tests.md5sum(outfasta)
     expected_output_hash = tests.md5sum(expected_output)
-    
+
     assert output_hash == expected_output_hash
     outfasta.unlink()
-    
+
+
 def test_slice_fasta_biopython(get_test_fasta):
-    seqs_to_get = 'seq2'
+    seqs_to_get = "seq2"
     start = 4
     stop = 9
-    outfasta = Path('PyIsland/tests/fastautils/slice_fasta_biopython.fasta')
-    expected_output = Path('PyIsland/tests/fastautils/slice_fasta_expected_output.fasta')
-    
-    FastaUtils.slice_fasta_biopython(seqs_to_get,
-                                     start,
-                                     stop,
-                                     get_test_fasta,
-                                     outfasta
-                                    )
-    
+    outfasta = Path("PyIsland/tests/fastautils/slice_fasta_biopython.fasta")
+    expected_output = Path(
+        "PyIsland/tests/fastautils/slice_fasta_expected_output.fasta"
+    )
+
+    FastaUtils.slice_fasta_biopython(seqs_to_get, start, stop, get_test_fasta, outfasta)
+
     output_hash = tests.md5sum(outfasta)
     expected_output_hash = tests.md5sum(expected_output)
-    
+
     assert output_hash == expected_output_hash
     outfasta.unlink()
-    
+
+
 def test_fa_strict(get_test_fasta):
-    
-    outfasta = Path('PyIsland/tests/fastautils/fa_strict.fasta')
-    expected_output = Path('PyIsland/tests/fastautils/fa_strict_expected_output.fasta')
-    
-    FastaUtils.fa_strict(get_test_fasta,
-                         outfasta,
-                         min_len=5
-                        )
-    
+    outfasta = Path("PyIsland/tests/fastautils/fa_strict.fasta")
+    expected_output = Path("PyIsland/tests/fastautils/fa_strict_expected_output.fasta")
+
+    FastaUtils.fa_strict(get_test_fasta, outfasta, min_len=5)
+
     output_hash = tests.md5sum(outfasta)
     expected_output_hash = tests.md5sum(expected_output)
-    
+
     assert output_hash == expected_output_hash
     outfasta.unlink()
-    
+
+
 def test_count_seqs_in_fasta(get_test_fasta):
-    
     num_seqs = FastaUtils.count_seqs_in_fasta(get_test_fasta)
-    
+
     assert num_seqs == 6
-    
+
+
 def test_deduplicate_fasta(get_test_fasta):
-    
-    outfasta = Path('PyIsland/tests/fastautils/dedup.fasta')
-    expected_output = Path('PyIsland/tests/fastautils/dedup_expected_output.fasta')
-    
-    FastaUtils.deduplicate_fasta(get_test_fasta,
-                                 outfasta,
-                                )
-    
+    outfasta = Path("PyIsland/tests/fastautils/dedup.fasta")
+    expected_output = Path("PyIsland/tests/fastautils/dedup_expected_output.fasta")
+
+    FastaUtils.deduplicate_fasta(
+        get_test_fasta,
+        outfasta,
+    )
+
     output_hash = tests.md5sum(outfasta)
     expected_output_hash = tests.md5sum(expected_output)
-    
+
     assert output_hash == expected_output_hash
-    outfasta.unlink()
\ No newline at end of file
+    outfasta.unlink()
diff --git a/PyIsland/tests/parsers/test_parsers.py b/PyIsland/tests/parsers/test_parsers.py
index eb6c57d..de576ca 100644
--- a/PyIsland/tests/parsers/test_parsers.py
+++ b/PyIsland/tests/parsers/test_parsers.py
@@ -3,87 +3,90 @@
 from PyIsland import tests
 import pytest
 
+
 @pytest.fixture
 def get_test_gff():
-    return Path('PyIsland/tests/lambda.gff')
+    return Path("PyIsland/tests/lambda.gff")
+
 
 @pytest.fixture
 def get_test_gbk():
-    return Path('PyIsland/tests/lambda.gbk')
+    return Path("PyIsland/tests/lambda.gbk")
+
 
 @pytest.fixture
 def get_test_fasta():
-    return Path('PyIsland/tests/lambda.fna')
+    return Path("PyIsland/tests/lambda.fna")
+
 
 @pytest.fixture
 def get_test_orfs():
-    return Path('PyIsland/tests/lambda.faa')
+    return Path("PyIsland/tests/lambda.faa")
+
 
 def test_gff_to_df(get_test_gff):
-    
     df = Parsers.gff_to_df(get_test_gff)
-    
-    #check I can get a contig using the .query method and a string
+
+    # check I can get a contig using the .query method and a string
     assert not df.query('contig == "NC_001416.1"').empty
-    
-#def test_gff_to_df_with_prodigal_gff():
-    
+
+
+# def test_gff_to_df_with_prodigal_gff():
+
 #    df = Parsers.gff_to_df('PyIsland/tests/example_prodigal.gff')
-    
-    #check I can get a contig using the .query method and a string
+
+# check I can get a contig using the .query method and a string
 #    assert not df.query('contig == "10007077.1"').empty
-    
+
+
 def test_gbk_to_df(get_test_gbk):
-    
     df = Parsers.genbank_to_df(get_test_gbk)
-    
+
     features = df.feature_type.unique().tolist()
-    expected_features = ['source',
-                         'gene',
-                         'CDS',
-                         'sig_peptide',
-                         'mat_peptide',
-                         'mRNA',
-                         'variation',
-                         'misc_recomb',
-                         'misc_binding',
-                         'regulatory',
-                         'misc_feature',
-                         'unsure'
-                        ]
-    
-    
-    #check I get all features
+    expected_features = [
+        "source",
+        "gene",
+        "CDS",
+        "sig_peptide",
+        "mat_peptide",
+        "mRNA",
+        "variation",
+        "misc_recomb",
+        "misc_binding",
+        "regulatory",
+        "misc_feature",
+        "unsure",
+    ]
+
+    # check I get all features
     assert features == expected_features
-    
-    #check I labelled protein_ids
+
+    # check I labelled protein_ids
     assert df.protein_id.notna().sum() == 73
-    
-    #check products are labelled
-    assert len(df.query('product.notna()')) == 73
-    
 
-                
+    # check products are labelled
+    assert len(df.query("product.notna()")) == 73
+
+
 def _test_gff_to_df(get_test_gff):
-    seqs_to_get = ['seq1']
-    
-    outfasta = Path('PyIsland/tests/fastautils/subseq_fasta_biopython.fasta')
-    
+    seqs_to_get = ["seq1"]
+
+    outfasta = Path("PyIsland/tests/fastautils/subseq_fasta_biopython.fasta")
+
     df = Parsers.gff_to_df(get_test_gff)
-    
+
     output_hash = tests.md5sum(outfasta)
     expected_output_hash = tests.md5sum(expected_output)
-    
+
     assert output_hash == expected_output_hash
     outfasta.unlink()
-    
-    
+
+
 def test_prodigal_gff_to_df(get_test_gff):
-    
-    df = Parsers.prodigal.prodigal_gff_to_df('PyIsland/tests/example_prodigal.gff')
-    
-    #check I can get a contig using the .query method and a string
+    df = Parsers.prodigal.prodigal_gff_to_df("PyIsland/tests/example_prodigal.gff")
+
+    # check I can get a contig using the .query method and a string
     assert not df.query('contig == "10007077.1"').empty
-    
-    #check the protein_id was parsed
-    assert df.protein_id.isna().sum() == 0
\ No newline at end of file
+
+    # check the protein_id was parsed
+    assert df.protein_id.isna().sum() == 0
diff --git a/docs/conf.py b/docs/conf.py
index 210b332..22c4ded 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -29,28 +29,28 @@
 extensions = []
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The encoding of source files.
 # source_encoding = 'utf-8-sig'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = u'iscb_screen'
+project = "iscb_screen"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = '0.1'
+version = "0.1"
 # The full version, including alpha/beta/rc tags.
-release = '0.1'
+release = "0.1"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -64,7 +64,7 @@
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
 
 # The reST default role (used for this markup: `text`) to use for all documents.
 # default_role = None
@@ -81,7 +81,7 @@
 # show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
 # modindex_common_prefix = []
@@ -91,7 +91,7 @@
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'default'
+html_theme = "default"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
@@ -120,7 +120,7 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
@@ -164,7 +164,7 @@
 # html_file_suffix = None
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'iscb_screendoc'
+htmlhelp_basename = "iscb_screendoc"
 
 
 # -- Options for LaTeX output --------------------------------------------------
@@ -172,10 +172,8 @@
 latex_elements = {
     # The paper size ('letterpaper' or 'a4paper').
     # 'papersize': 'letterpaper',
-
     # The font size ('10pt', '11pt' or '12pt').
     # 'pointsize': '10pt',
-
     # Additional stuff for the LaTeX preamble.
     # 'preamble': '',
 }
@@ -183,10 +181,7 @@
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title, author, documentclass [howto/manual]).
 latex_documents = [
-    ('index',
-     'iscb_screen.tex',
-     u'iscb_screen Documentation',
-     u"sean benler", 'manual'),
+    ("index", "iscb_screen.tex", "iscb_screen Documentation", "sean benler", "manual"),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
@@ -214,10 +209,7 @@
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    ('index', 'iscb_screen', u'iscb_screen Documentation',
-     [u"sean benler"], 1)
-]
+man_pages = [("index", "iscb_screen", "iscb_screen Documentation", ["sean benler"], 1)]
 
 # If true, show URL addresses after external links.
 # man_show_urls = False
@@ -229,9 +221,15 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    ('index', 'iscb_screen', u'iscb_screen Documentation',
-     u"sean benler", 'iscb_screen',
-     'Finding IscB/Cas9 in genomic databases', 'Miscellaneous'),
+    (
+        "index",
+        "iscb_screen",
+        "iscb_screen Documentation",
+        "sean benler",
+        "iscb_screen",
+        "Finding IscB/Cas9 in genomic databases",
+        "Miscellaneous",
+    ),
 ]
 
 # Documents to append as an appendix to all manuals.
diff --git a/setup.py b/setup.py
index 729836b..fdcbe91 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,19 @@
 import glob
 from setuptools import find_packages, setup
 
-requirements = [req.strip() for req in open('requirements.txt', 'r').readlines() if not req.startswith('#')]
+requirements = [
+    req.strip()
+    for req in open("requirements.txt", "r").readlines()
+    if not req.startswith("#")
+]
 
 setup(
-    name='PyIsland',
-    scripts = [script for script in glob.glob('bin/*')],
+    name="PyIsland",
+    scripts=[script for script in glob.glob("bin/*")],
     packages=find_packages(),
-    install_requires = requirements,
-    version='0.0.1',
-    description='Libary for comparative genomics',
-    author='SMB',
-    license='',
+    install_requires=requirements,
+    version="0.0.1",
+    description="Libary for comparative genomics",
+    author="SMB",
+    license="",
 )
diff --git a/test_environment.py b/test_environment.py
index d0ac4a7..5381850 100644
--- a/test_environment.py
+++ b/test_environment.py
@@ -10,16 +10,17 @@ def main():
     elif REQUIRED_PYTHON == "python3":
         required_major = 3
     else:
-        raise ValueError("Unrecognized python interpreter: {}".format(
-            REQUIRED_PYTHON))
+        raise ValueError("Unrecognized python interpreter: {}".format(REQUIRED_PYTHON))
 
     if system_major != required_major:
         raise TypeError(
             "This project requires Python {}. Found: Python {}".format(
-                required_major, sys.version))
+                required_major, sys.version
+            )
+        )
     else:
         print(">>> Development environment passes all tests!")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()