From 921d35b1fed9432a43c9e218a4b15475633b44aa Mon Sep 17 00:00:00 2001 From: michtrofimov <92677906+michtrofimov@users.noreply.github.com> Date: Tue, 26 Sep 2023 18:40:21 +0300 Subject: [PATCH 01/25] Create folder for work --- HW4_Trofimov/folder_file | 1 + 1 file changed, 1 insertion(+) create mode 100644 HW4_Trofimov/folder_file diff --git a/HW4_Trofimov/folder_file b/HW4_Trofimov/folder_file new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/HW4_Trofimov/folder_file @@ -0,0 +1 @@ + From 49a006a9de21165a7d5065fa59f0b779bc30315d Mon Sep 17 00:00:00 2001 From: michtrofimov <92677906+michtrofimov@users.noreply.github.com> Date: Tue, 26 Sep 2023 18:41:00 +0300 Subject: [PATCH 02/25] Delete HW4_Trofimov/folder_file --- HW4_Trofimov/folder_file | 1 - 1 file changed, 1 deletion(-) delete mode 100644 HW4_Trofimov/folder_file diff --git a/HW4_Trofimov/folder_file b/HW4_Trofimov/folder_file deleted file mode 100644 index 8b13789..0000000 --- a/HW4_Trofimov/folder_file +++ /dev/null @@ -1 +0,0 @@ - From b8a58af1b90cb7b0693407556648311c88afda37 Mon Sep 17 00:00:00 2001 From: michtrofimov <92677906+michtrofimov@users.noreply.github.com> Date: Tue, 26 Sep 2023 18:43:00 +0300 Subject: [PATCH 03/25] Create working folder --- HW4_Trofimov/test_file | 1 + 1 file changed, 1 insertion(+) create mode 100644 HW4_Trofimov/test_file diff --git a/HW4_Trofimov/test_file b/HW4_Trofimov/test_file new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/HW4_Trofimov/test_file @@ -0,0 +1 @@ + From 53527e62d4b492bc87aebc3c37fa9cb3575540f6 Mon Sep 17 00:00:00 2001 From: Michil Trofimov Date: Tue, 26 Sep 2023 18:51:23 +0300 Subject: [PATCH 04/25] Rename main script --- HW4_Trofimov/{test_file => das_protein_tools.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename HW4_Trofimov/{test_file => das_protein_tools.py} (100%) diff --git a/HW4_Trofimov/test_file b/HW4_Trofimov/das_protein_tools.py similarity index 100% rename from HW4_Trofimov/test_file rename to 
HW4_Trofimov/das_protein_tools.py From 2e0b8263a83359a4fd0b755cdb40089aadab2796 Mon Sep 17 00:00:00 2001 From: Michil Trofimov Date: Wed, 27 Sep 2023 16:09:07 +0300 Subject: [PATCH 05/25] Add function for isoelectric point calculation --- HW4_Trofimov/das_protein_tools.py | 123 ++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index 8b13789..5a340cc 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -1 +1,124 @@ +def calculate_pI( + sequence: str, + pKa_values: dict = { + "D": 3.86, # Aspartic acid (COOH side chain) + "E": 4.25, # Glutamic acid (COOH side chain) + "C": 8.33, # Cysteine (R-SH) + "Y": 10.46, # Tyrosine (phenolic OH) + "H": 6.0, # Histidine (imidazole group) + "K": 10.67, # Lysine (amino group) + "R": 12.48, # Arginine (guanidinium group) + "N": 3.22, # Asparagine (amino group) + "Q": 3.65, # Glutamine (amino group) + "T": 2.95, # Threonine (amino group) + "S": 2.19, # Serine (hydroxyl group) + "W": 11.55, # Tryptophan (imidazole group) + "Y": 10.46, # Tyrosine (phenolic OH) + }, +): + aminoacid_pIs = [] + total_charge = 0.0 + # Calculate pI for each amino acid in the sequence while preserving case + for aa in sequence: + aa_upper = aa.upper() + if aa_upper in pKa_values: + pI = pKa_values[aa_upper] + + if aa.isupper(): + aminoacid_pIs.append((aa_upper, pI)) + else: + aminoacid_pIs.append((aa, pI)) + total_charge += pI + + # Calculate the overall pI of the sequence + overall_pI = total_charge / len(sequence) + overall_pI = round(overall_pI, 2) + + return aminoacid_pIs, overall_pI + + +def build_scoring_matrix(match_score, mismatch_score): + amino_acids = "ACDEFGHIKLMNPQRSTVWY" # Amino acid alphabet + + # Initialize an empty scoring matrix as a dictionary of dictionaries + scoring_matrix = {} + + for aa1 in amino_acids: + scoring_matrix[aa1] = {} + for aa2 in amino_acids: + scoring_matrix[aa1][aa2] = ( + match_score 
if aa1.upper() == aa2.upper() else mismatch_score + ) + + return scoring_matrix + + +def needleman_wunsch( + seq1, seq2, scoring_matrix=None, gap_penalty=-1, match_score=1, mismatch_score=-1 +): + if scoring_matrix is None: + # Default scoring matrix if not provided + scoring_matrix = build_scoring_matrix(match_score, mismatch_score) + + seq1_upper = seq1.upper() # Convert seq1 to uppercase + seq2_upper = seq2.upper() # Convert seq2 to uppercase + + m, n = len(seq1_upper), len(seq2_upper) + + # Initialize matrices + dp = [[0] * (n + 1) for _ in range(m + 1)] + traceback = [[""] * (n + 1) for _ in range(m + 1)] + + # Fill in the scoring matrix and traceback matrix + for i in range(1, m + 1): + for j in range(1, n + 1): + match = dp[i - 1][j - 1] + scoring_matrix.get(seq1_upper[i - 1], {}).get( + seq2_upper[j - 1], mismatch_score + ) + delete = dp[i - 1][j] + gap_penalty + insert = dp[i][j - 1] + gap_penalty + + dp[i][j] = max(match, delete, insert) + + if dp[i][j] == match: + traceback[i][j] = "D" # Diagonal (indicates a match/mismatch) + elif dp[i][j] == delete: + traceback[i][j] = "U" # Up (indicates a gap in seq2) + else: + traceback[i][j] = "L" # Left (indicates a gap in seq1) + + # Traceback to find the aligned sequences while preserving case + aligned_seq1, aligned_seq2 = [], [] + i, j = m, n + while i > 0 or j > 0: + if i > 0 and j > 0 and traceback[i][j] == "D": + if seq1[i - 1].isupper(): + aligned_seq1.append(seq1_upper[i - 1]) + else: + aligned_seq1.append(seq1[i - 1]) + if seq2[j - 1].isupper(): + aligned_seq2.append(seq2_upper[j - 1]) + else: + aligned_seq2.append(seq2[j - 1]) + i -= 1 + j -= 1 + elif i > 0 and traceback[i][j] == "U": + if seq1[i - 1].isupper(): + aligned_seq1.append(seq1_upper[i - 1]) + else: + aligned_seq1.append(seq1[i - 1]) + aligned_seq2.append("-") + i -= 1 + else: + aligned_seq1.append("-") + if seq2[j - 1].isupper(): + aligned_seq2.append(seq2_upper[j - 1]) + else: + aligned_seq2.append(seq2[j - 1]) + j -= 1 + + aligned_seq1 = 
"".join(reversed(aligned_seq1)) + aligned_seq2 = "".join(reversed(aligned_seq2)) + + return aligned_seq1, aligned_seq2, dp[m][n] From 27a72c7c94d4f1be5754a76f43669f577c8f4a86 Mon Sep 17 00:00:00 2001 From: Michil Trofimov Date: Wed, 27 Sep 2023 16:21:23 +0300 Subject: [PATCH 06/25] Add auxiliary function build_scoring_matrix for the needleman-wunsch function --- HW4_Trofimov/das_protein_tools.py | 111 +++++++++--------------------- 1 file changed, 34 insertions(+), 77 deletions(-) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index 5a340cc..38db14b 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -15,7 +15,21 @@ def calculate_pI( "W": 11.55, # Tryptophan (imidazole group) "Y": 10.46, # Tyrosine (phenolic OH) }, -): +) -> str: + """ + Calculates isoelectric point of a whole aminoacid sequence and for each aminoacid individually + + Args: + - sequence (str): sequence for which to calculate isoelectric point + - pKa_values (dict): acid dissociation constants for each aminoacid + + Return: + - str: string, which contains: + - an original sequence, + - list of tuple pairs of aminoacid and corresponding isoelectric point, + - overall isoelectric point of sequence + """ + aminoacid_pIs = [] total_charge = 0.0 @@ -35,90 +49,33 @@ def calculate_pI( overall_pI = total_charge / len(sequence) overall_pI = round(overall_pI, 2) - return aminoacid_pIs, overall_pI + return f"Sequence: {sequence}. 
Isoelectric point of each aminoacid: {aminoacid_pIs}, Sequence's isoelectric point: {overall_pI}" -def build_scoring_matrix(match_score, mismatch_score): - amino_acids = "ACDEFGHIKLMNPQRSTVWY" # Amino acid alphabet +def build_scoring_matrix( + match_score: int, + mismatch_score: int, + amino_acid_alphabet: str = "ACDEFGHIKLMNPQRSTVWY", +) -> dict: + """ + Build a default scoring matrix, if not provided in needleman-wunsch function parameter, as a dictionary of dictionaries + + Args: + - match_score (int): integer value of a matching score of aminoacids + - mismatch_score (int): integer value of a mismatching score of aminoacids + - amino_acid_alphabet (str): upper case amino acid alphabet + + Returns: + - dictionary of dictionaries of aminoacids scores. In which this dictionary contain aminoacid as a key and its value a dictionary of scores + """ - # Initialize an empty scoring matrix as a dictionary of dictionaries scoring_matrix = {} - for aa1 in amino_acids: + for aa1 in amino_acid_alphabet: scoring_matrix[aa1] = {} - for aa2 in amino_acids: + for aa2 in amino_acid_alphabet: scoring_matrix[aa1][aa2] = ( match_score if aa1.upper() == aa2.upper() else mismatch_score ) return scoring_matrix - - -def needleman_wunsch( - seq1, seq2, scoring_matrix=None, gap_penalty=-1, match_score=1, mismatch_score=-1 -): - if scoring_matrix is None: - # Default scoring matrix if not provided - scoring_matrix = build_scoring_matrix(match_score, mismatch_score) - - seq1_upper = seq1.upper() # Convert seq1 to uppercase - seq2_upper = seq2.upper() # Convert seq2 to uppercase - - m, n = len(seq1_upper), len(seq2_upper) - - # Initialize matrices - dp = [[0] * (n + 1) for _ in range(m + 1)] - traceback = [[""] * (n + 1) for _ in range(m + 1)] - - # Fill in the scoring matrix and traceback matrix - for i in range(1, m + 1): - for j in range(1, n + 1): - match = dp[i - 1][j - 1] + scoring_matrix.get(seq1_upper[i - 1], {}).get( - seq2_upper[j - 1], mismatch_score - ) - delete = dp[i - 1][j] + 
gap_penalty - insert = dp[i][j - 1] + gap_penalty - - dp[i][j] = max(match, delete, insert) - - if dp[i][j] == match: - traceback[i][j] = "D" # Diagonal (indicates a match/mismatch) - elif dp[i][j] == delete: - traceback[i][j] = "U" # Up (indicates a gap in seq2) - else: - traceback[i][j] = "L" # Left (indicates a gap in seq1) - - # Traceback to find the aligned sequences while preserving case - aligned_seq1, aligned_seq2 = [], [] - i, j = m, n - while i > 0 or j > 0: - if i > 0 and j > 0 and traceback[i][j] == "D": - if seq1[i - 1].isupper(): - aligned_seq1.append(seq1_upper[i - 1]) - else: - aligned_seq1.append(seq1[i - 1]) - if seq2[j - 1].isupper(): - aligned_seq2.append(seq2_upper[j - 1]) - else: - aligned_seq2.append(seq2[j - 1]) - i -= 1 - j -= 1 - elif i > 0 and traceback[i][j] == "U": - if seq1[i - 1].isupper(): - aligned_seq1.append(seq1_upper[i - 1]) - else: - aligned_seq1.append(seq1[i - 1]) - aligned_seq2.append("-") - i -= 1 - else: - aligned_seq1.append("-") - if seq2[j - 1].isupper(): - aligned_seq2.append(seq2_upper[j - 1]) - else: - aligned_seq2.append(seq2[j - 1]) - j -= 1 - - aligned_seq1 = "".join(reversed(aligned_seq1)) - aligned_seq2 = "".join(reversed(aligned_seq2)) - - return aligned_seq1, aligned_seq2, dp[m][n] From 399b7f2b90233e9e59df428b5d672a2ee8a8182e Mon Sep 17 00:00:00 2001 From: Michil Trofimov Date: Wed, 27 Sep 2023 16:43:45 +0300 Subject: [PATCH 07/25] Add needleman-wunsch function --- HW4_Trofimov/das_protein_tools.py | 102 +++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 2 deletions(-) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index 38db14b..0e4e948 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -58,7 +58,7 @@ def build_scoring_matrix( amino_acid_alphabet: str = "ACDEFGHIKLMNPQRSTVWY", ) -> dict: """ - Build a default scoring matrix, if not provided in needleman-wunsch function parameter, as a dictionary of dictionaries + Build 
a default scoring matrix, if not provided in needleman-wunsch function parameter Args: - match_score (int): integer value of a matching score of aminoacids @@ -66,7 +66,7 @@ def build_scoring_matrix( - amino_acid_alphabet (str): upper case amino acid alphabet Returns: - - dictionary of dictionaries of aminoacids scores. In which this dictionary contain aminoacid as a key and its value a dictionary of scores + - a dictionary of dictionaries representing a scoring matrix for aminoacids paris. Key of a dictionary is an aminoacid and its value is a dictionary of scores """ scoring_matrix = {} @@ -79,3 +79,101 @@ def build_scoring_matrix( ) return scoring_matrix + + +def needleman_wunsch( + seq1: str, + seq2: str, + scoring_matrix: dict = None, + gap_penalty: int = -1, + match_score: int = 1, + mismatch_score: int = -1, +) -> str: + """ + Uses Needleman-Wunsch algorithm to make a global alignment of two sequences. + + Args: + - seq1 (str): first aminoacid sequence for alignment + - seq2 (str): second aminoacid sequence for alignment + - scoring_matrix (dict): A dictionary representing a scoring matrix for amino acid pairs + If not provided, a default scoring_matrix is generated based on match and mismatch scores + - gap_penalty (int): integer va;ue of a penalty score for introducing a gap in the alignment + - match_score (int): integer value of a matching score for matching aminoacids + - mismatch_score (int): integer value of a mismatching score for mismatched aminoacids + + Returns: + - string: a string containing the aligned sequences (str), the aligned score (int) + """ + if scoring_matrix is None: + # Default scoring matrix if not provided + scoring_matrix = build_scoring_matrix(match_score, mismatch_score) + + seq1_upper = seq1.upper() # Convert seq1 to uppercase + seq2_upper = seq2.upper() # Convert seq2 to uppercase + + m, n = len(seq1_upper), len(seq2_upper) + + # Initialize matrices + dp = [[0] * (n + 1) for _ in range(m + 1)] + traceback = [[""] * (n + 1) for 
_ in range(m + 1)] + + # Fill in the scoring matrix and traceback matrix + for i in range(1, m + 1): + for j in range(1, n + 1): + match = dp[i - 1][j - 1] + scoring_matrix.get(seq1_upper[i - 1], {}).get( + seq2_upper[j - 1], mismatch_score + ) + delete = dp[i - 1][j] + gap_penalty + insert = dp[i][j - 1] + gap_penalty + + dp[i][j] = max(match, delete, insert) + + if dp[i][j] == match: + traceback[i][j] = "D" # Diagonal (indicates a match/mismatch) + elif dp[i][j] == delete: + traceback[i][j] = "U" # Up (indicates a gap in seq2) + else: + traceback[i][j] = "L" # Left (indicates a gap in seq1) + + # Traceback to find the aligned sequences while preserving case + aligned_seq1, aligned_seq2 = [], [] + i, j = m, n + while i > 0 or j > 0: + if i > 0 and j > 0 and traceback[i][j] == "D": + # check original case of amionacid in seq1 + if seq1[i - 1].isupper(): + aligned_seq1.append(seq1_upper[i - 1]) + else: + aligned_seq1.append(seq1[i - 1]) + + # check original case of amionacid in seq2 + if seq2[j - 1].isupper(): + aligned_seq2.append(seq2_upper[j - 1]) + else: + aligned_seq2.append(seq2[j - 1]) + + i -= 1 + j -= 1 + elif i > 0 and traceback[i][j] == "U": + # check original case of amionacid in seq1 + if seq1[i - 1].isupper(): + aligned_seq1.append(seq1_upper[i - 1]) + else: + aligned_seq1.append(seq1[i - 1]) + aligned_seq2.append("-") + + i -= 1 + else: + aligned_seq1.append("-") + # check original case of amionacid in seq2 + if seq2[j - 1].isupper(): + aligned_seq2.append(seq2_upper[j - 1]) + else: + aligned_seq2.append(seq2[j - 1]) + + j -= 1 + + aligned_seq1 = "".join(reversed(aligned_seq1)) + aligned_seq2 = "".join(reversed(aligned_seq2)) + + return f"{aligned_seq1}, {aligned_seq2}, final score: {dp[m][n]}" From 40af0025bef15eed7521af113b1fc35036ad5328 Mon Sep 17 00:00:00 2001 From: michtrofimov <92677906+michtrofimov@users.noreply.github.com> Date: Wed, 27 Sep 2023 17:03:09 +0300 Subject: [PATCH 08/25] Create README.md --- HW4_Trofimov/README.md | 14 
++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 HW4_Trofimov/README.md diff --git a/HW4_Trofimov/README.md b/HW4_Trofimov/README.md new file mode 100644 index 0000000..975059e --- /dev/null +++ b/HW4_Trofimov/README.md @@ -0,0 +1,14 @@ +Screenshot 2023-09-27 at 17 01 47 + +# Das biotools.aminoacids +> **The great and terrifying successor of biopython** + +Das biotools strikes again! Now it works only with aminoacid sequences! + +## Features + +- **calculate_pI**: Calculate the isoelectric point of a given amino acid sequence, both individually for each amino acid and for the entire sequence. + +- **build_scoring_matrix**: Build a scoring matrix for amino acid pairs, which can be used in sequence alignment algorithms. + +- **needleman_wunsch**: Implement the Needleman-Wunsch algorithm for global sequence alignment of two amino acid sequences. From 8612f1726773b61c033667d870ec394ea54acce9 Mon Sep 17 00:00:00 2001 From: Alisa Fedorenko Date: Thu, 28 Sep 2023 16:58:15 +0300 Subject: [PATCH 09/25] Add function calculating aminoacids fruquencies --- HW4_Trofimov/das_protein_tools.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index 0e4e948..b92795e 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -177,3 +177,27 @@ def needleman_wunsch( aligned_seq2 = "".join(reversed(aligned_seq2)) return f"{aligned_seq1}, {aligned_seq2}, final score: {dp[m][n]}" + +def calculate_aa_freq(seq: str) -> dict: + """ + Calculates the frequency of each amino acid in a protein sequence or sequences. 
+ + :param sequences: protein sequence or sequences + :type sequences: str or list of str + :return: dictionary with the frequency of each amino acid + :rtype: dict + """ + sequences = '' + + # Creating a dictionary with aminoacid frequencies: + amino_acid_frequency = {} + + for amino_acid in sequences: + # If the aminoacid has been already in: + if amino_acid in amino_acid_frequency: + amino_acid_frequency[amino_acid] += 1 + # If the aminoacid hasn't been already in: + else: + amino_acid_frequency[amino_acid] = 1 + + return amino_acid_frequency From 264e6cf8aa3ffc8adcf039ab84ec5200655e0680 Mon Sep 17 00:00:00 2001 From: ShakirSuleimanov Date: Thu, 28 Sep 2023 17:48:18 +0300 Subject: [PATCH 10/25] Add file with dictionaries --- HW4_Trofimov/protein_dict.py | 73 ++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 HW4_Trofimov/protein_dict.py diff --git a/HW4_Trofimov/protein_dict.py b/HW4_Trofimov/protein_dict.py new file mode 100644 index 0000000..b95c713 --- /dev/null +++ b/HW4_Trofimov/protein_dict.py @@ -0,0 +1,73 @@ +#Dictionary: keys - single-letter amino acid designations; values - list of RNA codons +aa_codon_dict = { + 'G': ['GGA', 'GGU', 'GGC', 'GGG'], + 'R': ['AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGU'], + 'S': ['AGC', 'AGU', 'UCA', 'UCC', 'UCG', 'UCU'], + 'E': ['GAA', 'GAG'], + 'P': ['CCA', 'CCC', 'CCG', 'CCU'], + 'L': ['CUA', 'CUC', 'CUG', 'CUU', 'UUA', 'UUG'], + 'V': ['GUA', 'GUC', 'GUG', 'GUU'], + 'T': ['ACA', 'ACC', 'ACG', 'ACU'], + 'A': ['GCA', 'GCC', 'GCG', 'GCU'], + 'I': ['AUA', 'AUC', 'AUU'], + 'F': ['UUC', 'UUU'], + 'H': ['CAC', 'CAU'], + 'Y': ['UAC', 'UAU'], + 'Q': ['CAA', 'CAG'], + 'C': ['UGC', 'UGU'], + 'N': ['AAC', 'AAU'], + 'D': ['GAC', 'GAU'], + 'K': ['AAA', 'AAG'], + 'M': ['AUG'], + 'W': ['UGG'], +} + + +#Dictionary: keys - single-letter amino acid designations; values - names of amino acids +aa_one_to_three_letter = { + 'A' : 'Ala-', + 'C' : 'Cys-', + 'D' : 'Asp-', + 'E' : 'Glu-', + 'F' : 'Phe-', + 'G' : 
'Gly-', + 'H' : 'His-', + 'I' : 'Ile-', + 'K' : 'Lys-', + 'L' : 'Leu-', + 'M' : 'Met-', + 'N' : 'Asn-', + 'P' : 'Pro-', + 'Q' : 'Gln-', + 'R' : 'Arg-', + 'S' : 'Ser-', + 'T' : 'Thr-', + 'V' : 'Val-', + 'W' : 'Trp-', + 'Y' : 'Tyr-', +} + + +# aminoacids mass dictionary +aa_monoistopic_mass_dict = { + 'A' : 71.03711, + 'C' : 103.00919, + 'D' : 115.02694, + 'E' : 129.04259, + 'F' : 147.06841, + 'G' : 57.02146, + 'H' : 137.05891, + 'I' : 113.08406, + 'K' : 128.09496, + 'L' : 113.08406, + 'M' : 131.04049, + 'N' : 114.04293, + 'P' : 97.05276, + 'Q' : 128.05858, + 'R' : 156.10111, + 'S' : 87.03203, + 'T' : 101.04768, + 'V' : 99.06841, + 'W' : 186.07931, + 'Y' : 163.06333, +} From 428a23145de85683b84246713897550a5651b7fb Mon Sep 17 00:00:00 2001 From: ShakirSuleimanov Date: Thu, 28 Sep 2023 17:50:08 +0300 Subject: [PATCH 11/25] Add modules import and is_protein function --- HW4_Trofimov/das_protein_tools.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index b92795e..53a995b 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -1,3 +1,24 @@ +# importing necessary modules +import protein_dict as pd +from random import choice + + +# Function to determine is the sequence is a protein or not +def is_protein(seq: str) -> bool: + """ + This function checks if the sequence is a protein or not + + Arguments: + seq (str): A sequence of aminoacids + + Output: + returns True or False + """ + unique_chars = set(seq) + aminoacids = set(pd.aa_monoistopic_mass_dict.keys()) + return bool(unique_chars <= aminoacids) + + def calculate_pI( sequence: str, pKa_values: dict = { From 7954e4afaa8938b3656ccfc6752c9241bf5527d9 Mon Sep 17 00:00:00 2001 From: ShakirSuleimanov Date: Thu, 28 Sep 2023 17:53:25 +0300 Subject: [PATCH 12/25] Add comments to previously written functions --- HW4_Trofimov/das_protein_tools.py | 7 +++++++ 1 file changed, 7 insertions(+) diff 
--git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index 53a995b..31fe1d8 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -19,6 +19,7 @@ def is_protein(seq: str) -> bool: return bool(unique_chars <= aminoacids) +# Function to calculate pI def calculate_pI( sequence: str, pKa_values: dict = { @@ -73,6 +74,7 @@ def calculate_pI( return f"Sequence: {sequence}. Isoelectric point of each aminoacid: {aminoacid_pIs}, Sequence's isoelectric point: {overall_pI}" +#Function to build scoring matrix for needleman_wunsch function def build_scoring_matrix( match_score: int, mismatch_score: int, @@ -102,6 +104,7 @@ def build_scoring_matrix( return scoring_matrix +# Function to perform alignment based on needleman_wunsch algorithm def needleman_wunsch( seq1: str, seq2: str, @@ -199,6 +202,8 @@ def needleman_wunsch( return f"{aligned_seq1}, {aligned_seq2}, final score: {dp[m][n]}" + +# Function to calculate frequency of unique aminoacid in the sequence def calculate_aa_freq(seq: str) -> dict: """ Calculates the frequency of each amino acid in a protein sequence or sequences. 
@@ -222,3 +227,5 @@ def calculate_aa_freq(seq: str) -> dict: amino_acid_frequency[amino_acid] = 1 return amino_acid_frequency + + From 93feb9f2a026cec29b5fe493a6c2b35b5cd98f56 Mon Sep 17 00:00:00 2001 From: ShakirSuleimanov Date: Thu, 28 Sep 2023 18:08:10 +0300 Subject: [PATCH 13/25] Add function to convert one-letter aminoacid sequence to three-letter --- HW4_Trofimov/das_protein_tools.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index 31fe1d8..4c4e4e8 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -229,3 +229,22 @@ def calculate_aa_freq(seq: str) -> dict: return amino_acid_frequency +# Convert one-letter protein sequence to three-letter protein sequence +def convert_to_3L_code(seq: str) -> str: + """ + This function takes one letter aminoacids sequence and convert's it to three leter coding + + Arguments: + seq (str): A sequence of aminoacids + + Output: + same sequence but in three-letter coding + """ + seq = seq.upper() + if is_protein(seq) is True: + sequence = ''.join(pd.aa_one_to_three_letter.get(aa) for aa in seq) + return sequence[:-1] + else: + raise ValueError("Sequence is not a protein, input should be protein") + + From 8d1dbb86bf1e414ee36b149691822161c7faf2ad Mon Sep 17 00:00:00 2001 From: ShakirSuleimanov Date: Thu, 28 Sep 2023 18:19:28 +0300 Subject: [PATCH 14/25] Add function to calculate protein mass --- HW4_Trofimov/das_protein_tools.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index 4c4e4e8..e1fcd4e 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -248,3 +248,22 @@ def convert_to_3L_code(seq: str) -> str: raise ValueError("Sequence is not a protein, input should be protein") +# Function to calculate protein mass +def protein_mass (seq: str) -> float: + """ + This 
function takes aminoacids sequence and counts it's summary molecular weight using monoisotopic masses + + Arguments: + seq (str): A sequence of aminoacids + + Output: + returns molecular weight + """ + seq = seq.upper() + if is_protein(seq) is True: + mass = sum(pd.aa_monoistopic_mass_dict.get(aa) for aa in seq) + return mass + else: + raise ValueError("Sequence is not a protein, input should be protein") + + From c24525987d28fd293376cf0feb5dff5392d53278 Mon Sep 17 00:00:00 2001 From: ShakirSuleimanov Date: Thu, 28 Sep 2023 18:34:17 +0300 Subject: [PATCH 15/25] Add function to translate protein sequence to RNA --- HW4_Trofimov/das_protein_tools.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index e1fcd4e..a5183d6 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -229,7 +229,7 @@ def calculate_aa_freq(seq: str) -> dict: return amino_acid_frequency -# Convert one-letter protein sequence to three-letter protein sequence +# Function to convert one-letter protein sequence to three-letter protein sequence def convert_to_3L_code(seq: str) -> str: """ This function takes one letter aminoacids sequence and convert's it to three leter coding @@ -267,3 +267,25 @@ def protein_mass (seq: str) -> float: raise ValueError("Sequence is not a protein, input should be protein") +# Function to translate Protein to RNA +def translate_protein_rna(seq: str) -> str: + """ + This function takes aminoacid sequence and translates in to the RNA. + As most of the aminoacids are coded with several different codons, + this function will take a random codon of the set for such aminoacids. 
+ + Arguments: + seq (str): A sequence of RNA molecule + + Output: + returns sequence of aminoacids + """ + seq = seq.upper() + if is_protein(seq) is True: + rna = '' + for aa in seq: + codon = choice(pd.aa_codon_dict.get(aa)) + rna += codon + return rna + else: + raise ValueError("Sequence is not a protein, input should be a protein") From e6b19dd3c687d09e0362997386ed443b53f9662c Mon Sep 17 00:00:00 2001 From: ShakirSuleimanov Date: Thu, 28 Sep 2023 23:01:39 +0300 Subject: [PATCH 16/25] Add description for my functions --- HW4_Trofimov/README.md | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/HW4_Trofimov/README.md b/HW4_Trofimov/README.md index 975059e..b201973 100644 --- a/HW4_Trofimov/README.md +++ b/HW4_Trofimov/README.md @@ -7,8 +7,26 @@ Das biotools strikes again! Now it works only with aminoacid sequences! ## Features -- **calculate_pI**: Calculate the isoelectric point of a given amino acid sequence, both individually for each amino acid and for the entire sequence. +- **calculate_pI()**: Calculate the isoelectric point of a given amino acid sequence, both individually for each amino acid and for the entire sequence. -- **build_scoring_matrix**: Build a scoring matrix for amino acid pairs, which can be used in sequence alignment algorithms. +- **build_scoring_matri())**: Build a scoring matrix for amino acid pairs, which can be used in sequence alignment algorithms. -- **needleman_wunsch**: Implement the Needleman-Wunsch algorithm for global sequence alignment of two amino acid sequences. +- **needleman_wunsch()**: Implement the Needleman-Wunsch algorithm for global sequence alignment of two amino acid sequences. + +- **convert_to_3L_code()**: Converts one letter animoacid sequence to three letter aminoacid sequence. + +```python +convert_to_3L_code('ACDEF') -> 'Ala-Cys-Asp-Glu-Phe' +``` + +- **protein_mass()**: Calculates molecular weight of the aminoacid sequence using monoisotopic masses. 
+ +```python +protein_mass('ACDEF') -> 565.184 +``` + +- **translate_protein_rna()**: Converts aminoacid sequence to RNA sequence. For those aminoacids that are coded with more than one codon, this function randomly chooses one codon from the set. + +```python +translate_protein_rna('ACDEF') -> 'GCCUGCGACGAGUUC' +``` \ No newline at end of file From 8b5985f6362f6d5bac3a1db428bc412535376fd4 Mon Sep 17 00:00:00 2001 From: Alisa Fedorenko Date: Fri, 29 Sep 2023 23:29:47 +0300 Subject: [PATCH 17/25] add main function --- HW4_Trofimov/das_protein_tools.py | 46 ++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index a5183d6..d179c94 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -3,6 +3,10 @@ from random import choice +AMINO_LETTERS = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', + 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] + + # Function to determine is the sequence is a protein or not def is_protein(seq: str) -> bool: """ @@ -12,7 +16,7 @@ def is_protein(seq: str) -> bool: seq (str): A sequence of aminoacids Output: - returns True or False + returns True or False """ unique_chars = set(seq) aminoacids = set(pd.aa_monoistopic_mass_dict.keys()) @@ -257,7 +261,7 @@ def protein_mass (seq: str) -> float: seq (str): A sequence of aminoacids Output: - returns molecular weight + returns molecular weight """ seq = seq.upper() if is_protein(seq) is True: @@ -265,20 +269,20 @@ def protein_mass (seq: str) -> float: return mass else: raise ValueError("Sequence is not a protein, input should be protein") - -# Function to translate Protein to RNA + +# Function to translate Protein to RNA def translate_protein_rna(seq: str) -> str: """ - This function takes aminoacid sequence and translates in to the RNA. 
- As most of the aminoacids are coded with several different codons, + This function takes aminoacid sequence and translates in to the RNA. + As most of the aminoacids are coded with several different codons, this function will take a random codon of the set for such aminoacids. Arguments: seq (str): A sequence of RNA molecule Output: - returns sequence of aminoacids + returns sequence of aminoacids """ seq = seq.upper() if is_protein(seq) is True: @@ -289,3 +293,31 @@ def translate_protein_rna(seq: str) -> str: return rna else: raise ValueError("Sequence is not a protein, input should be a protein") + + +def main(*args): + action = args[-1] + action_list = { + "calculate_pI": calculate_pI, + "build_scoring_matrix": build_scoring_matrix, + "needleman_wunsch": needleman_wunsch, + "calculate_aa_freq": calculate_aa_freq, + "translate_protein_rna": translate_protein_rna, + "convert_to_3L_code": convert_to_3L_code, + "protein_mass": protein_mass + } + + if action not in action_list: + raise ValueError(f"No such action: {action}") + + if not (action == "needleman_wunsch" and len(args) == 3 or + action != "needleman_wunsch" and len(args) == 2): + raise ValueError("Error in number of sequences") + + for sequence in args[:-1]: + if not all([letter.capitalize() in AMINO_LETTERS for letter in sequence]): + raise ValueError(f"The sequence is not protein sequence: {sequence}") + + result = action_list[action](*args[:-1]) + + return result From 15f5db5117649674a3c3f2c588498800c0d76cab Mon Sep 17 00:00:00 2001 From: Alisa Fedorenko Date: Sat, 30 Sep 2023 07:09:47 +0300 Subject: [PATCH 18/25] Add description and usage example of calculate_aa_freq function --- HW4_Trofimov/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/HW4_Trofimov/README.md b/HW4_Trofimov/README.md index d80ff30..ca1bea3 100644 --- a/HW4_Trofimov/README.md +++ b/HW4_Trofimov/README.md @@ -19,6 +19,8 @@ Das biotools strikes again! Now it works only with aminoacid sequences! 
- **translate_protein_rna()**: Converts aminoacid sequence to RNA sequence. For those aminoacids that are coded with more than one codon, this function randomly chooses one codon from the set. +- **calculate_aa_freq()**: Calculate the frequences of aminoacids in protein sequences. + ## Examples - **get_pI** @@ -50,3 +52,9 @@ protein_mass('ACDEF') -> 565.184 ```python translate_protein_rna('ACDEF') -> 'GCCUGCGACGAGUUC' ``` + +- **calculate_aa_freq** + +```python +calculate_aa_freq('ACDEF') -> {'A': 1, 'C': 1, 'D': 1, 'E': 1, 'F': 1} +``` From b45a72d41d525ac2983ba158639d7115b850d658 Mon Sep 17 00:00:00 2001 From: Michil Trofimov Date: Sat, 30 Sep 2023 15:39:32 +0300 Subject: [PATCH 19/25] Redo function for isoelectric point calc --- HW4_Trofimov/das_protein_tools.py | 73 +++++++-------- HW4_Trofimov/protein_dict.py | 148 +++++++++++++++++------------- 2 files changed, 118 insertions(+), 103 deletions(-) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index d179c94..cb90b5b 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -1,4 +1,4 @@ -# importing necessary modules +# importing necessary modules import protein_dict as pd from random import choice @@ -23,66 +23,53 @@ def is_protein(seq: str) -> bool: return bool(unique_chars <= aminoacids) -# Function to calculate pI -def calculate_pI( +# Function to get pI for each aa +def get_pI( sequence: str, - pKa_values: dict = { - "D": 3.86, # Aspartic acid (COOH side chain) - "E": 4.25, # Glutamic acid (COOH side chain) - "C": 8.33, # Cysteine (R-SH) - "Y": 10.46, # Tyrosine (phenolic OH) - "H": 6.0, # Histidine (imidazole group) - "K": 10.67, # Lysine (amino group) - "R": 12.48, # Arginine (guanidinium group) - "N": 3.22, # Asparagine (amino group) - "Q": 3.65, # Glutamine (amino group) - "T": 2.95, # Threonine (amino group) - "S": 2.19, # Serine (hydroxyl group) - "W": 11.55, # Tryptophan (imidazole group) - "Y": 10.46, # Tyrosine (phenolic OH) - }, 
+ pI_values: dict = None, ) -> str: """ - Calculates isoelectric point of a whole aminoacid sequence and for each aminoacid individually + Gives isoelectric point value for each aminoacid individually Args: - sequence (str): sequence for which to calculate isoelectric point - - pKa_values (dict): acid dissociation constants for each aminoacid + - pI_values (dict): acid dissociation constants for each aminoacid Return: - str: string, which contains: - an original sequence, - list of tuple pairs of aminoacid and corresponding isoelectric point, - - overall isoelectric point of sequence """ + if pI_values is None: + # Default pKa_values if not provided + pI_values = pd.aa_pI + aminoacid_pIs = [] - total_charge = 0.0 # Calculate pI for each amino acid in the sequence while preserving case + analysed_aa = [] for aa in sequence: aa_upper = aa.upper() - if aa_upper in pKa_values: - pI = pKa_values[aa_upper] - - if aa.isupper(): - aminoacid_pIs.append((aa_upper, pI)) - else: - aminoacid_pIs.append((aa, pI)) - total_charge += pI - - # Calculate the overall pI of the sequence - overall_pI = total_charge / len(sequence) - overall_pI = round(overall_pI, 2) + if aa_upper not in analysed_aa: + if aa_upper in pI_values: + pI = pI_values[aa_upper] + analysed_aa.append(aa_upper) + if aa.isupper(): + aminoacid_pIs.append((aa_upper, pI)) + else: + aminoacid_pIs.append((aa, pI)) + else: + continue - return f"Sequence: {sequence}. Isoelectric point of each aminoacid: {aminoacid_pIs}, Sequence's isoelectric point: {overall_pI}" + return f"Sequence: {sequence}. 
Isoelectric point of each aminoacid: {aminoacid_pIs}" -#Function to build scoring matrix for needleman_wunsch function +# Function to build scoring matrix for needleman_wunsch function def build_scoring_matrix( match_score: int, mismatch_score: int, - amino_acid_alphabet: str = "ACDEFGHIKLMNPQRSTVWY", + amino_acid_alphabet: str = None, ) -> dict: """ Build a default scoring matrix, if not provided in needleman-wunsch function parameter @@ -96,6 +83,10 @@ def build_scoring_matrix( - a dictionary of dictionaries representing a scoring matrix for aminoacids paris. Key of a dictionary is an aminoacid and its value is a dictionary of scores """ + if amino_acid_alphabet is None: + # Default pKa_values if not provided + amino_acid_alphabet = "ACDEFGHIKLMNPQRSTVWY" + scoring_matrix = {} for aa1 in amino_acid_alphabet: @@ -217,7 +208,7 @@ def calculate_aa_freq(seq: str) -> dict: :return: dictionary with the frequency of each amino acid :rtype: dict """ - sequences = '' + sequences = "" # Creating a dictionary with aminoacid frequencies: amino_acid_frequency = {} @@ -246,14 +237,14 @@ def convert_to_3L_code(seq: str) -> str: """ seq = seq.upper() if is_protein(seq) is True: - sequence = ''.join(pd.aa_one_to_three_letter.get(aa) for aa in seq) + sequence = "".join(pd.aa_one_to_three_letter.get(aa) for aa in seq) return sequence[:-1] else: raise ValueError("Sequence is not a protein, input should be protein") # Function to calculate protein mass -def protein_mass (seq: str) -> float: +def protein_mass(seq: str) -> float: """ This function takes aminoacids sequence and counts it's summary molecular weight using monoisotopic masses @@ -286,7 +277,7 @@ def translate_protein_rna(seq: str) -> str: """ seq = seq.upper() if is_protein(seq) is True: - rna = '' + rna = "" for aa in seq: codon = choice(pd.aa_codon_dict.get(aa)) rna += codon diff --git a/HW4_Trofimov/protein_dict.py b/HW4_Trofimov/protein_dict.py index b95c713..a7e05c2 100644 --- a/HW4_Trofimov/protein_dict.py +++ 
b/HW4_Trofimov/protein_dict.py @@ -1,73 +1,97 @@ -#Dictionary: keys - single-letter amino acid designations; values - list of RNA codons +# Dictionary: keys - single-letter amino acid designations; values - list of RNA codons aa_codon_dict = { - 'G': ['GGA', 'GGU', 'GGC', 'GGG'], - 'R': ['AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGU'], - 'S': ['AGC', 'AGU', 'UCA', 'UCC', 'UCG', 'UCU'], - 'E': ['GAA', 'GAG'], - 'P': ['CCA', 'CCC', 'CCG', 'CCU'], - 'L': ['CUA', 'CUC', 'CUG', 'CUU', 'UUA', 'UUG'], - 'V': ['GUA', 'GUC', 'GUG', 'GUU'], - 'T': ['ACA', 'ACC', 'ACG', 'ACU'], - 'A': ['GCA', 'GCC', 'GCG', 'GCU'], - 'I': ['AUA', 'AUC', 'AUU'], - 'F': ['UUC', 'UUU'], - 'H': ['CAC', 'CAU'], - 'Y': ['UAC', 'UAU'], - 'Q': ['CAA', 'CAG'], - 'C': ['UGC', 'UGU'], - 'N': ['AAC', 'AAU'], - 'D': ['GAC', 'GAU'], - 'K': ['AAA', 'AAG'], - 'M': ['AUG'], - 'W': ['UGG'], + "G": ["GGA", "GGU", "GGC", "GGG"], + "R": ["AGA", "AGG", "CGA", "CGC", "CGG", "CGU"], + "S": ["AGC", "AGU", "UCA", "UCC", "UCG", "UCU"], + "E": ["GAA", "GAG"], + "P": ["CCA", "CCC", "CCG", "CCU"], + "L": ["CUA", "CUC", "CUG", "CUU", "UUA", "UUG"], + "V": ["GUA", "GUC", "GUG", "GUU"], + "T": ["ACA", "ACC", "ACG", "ACU"], + "A": ["GCA", "GCC", "GCG", "GCU"], + "I": ["AUA", "AUC", "AUU"], + "F": ["UUC", "UUU"], + "H": ["CAC", "CAU"], + "Y": ["UAC", "UAU"], + "Q": ["CAA", "CAG"], + "C": ["UGC", "UGU"], + "N": ["AAC", "AAU"], + "D": ["GAC", "GAU"], + "K": ["AAA", "AAG"], + "M": ["AUG"], + "W": ["UGG"], } -#Dictionary: keys - single-letter amino acid designations; values - names of amino acids +# Dictionary: keys - single-letter amino acid designations; values - names of amino acids aa_one_to_three_letter = { - 'A' : 'Ala-', - 'C' : 'Cys-', - 'D' : 'Asp-', - 'E' : 'Glu-', - 'F' : 'Phe-', - 'G' : 'Gly-', - 'H' : 'His-', - 'I' : 'Ile-', - 'K' : 'Lys-', - 'L' : 'Leu-', - 'M' : 'Met-', - 'N' : 'Asn-', - 'P' : 'Pro-', - 'Q' : 'Gln-', - 'R' : 'Arg-', - 'S' : 'Ser-', - 'T' : 'Thr-', - 'V' : 'Val-', - 'W' : 'Trp-', - 'Y' : 'Tyr-', + "A": 
"Ala-", + "C": "Cys-", + "D": "Asp-", + "E": "Glu-", + "F": "Phe-", + "G": "Gly-", + "H": "His-", + "I": "Ile-", + "K": "Lys-", + "L": "Leu-", + "M": "Met-", + "N": "Asn-", + "P": "Pro-", + "Q": "Gln-", + "R": "Arg-", + "S": "Ser-", + "T": "Thr-", + "V": "Val-", + "W": "Trp-", + "Y": "Tyr-", } # aminoacids mass dictionary aa_monoistopic_mass_dict = { - 'A' : 71.03711, - 'C' : 103.00919, - 'D' : 115.02694, - 'E' : 129.04259, - 'F' : 147.06841, - 'G' : 57.02146, - 'H' : 137.05891, - 'I' : 113.08406, - 'K' : 128.09496, - 'L' : 113.08406, - 'M' : 131.04049, - 'N' : 114.04293, - 'P' : 97.05276, - 'Q' : 128.05858, - 'R' : 156.10111, - 'S' : 87.03203, - 'T' : 101.04768, - 'V' : 99.06841, - 'W' : 186.07931, - 'Y' : 163.06333, + "A": 71.03711, + "C": 103.00919, + "D": 115.02694, + "E": 129.04259, + "F": 147.06841, + "G": 57.02146, + "H": 137.05891, + "I": 113.08406, + "K": 128.09496, + "L": 113.08406, + "M": 131.04049, + "N": 114.04293, + "P": 97.05276, + "Q": 128.05858, + "R": 156.10111, + "S": 87.03203, + "T": 101.04768, + "V": 99.06841, + "W": 186.07931, + "Y": 163.06333, +} + +# aminoacids pI (isoelectric point) values dictionary +aa_pI = { + "A": 6.0, # Alanine + "R": 10.8, # Arginine + "N": 5.4, # Asparagine + "D": 2.8, # Aspartic Acid + "C": 5.0, # Cysteine + "E": 3.2, # Glutamic Acid + "Q": 5.7, # Glutamine + "G": 6.1, # Glycine + "H": 7.6, # Histidine + "I": 6.0, # Isoleucine + "L": 6.0, # Leucine + "K": 9.7, # Lysine + "M": 5.7, # Methionine + "F": 5.5, # Phenylalanine + "P": 6.3, # Proline + "S": 5.7, # Serine + "T": 5.6, # Threonine + "W": 5.9, # Tryptophan + "Y": 5.7, # Tyrosine + "V": 6.0, # Valine } From 1a4a8e5cfa378182f6f544029c9079ecdb88b693 Mon Sep 17 00:00:00 2001 From: Michil Trofimov Date: Sat, 30 Sep 2023 16:04:11 +0300 Subject: [PATCH 20/25] Add build_scoring_matrix in main func --- HW4_Trofimov/das_protein_tools.py | 38 +++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/HW4_Trofimov/das_protein_tools.py 
b/HW4_Trofimov/das_protein_tools.py index cb90b5b..09da19c 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -3,8 +3,28 @@ from random import choice -AMINO_LETTERS = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', - 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] +AMINO_LETTERS = [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", +] # Function to determine is the sequence is a protein or not @@ -289,20 +309,24 @@ def translate_protein_rna(seq: str) -> str: def main(*args): action = args[-1] action_list = { - "calculate_pI": calculate_pI, - "build_scoring_matrix": build_scoring_matrix, + "get_pI": get_pI, "needleman_wunsch": needleman_wunsch, + "build_scoring_matrix": build_scoring_matrix, "calculate_aa_freq": calculate_aa_freq, "translate_protein_rna": translate_protein_rna, "convert_to_3L_code": convert_to_3L_code, - "protein_mass": protein_mass + "protein_mass": protein_mass, } if action not in action_list: raise ValueError(f"No such action: {action}") - if not (action == "needleman_wunsch" and len(args) == 3 or - action != "needleman_wunsch" and len(args) == 2): + if not ( + action == "needleman_wunsch" + and len(args) == 3 + or action != "needleman_wunsch" + and len(args) == 2 + ): raise ValueError("Error in number of sequences") for sequence in args[:-1]: From fcef1e43a1b939b913b8b65edfe68b2ac25d138e Mon Sep 17 00:00:00 2001 From: Michil Trofimov Date: Sat, 30 Sep 2023 16:19:34 +0300 Subject: [PATCH 21/25] Fix bugs calculate_aa_freq --- HW4_Trofimov/das_protein_tools.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index 09da19c..334e93d 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -219,7 +219,7 @@ def needleman_wunsch( # Function to calculate frequency of unique aminoacid in the sequence 
-def calculate_aa_freq(seq: str) -> dict: +def calculate_aa_freq(sequences: str) -> dict: """ Calculates the frequency of each amino acid in a protein sequence or sequences. @@ -228,7 +228,6 @@ def calculate_aa_freq(seq: str) -> dict: :return: dictionary with the frequency of each amino acid :rtype: dict """ - sequences = "" # Creating a dictionary with aminoacid frequencies: amino_acid_frequency = {} From f045f2305d69fb09629c41785feeec9f930f2bf9 Mon Sep 17 00:00:00 2001 From: michtrofimov <92677906+michtrofimov@users.noreply.github.com> Date: Sat, 30 Sep 2023 16:26:35 +0300 Subject: [PATCH 22/25] Update README.md --- HW4_Trofimov/README.md | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/HW4_Trofimov/README.md b/HW4_Trofimov/README.md index b201973..d80ff30 100644 --- a/HW4_Trofimov/README.md +++ b/HW4_Trofimov/README.md @@ -7,26 +7,46 @@ Das biotools strikes again! Now it works only with aminoacid sequences! ## Features -- **calculate_pI()**: Calculate the isoelectric point of a given amino acid sequence, both individually for each amino acid and for the entire sequence. +- **get_pI()**: Calculate the isoelectric point of a given amino acid sequence, both individually for each amino acid and for the entire sequence. -- **build_scoring_matri())**: Build a scoring matrix for amino acid pairs, which can be used in sequence alignment algorithms. +- **build_scoring_matri()**: Build a scoring matrix for amino acid pairs, which can be used in sequence alignment algorithms. - **needleman_wunsch()**: Implement the Needleman-Wunsch algorithm for global sequence alignment of two amino acid sequences. - **convert_to_3L_code()**: Converts one letter animoacid sequence to three letter aminoacid sequence. +- **protein_mass()**: Calculates molecular weight of the aminoacid sequence using monoisotopic masses. + +- **translate_protein_rna()**: Converts aminoacid sequence to RNA sequence. 
For those aminoacids that are coded with more than one codon, this function randomly chooses one codon from the set. + +## Examples + +- **get_pI** + +```python +get_pI('RAHP') -> "Sequence: RAHP. Isoelectric point of each aminoacid: [('R', 10.8), ('A', 6.0), ('H', 7.6), ('P', 6.3)]" +``` + +- **needleman_wunsch** + +```python +needleman_wunsch('raHP','RAQQHP') -> 'ra--HP, RAQQHP, final score: 2' +``` + +- **convert_to_3L_code** + ```python convert_to_3L_code('ACDEF') -> 'Ala-Cys-Asp-Glu-Phe' ``` -- **protein_mass()**: Calculates molecular weight of the aminoacid sequence using monoisotopic masses. +- **protein_mass** ```python protein_mass('ACDEF') -> 565.184 ``` -- **translate_protein_rna()**: Converts aminoacid sequence to RNA sequence. For those aminoacids that are coded with more than one codon, this function randomly chooses one codon from the set. +- **translate_protein_rna** ```python translate_protein_rna('ACDEF') -> 'GCCUGCGACGAGUUC' -``` \ No newline at end of file +``` From 1b18fb7f2ed01a33b16a9ecc455b1a64b453aac4 Mon Sep 17 00:00:00 2001 From: michtrofimov <92677906+michtrofimov@users.noreply.github.com> Date: Sat, 30 Sep 2023 16:30:49 +0300 Subject: [PATCH 23/25] Update README.md --- HW4_Trofimov/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Trofimov/README.md b/HW4_Trofimov/README.md index d80ff30..b98e466 100644 --- a/HW4_Trofimov/README.md +++ b/HW4_Trofimov/README.md @@ -7,7 +7,7 @@ Das biotools strikes again! Now it works only with aminoacid sequences! ## Features -- **get_pI()**: Calculate the isoelectric point of a given amino acid sequence, both individually for each amino acid and for the entire sequence. +- **get_pI()**: Gives isoelectric point value for each aminoacid individually. - **build_scoring_matri()**: Build a scoring matrix for amino acid pairs, which can be used in sequence alignment algorithms.
From 8edf4b5ada925dd1ae53e56e937600e14ad8d2ae Mon Sep 17 00:00:00 2001 From: michtrofimov <92677906+michtrofimov@users.noreply.github.com> Date: Sat, 30 Sep 2023 16:42:08 +0300 Subject: [PATCH 24/25] Update README.md --- HW4_Trofimov/README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/HW4_Trofimov/README.md b/HW4_Trofimov/README.md index b98e466..c2660a9 100644 --- a/HW4_Trofimov/README.md +++ b/HW4_Trofimov/README.md @@ -9,7 +9,7 @@ Das biotools strikes again! Now it works only with aminoacid sequences! - **get_pI()**: Gives isoelectric point value for each aminoacid individually. -- **build_scoring_matri()**: Build a scoring matrix for amino acid pairs, which can be used in sequence alignment algorithms. +- **build_scoring_matrix()**: Auxiliary function for needleman_wunsch. Build a scoring matrix for amino acid pairs, which can be used in sequence alignment algorithms. - **needleman_wunsch()**: Implement the Needleman-Wunsch algorithm for global sequence alignment of two amino acid sequences.
@@ -50,3 +50,11 @@ protein_mass('ACDEF') -> 565.184 ```python translate_protein_rna('ACDEF') -> 'GCCUGCGACGAGUUC' ``` + +## OUR TEAM +Screenshot 2023-09-30 at 16 34 25 + +Top to bottom, left to right: +- Alisa Fedorenko: functions **main**, **calculate_aa_freq** +- Michil Trofimov: functions **get_pI**, **needleman_wunsch** (teamlead) +- Shakir Suleimanov: functions **convert_to_3L_code**, **protein_mass**, **translate_protein_rna** From e52a38e0fbf00042f57c700a3323b3817a10e63d Mon Sep 17 00:00:00 2001 From: Michil Trofimov Date: Sat, 30 Sep 2023 16:45:28 +0300 Subject: [PATCH 25/25] Add docstring to main function --- HW4_Trofimov/das_protein_tools.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py index 334e93d..3ae64ed 100644 --- a/HW4_Trofimov/das_protein_tools.py +++ b/HW4_Trofimov/das_protein_tools.py @@ -305,7 +305,31 @@ def translate_protein_rna(seq: str) -> str: raise ValueError("Sequence is not a protein, input should be a protein") -def main(*args): +def main(*args: str): + """ + Main function to perform various actions on protein sequences. + + Args: + - *args: Variable number of arguments. The first n-1 arguments should be protein sequences, + and the last argument should be a string specifying the action to be performed. + + Returns: + - The result of the specified action on the input protein sequences. + + Raises: + - ValueError: If the specified action is not supported or if there is an error in the number of sequences. + Also raised if the input sequences are not valid protein sequences. + + Supported Actions: + - "get_pI": Calculate isoelectric points for each amino acid in the sequence. + - "needleman_wunsch": Perform global alignment of two sequences using the Needleman-Wunsch algorithm. + - "build_scoring_matrix": Build a scoring matrix for amino acid pairs.
+ - "calculate_aa_freq": Calculate the frequency of each amino acid in a protein sequence. + - "translate_protein_rna": Translate amino acid sequence to RNA, using random codons for each amino acid. + - "convert_to_3L_code": Convert one-letter amino acid sequence to three-letter coding. + - "protein_mass": Calculate the molecular weight of the protein sequence. + """ + action = args[-1] action_list = { "get_pI": get_pI,