From d98ccaf8f9e3784252127c2639c2c843a9188dce Mon Sep 17 00:00:00 2001 From: Nikita Date: Tue, 26 Sep 2023 07:14:08 +0300 Subject: [PATCH 01/30] Initial commit --- HW4_Sapozhnikov/README.md | 19 +++++++++++++++++++ HW4_Sapozhnikov/prototool.py | 16 ++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 HW4_Sapozhnikov/README.md create mode 100644 HW4_Sapozhnikov/prototool.py diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md new file mode 100644 index 0000000..8a4222d --- /dev/null +++ b/HW4_Sapozhnikov/README.md @@ -0,0 +1,19 @@ +# HW 4. Functions 2 +> *This is the repo for the fourth homework of the BI Python 2023 course* + +### Title + +### Overview + +### Usage + +### Options + +### Examples + +### Troubleshooting + +### Contacts + +Feel free to report any bugs and problems encountered. +Email: nikita.sapozhnikov1@gmail.com \ No newline at end of file diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py new file mode 100644 index 0000000..98ca0a0 --- /dev/null +++ b/HW4_Sapozhnikov/prototool.py @@ -0,0 +1,16 @@ +def main(): + """ + an entry point to the tool + + This tool provides the following functionality: + - local alignment of two sequences + - ... + + To get started choose one of the possible programms to run: + 1. Local alignment + Enter two protein sequences in 1- letter encoding. The code will return alignment scores and + sequences aligned on each other. + 2. ... 
+ + """ + pass \ No newline at end of file From 42c89b3c65607ba418caab66751abb4096efc343 Mon Sep 17 00:00:00 2001 From: Nikita Date: Tue, 26 Sep 2023 07:46:38 +0300 Subject: [PATCH 02/30] Add local alignment function --- HW4_Sapozhnikov/prototool.py | 96 ++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 98ca0a0..4564f37 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -1,3 +1,99 @@ +def local_alignment(seq1: str, seq2: str, match=2, mismatch=-1, gap=-1, prettify: bool=True) -> list: + """ + perform a local alignment of 2 given sequences + + Args: + - seq1, seq2 (str) - sequences to align + - match, mismatch, gap (int) - alignment scoring and penalty values + defaulted to 2, -1, -1 + + Returns: + - a a dictionary of {'aligned_seq1':aligned_seq1, + 'aligned_seq2':aligned_seq2, + 'alignment_score':alignment_score} + """ + + m, n = len(seq1), len(seq2) + + # Initialize the score matrix and traceback matrix + score_matrix = [[0] * (n + 1) for _ in range(m + 1)] + traceback_matrix = [[None] * (n + 1) for _ in range(m + 1)] + + alignment_score = 0 # To keep track of the maximum score in the matrix + max_i, max_j = 0, 0 # To store the position of the maximum score + + # Fill in the score matrix + for i in range(1, m + 1): + for j in range(1, n + 1): + if seq1[i - 1] == seq2[j - 1]: + match_score = score_matrix[i - 1][j - 1] + match + else: + match_score = score_matrix[i - 1][j - 1] + mismatch + + delete_score = score_matrix[i - 1][j] + gap + insert_score = score_matrix[i][j - 1] + gap + + # Calculate the maximum score for the current cell + score = max(0, match_score, delete_score, insert_score) + + # Update the score matrix and traceback matrix + score_matrix[i][j] = score + + if score > alignment_score: + alignment_score = score + max_i, max_j = i, j + + if score == match_score: + traceback_matrix[i][j] = "match" + elif score == delete_score: 
+ traceback_matrix[i][j] = "delete" + elif score == insert_score: + traceback_matrix[i][j] = "insert" + else: + traceback_matrix[i][j] = "none" + + # Traceback to find the aligned sequences + aligned_seq1 = [] + aligned_seq2 = [] + + i, j = max_i, max_j + + while i > 0 and j > 0: + if traceback_matrix[i][j] == "match": + aligned_seq1.append(seq1[i - 1]) + aligned_seq2.append(seq2[j - 1]) + i -= 1 + j -= 1 + elif traceback_matrix[i][j] == "delete": + aligned_seq1.append(seq1[i - 1]) + aligned_seq2.append("-") + i -= 1 + elif traceback_matrix[i][j] == "insert": + aligned_seq1.append("-") + aligned_seq2.append(seq2[j - 1]) + j -= 1 + else: + break + + # Reverse the aligned sequences + aligned_seq1 = "".join(aligned_seq1[::-1]) + aligned_seq2 = "".join(aligned_seq2[::-1]) + + # Form an output dictionary + alignment_dict = {'aligned_seq1':aligned_seq1, + 'aligned_seq2':aligned_seq2, + 'alignment_score':alignment_score} + + # Prettify an alignment output + seq_on = (seq1 if seq1 <= seq2 else seq2) + if prettify == True: + prettify_alignment(seq_on, alignment_dict) + else: + pass + + return alignment_dict + + def main(): """ an entry point to the tool From 873f7d76cd0f5f3002d3c4055efe01b380b5ee49 Mon Sep 17 00:00:00 2001 From: Nikita Date: Thu, 28 Sep 2023 13:39:11 +0300 Subject: [PATCH 03/30] Add local alignment functionality Add check_input() to check the validity of the input in main() Add local_alignment() to perform Smith-Waterman algorithm Add prettify_alignment() to prettify the view of an alignment --- HW4_Sapozhnikov/prototool.py | 210 +++++++++++++++++++++++++++-------- 1 file changed, 165 insertions(+), 45 deletions(-) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 4564f37..799df57 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -1,44 +1,80 @@ -def local_alignment(seq1: str, seq2: str, match=2, mismatch=-1, gap=-1, prettify: bool=True) -> list: +from typing import List, Optional, Tuple, Union + + 
+def prettify_alignment(aligned_seq_on: str, aligned_seq2: str) -> None: + """ + Prettifies alignment output by printing out two + sequences on top of each other + + Finds the start of aligned sequence in the longer of sequences.\\ + Prints the longer sequence as an upper one and aligned sequence + is bellow separated via vertical lines + + Args: + - aligned_seq_on, aligned_seq2 - sequences + from the local_alignment() + + Returns: + None \\ + Prints out the prettified view in stdout + """ + + print(aligned_seq_on) + print('|' * len(aligned_seq2)) + print(aligned_seq2) + + +def local_alignment(seq_on: str, + seq2: Union[List[str], str], + alignment_dict: dict, + seq_id: int, + match=2, + mismatch=-1, + gap=-1, + prettify: bool = True) -> dict: """ - perform a local alignment of 2 given sequences + Perform a local alignment of 2 given sequences Args: - - seq1, seq2 (str) - sequences to align - - match, mismatch, gap (int) - alignment scoring and penalty values + - seq_on - the sequence to align onto + - seq2 - sequences to align + - alignment_dict - a dictionary to yield alignment results + - match, mismatch, gap - alignment scoring and penalty values defaulted to 2, -1, -1 + - prettify - if True (default) prints out the prettified version + of sequences aligned on top of each other + - seq_id - itterator for a seq list Returns: - - a a dictionary of {'aligned_seq1':aligned_seq1, - 'aligned_seq2':aligned_seq2, - 'alignment_score':alignment_score} + - a a dictionary with alignment resluts """ - m, n = len(seq1), len(seq2) - + len_seq_on, len_seq2 = len(seq_on), len(seq2) + # Initialize the score matrix and traceback matrix - score_matrix = [[0] * (n + 1) for _ in range(m + 1)] - traceback_matrix = [[None] * (n + 1) for _ in range(m + 1)] - + score_matrix = [[0] * (len_seq2 + 1) for _ in range(len_seq_on + 1)] + traceback_matrix = [[None] * (len_seq2 + 1) for _ in range(len_seq_on + 1)] + alignment_score = 0 # To keep track of the maximum score in the matrix max_i, 
max_j = 0, 0 # To store the position of the maximum score # Fill in the score matrix - for i in range(1, m + 1): - for j in range(1, n + 1): - if seq1[i - 1] == seq2[j - 1]: + for i in range(1, len_seq_on + 1): + for j in range(1, len_seq2 + 1): + if seq_on[i - 1] == seq2[j - 1]: match_score = score_matrix[i - 1][j - 1] + match else: match_score = score_matrix[i - 1][j - 1] + mismatch - + delete_score = score_matrix[i - 1][j] + gap insert_score = score_matrix[i][j - 1] + gap - + # Calculate the maximum score for the current cell score = max(0, match_score, delete_score, insert_score) - + # Update the score matrix and traceback matrix score_matrix[i][j] = score - + if score > alignment_score: alignment_score = score max_i, max_j = i, j @@ -51,62 +87,146 @@ def local_alignment(seq1: str, seq2: str, match=2, mismatch=-1, gap=-1, prettify traceback_matrix[i][j] = "insert" else: traceback_matrix[i][j] = "none" - + # Traceback to find the aligned sequences - aligned_seq1 = [] + aligned_seq_on = [] aligned_seq2 = [] - + + counter_identity: int = 0 + counter_gaps: int = 0 + i, j = max_i, max_j - + while i > 0 and j > 0: if traceback_matrix[i][j] == "match": - aligned_seq1.append(seq1[i - 1]) + aligned_seq_on.append(seq_on[i - 1]) aligned_seq2.append(seq2[j - 1]) + counter_identity += 1 i -= 1 j -= 1 elif traceback_matrix[i][j] == "delete": - aligned_seq1.append(seq1[i - 1]) + aligned_seq_on.append(seq_on[i - 1]) aligned_seq2.append("-") + counter_gaps += 1 i -= 1 elif traceback_matrix[i][j] == "insert": - aligned_seq1.append("-") + aligned_seq_on.append("-") aligned_seq2.append(seq2[j - 1]) + counter_gaps += 1 j -= 1 else: break - + # Reverse the aligned sequences - aligned_seq1 = "".join(aligned_seq1[::-1]) + aligned_seq_on = "".join(aligned_seq_on[::-1]) aligned_seq2 = "".join(aligned_seq2[::-1]) + alignment_length = (len(aligned_seq_on) + if len(aligned_seq_on) < len(aligned_seq2) + else len(aligned_seq2)) + # Form an output dictionary - alignment_dict = 
{'aligned_seq1':aligned_seq1, - 'aligned_seq2':aligned_seq2, - 'alignment_score':alignment_score} + alignment_dict['aligned_seq_on'] = aligned_seq_on + + identity = round(counter_identity/alignment_length, 4) + + alignment_dict[f'aligned_seq{seq_id+1}'] = {'seq': aligned_seq2, + 'length': alignment_length, + 'score': alignment_score, + 'identity': identity, + 'gaps': counter_gaps} # Prettify an alignment output - seq_on = (seq1 if seq1 <= seq2 else seq2) - if prettify == True: - prettify_alignment(seq_on, alignment_dict) - else: + if prettify is True: + prettify_alignment(aligned_seq_on, aligned_seq2) + else: pass return alignment_dict -def main(): +def check_input(*args: List[str]) -> Tuple[List[str], + str, + Optional[str]]: + """ + Function to check the validity of the input. + + Args: + *args - are supposed to be all sequences to process and the method to + process with. + The method is supposed to be the last argument. + + Returns: + - seqs_list - list of sequences + - method - a chosen method to use + - seq_on (optional) - in case of local_alignment method """ - an entry point to the tool - This tool provides the following functionality: - - local alignment of two sequences - - ... + if len(args) < 1: + # Handle the case where there are no arguments + raise ValueError("No input defined.") + else: + # Check the last element of the input is a valid method + method = args[-1] + if method not in ['local_alignment', '', '', '', '']: + raise ValueError(method, " is not a valid method.") + else: + # Form a list with sequences from the input + seqs_list = list(args[:-1]) + if method == 'local_alignment': + seq_on = seqs_list.pop(0) + return seqs_list, method, seq_on + seq_on = None + return seqs_list, method, seq_on + - To get started choose one of the possible programms to run: - 1. Local alignment - Enter two protein sequences in 1- letter encoding. The code will return alignment scores and - sequences aligned on each other. 
+def main(*args: Tuple[Union[List[str], str], str]) -> dict: + """ + This function provides the access to the following methods: + 1. Local Alignment of two sequences - the last argument: 'local_alignment' + - needs at least 2 protein sequences 1-letter encoded. + When more than 2 sequences are passed, uses the first + entered sequence to align the rest on + - performs an alignment using Smith-Waterman algorithm 2. ... + 3. ... + 4. ... + 5. ... + + Args: + *args - are supposed to be all sequences to process and the method + to process with. + The method is supposed to be the last argument. + Returns: + function_result - result of a chosen function """ - pass \ No newline at end of file + + seqs_list, method, seq_on = check_input(*args) + print(seqs_list, method, seq_on) + + match method: + + case 'local_alignment': + + alignment_dict: dict = {} + for seq_id, seq in enumerate(seqs_list): + function_result = local_alignment(seq_on=seq_on, + seq2=seq, + alignment_dict=alignment_dict, + seq_id=seq_id, + prettify=True) + + case '': + + pass + + case _: + + function_result = None + + return function_result + + +test = main("CGTAGTCGATGCTG", "AGTCGTACAT", "ATGRC", "local_alignment") +print(test) From 86f281b6af543897ea9d49cd66a68b8b985e06e1 Mon Sep 17 00:00:00 2001 From: Daria Date: Thu, 28 Sep 2023 19:18:50 +0300 Subject: [PATCH 04/30] start development from_proteins_seqs_to_rna function --- .idea/.gitignore | 3 ++ .idea/HW4_Sapozhnikov.iml | 8 +++++ .../inspectionProfiles/profiles_settings.xml | 6 ++++ .idea/misc.xml | 4 +++ .idea/modules.xml | 8 +++++ .idea/vcs.xml | 6 ++++ HW4_Sapozhnikov/prototool.py | 35 ++++++++++++++++++- 7 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/HW4_Sapozhnikov.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore 
b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/HW4_Sapozhnikov.iml b/.idea/HW4_Sapozhnikov.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/HW4_Sapozhnikov.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..a971a2c --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..acbf176 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 98ca0a0..178b336 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -13,4 +13,37 @@ def main(): 2. ... 
""" - pass \ No newline at end of file + pass + + +def from_proteins_seqs_to_rna(*seqs, tool='RNA'): + PROTEIN_TO_RNA_COMBINATION = { + 'Ala': {'GCU', 'GCC', 'GCA', 'GCG'}, + 'Arg': {'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'}, + 'Asn': {'AAU', 'AAC'}, + 'Asp': {'GAU', 'GAC'}, + 'Cys': {'UGU', 'UGC'}, + 'Glu': {'GAA', 'GAG'}, + 'Gln': {'CAA', 'CAG'}, + 'Gly': {'GGU', 'GGC', 'GGA', 'GGG'}, + 'His': {'CAU', 'CAC'}, + 'Ile': {'AUU', 'AUC', 'AUA'}, + 'Leu': {'CUU', 'CUC', 'CUA', 'CUG'}, + 'Lys': {'AAA', 'AAG'}, + 'Met': {'AUG'}, + 'Phe': {'UUU', 'UUC'}, + 'Pro': {'CCU', 'CCC', 'CCA', 'CCG'}, + 'Ser': {'UCU', 'UCC', 'UCA', 'UCG'}, + 'Thr': {'ACU', 'ACC', 'ACA', 'ACG'}, + 'Tyr': {'UAU', 'UAC'}, + 'Trp': {'UGG'}, + 'Val': {'GUU', 'GUC', 'GUA', 'GUG'}, + } + + if seqs[::3] in PROTEIN_TO_RNA_COMBINATION.keys(): + if len() + + + + + pass From 231efcc88a179e8baba9fb86b9fd76591bb077f2 Mon Sep 17 00:00:00 2001 From: Daria Date: Thu, 28 Sep 2023 21:49:50 +0300 Subject: [PATCH 05/30] add cycles converting proteins to RNA in from_proteins_seqs_to_rna function --- HW4_Sapozhnikov/for_test.py | 6 ++++++ HW4_Sapozhnikov/prototool.py | 30 ++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 8 deletions(-) create mode 100644 HW4_Sapozhnikov/for_test.py diff --git a/HW4_Sapozhnikov/for_test.py b/HW4_Sapozhnikov/for_test.py new file mode 100644 index 0000000..53e75a5 --- /dev/null +++ b/HW4_Sapozhnikov/for_test.py @@ -0,0 +1,6 @@ +my_input = 'vla', 'ValTrpPhe', 'phe', 'vla', 'ValTrpPhe', 'phe' + +for aminoacids in my_input: + devided = [aminoacids[i:i + 3] for i in range(0, len(aminoacids), 3)] + print(devided) + diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 178b336..78ddebe 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -16,7 +16,7 @@ def main(): pass -def from_proteins_seqs_to_rna(*seqs, tool='RNA'): +def from_proteins_seqs_to_rna(*seqs): PROTEIN_TO_RNA_COMBINATION = { 'Ala': {'GCU', 'GCC', 'GCA', 'GCG'}, 
'Arg': {'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'}, @@ -40,10 +40,24 @@ def from_proteins_seqs_to_rna(*seqs, tool='RNA'): 'Val': {'GUU', 'GUC', 'GUA', 'GUG'}, } - if seqs[::3] in PROTEIN_TO_RNA_COMBINATION.keys(): - if len() - - - - - pass + for aminoacids in seqs: + found_sets = [] + divided_acids = [aminoacids[i:i + 3] for i in range(0, len(aminoacids), 3)] + for divided_acid in divided_acids: + if divided_acid in PROTEIN_TO_RNA_COMBINATION.keys(): + found_sets.append([]) + for comb in PROTEIN_TO_RNA_COMBINATION[divided_acid]: + found_sets[-1].append(comb) + + for i in range(0, len(found_sets)): + for j in range(0, len(found_sets[i])): + combination = found_sets[i][j] + if len(found_sets) > 1: + for k in range(0, len(found_sets)): + if k != i: + for m in range(0, len(found_sets[k])): + combination += ' ' + found_sets[k][m] + + print(combination) + +from_proteins_seqs_to_rna('ValTyrMet') \ No newline at end of file From 09d4711b1bc56ce45fdee0023e0d58203ded58ae Mon Sep 17 00:00:00 2001 From: Daria Date: Fri, 29 Sep 2023 23:21:47 +0300 Subject: [PATCH 06/30] add from_proteins_seqs_to_rna and isoelectric_point_determination functions --- HW4_Sapozhnikov/for_test.py | 6 --- HW4_Sapozhnikov/prototool.py | 87 +++++++++++++++++++++++++++--------- 2 files changed, 67 insertions(+), 26 deletions(-) delete mode 100644 HW4_Sapozhnikov/for_test.py diff --git a/HW4_Sapozhnikov/for_test.py b/HW4_Sapozhnikov/for_test.py deleted file mode 100644 index 53e75a5..0000000 --- a/HW4_Sapozhnikov/for_test.py +++ /dev/null @@ -1,6 +0,0 @@ -my_input = 'vla', 'ValTrpPhe', 'phe', 'vla', 'ValTrpPhe', 'phe' - -for aminoacids in my_input: - devided = [aminoacids[i:i + 3] for i in range(0, len(aminoacids), 3)] - print(devided) - diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 78ddebe..42bafdd 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -10,13 +10,19 @@ def main(): 1. 
Local alignment Enter two protein sequences in 1- letter encoding. The code will return alignment scores and sequences aligned on each other. - 2. ... + 2. Call method """ pass -def from_proteins_seqs_to_rna(*seqs): +def from_proteins_seqs_to_rna(*seqs: str) -> dict: + """ + :param seqs: strings with type 'ValTyrAla','AsnAspCys'. seqs is args parameter, so + you can pass more than one sequences at the time. + :return: dictionary, when [key] is your input protein sequences + and values are combinations of RNA codones, which encode proteins + """ PROTEIN_TO_RNA_COMBINATION = { 'Ala': {'GCU', 'GCC', 'GCA', 'GCG'}, 'Arg': {'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'}, @@ -39,25 +45,66 @@ def from_proteins_seqs_to_rna(*seqs): 'Trp': {'UGG'}, 'Val': {'GUU', 'GUC', 'GUA', 'GUG'}, } - + answer_dict = {} for aminoacids in seqs: - found_sets = [] + rna_combination = '' divided_acids = [aminoacids[i:i + 3] for i in range(0, len(aminoacids), 3)] for divided_acid in divided_acids: if divided_acid in PROTEIN_TO_RNA_COMBINATION.keys(): - found_sets.append([]) - for comb in PROTEIN_TO_RNA_COMBINATION[divided_acid]: - found_sets[-1].append(comb) - - for i in range(0, len(found_sets)): - for j in range(0, len(found_sets[i])): - combination = found_sets[i][j] - if len(found_sets) > 1: - for k in range(0, len(found_sets)): - if k != i: - for m in range(0, len(found_sets[k])): - combination += ' ' + found_sets[k][m] - - print(combination) - -from_proteins_seqs_to_rna('ValTyrMet') \ No newline at end of file + rna_combination += next(iter(PROTEIN_TO_RNA_COMBINATION[divided_acid])) + answer_dict[aminoacids] = rna_combination + return answer_dict + +def isoelectric_point_determination(*seqs: str) -> dict: + """ + :param seqs: strings with type 'ValTyrAla','AsnAspCys'. seqs is args parameter, so + you can pass more than one sequences at the time. 
+ :return: dictionary, when [key] is your input protein sequence and value is an isoelectric point + of your input proteins + """ + PKA_AMINOACIDS = { + 'Ala': [2.34, 9.69], + 'Arg': [2.17, 9.04, 12.68], + 'Asn': [1.88, 9.60, 3.65], + 'Asp': [1.88, 9.60, 3.65], + 'Cys': [1.96, 10.28, 8.18], + 'Glu': [2.19, 9.67, 4.25], + 'Gln': [2.17, 9.13], + 'Gly': [2.34, 9.60], + 'His': [1.82, 9.17], + 'Ile': [2.36, 9.68], + 'Leu': [2.36, 9.60], + 'Lys': [2.18, 8.95, 10.53], + 'Met': [2.28, 9.21], + 'Phe': [2.20, 9.13], + 'Pro': [1.99, 10.96], + 'Ser': [2.21, 9.15], + 'Thr': [2.11, 9.62], + 'Tyr': [2.20, 9.11, 10.07], + 'Trp': [2.38, 9.39], + 'Val': [2.32, 9.62], + } + + answer_dictionary = {} + + for aminoacids in seqs: + divided_acids = [aminoacids[i:i + 3] for i in range(0, len(aminoacids), 3)] + for divided_acid in divided_acids: + if not divided_acid in PKA_AMINOACIDS.keys(): + raise ValueError('Non-protein aminoacids in sequence') + + isoelectric_point_mean = 0 + count_groups = 0 + for acid_index in range(0, len(divided_acids)): + if acid_index == 0: + isoelectric_point_mean += PKA_AMINOACIDS[divided_acids[acid_index]][0] + count_groups += 1 + elif acid_index == len(divided_acids) - 1: + isoelectric_point_mean = isoelectric_point_mean + PKA_AMINOACIDS[divided_acids[acid_index]][-1] + count_groups += 1 + else: + if len(PKA_AMINOACIDS[divided_acids[acid_index]]) > 2: + isoelectric_point_mean = isoelectric_point_mean + PKA_AMINOACIDS[divided_acids[acid_index]][1] + count_groups += 1 + answer_dictionary[aminoacids] = isoelectric_point_mean / count_groups + return answer_dictionary From 1e98426389717a2b368efc9fcf6ceddd77c446a2 Mon Sep 17 00:00:00 2001 From: Daria Date: Fri, 29 Sep 2023 23:55:35 +0300 Subject: [PATCH 07/30] add gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1c2d52b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/* From 
d6f1bfdba00bba4609264374fed5cbd6e9e87b80 Mon Sep 17 00:00:00 2001 From: Daria Date: Fri, 29 Sep 2023 23:57:56 +0300 Subject: [PATCH 08/30] remove excess files --- .idea/.gitignore | 3 --- .idea/HW4_Sapozhnikov.iml | 8 -------- .idea/inspectionProfiles/profiles_settings.xml | 6 ------ .idea/misc.xml | 4 ---- .idea/modules.xml | 8 -------- .idea/vcs.xml | 6 ------ 6 files changed, 35 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/HW4_Sapozhnikov.iml delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/.idea/HW4_Sapozhnikov.iml b/.idea/HW4_Sapozhnikov.iml deleted file mode 100644 index d0876a7..0000000 --- a/.idea/HW4_Sapozhnikov.iml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index a971a2c..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index acbf176..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 35eb1dd..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From f5e4308b0683e0121475c695f9aad59bde7c614f Mon Sep 17 00:00:00 2001 From: Nikita 
<81642791+NSapozhnikov@users.noreply.github.com> Date: Fri, 29 Sep 2023 21:36:40 +0000 Subject: [PATCH 09/30] Minor code revision --- HW4_Sapozhnikov/prototool.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 42bafdd..00d0d6d 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -20,8 +20,8 @@ def from_proteins_seqs_to_rna(*seqs: str) -> dict: """ :param seqs: strings with type 'ValTyrAla','AsnAspCys'. seqs is args parameter, so you can pass more than one sequences at the time. - :return: dictionary, when [key] is your input protein sequences - and values are combinations of RNA codones, which encode proteins + :return: dictionary, where [key] is your input protein sequences + and values are combinations of RNA codones, which encode this protein """ PROTEIN_TO_RNA_COMBINATION = { 'Ala': {'GCU', 'GCC', 'GCA', 'GCG'}, @@ -45,15 +45,15 @@ def from_proteins_seqs_to_rna(*seqs: str) -> dict: 'Trp': {'UGG'}, 'Val': {'GUU', 'GUC', 'GUA', 'GUG'}, } - answer_dict = {} + answer_dictionary = {} for aminoacids in seqs: rna_combination = '' divided_acids = [aminoacids[i:i + 3] for i in range(0, len(aminoacids), 3)] for divided_acid in divided_acids: if divided_acid in PROTEIN_TO_RNA_COMBINATION.keys(): rna_combination += next(iter(PROTEIN_TO_RNA_COMBINATION[divided_acid])) - answer_dict[aminoacids] = rna_combination - return answer_dict + answer_dictionary[aminoacids] = rna_combination + return answer_dictionary def isoelectric_point_determination(*seqs: str) -> dict: """ From c318a3ac20a4fd592fb67d9f95424a13d192c1f4 Mon Sep 17 00:00:00 2001 From: Nikita Date: Sat, 30 Sep 2023 14:31:35 +0300 Subject: [PATCH 10/30] Add recode() function --- HW4_Sapozhnikov/prototool.py | 191 +++++++++++++++++++++++------------ 1 file changed, 129 insertions(+), 62 deletions(-) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 4bb7547..1a74584 
100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -1,6 +1,57 @@ +""" +This is a prototool. +""" + from typing import List, Optional, Tuple, Union +def recode(*seq: Union[List[str], str]) -> dict: + """ + Translate 1-letter to 3-letter encoding if 1-letter + encoded sequence is given and vice versa. + + Args: + - seq - sequence or list of sequences to recode + + Returns: + - function_result - a dictionary containing recoded sequences as values + for original sequences keys + """ + + to_1_dictionary = { + 'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', + 'Cys': 'C', 'Gln': 'Q', 'Glu': 'E', 'Gly': 'G', + 'His': 'H', 'Ile': 'I', 'Leu': 'L', 'Lys': 'K', + 'Met': 'M', 'Phe': 'F', 'Pro': 'P', 'Ser': 'S', + 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V' +} + + to_3_dictionary = {v: k for k, v in to_1_dictionary.items()} + + function_result = {} + + for sequence in seq: + # Check if the input sequence is in 1-letter or 3-letter format + is_one_letter = all(aa.isalpha() and aa.isupper() for aa in sequence) + + if is_one_letter: + # Translate 1-letter to 3-letter coded sequence + three_letter_sequence = "" + for aa in sequence: + three_letter_code = to_3_dictionary.get(aa, aa) + three_letter_sequence += three_letter_code + function_result[sequence] = three_letter_sequence + else: + # Translate 3-letter to 1-letter coded sequence + one_letter_sequence = "" + for aa in range(0, len(sequence), 3): + amino_acid = sequence[aa:aa+3] + one_letter_sequence += to_1_dictionary.get(amino_acid, + amino_acid) + function_result[sequence] = one_letter_sequence + return function_result + + def prettify_alignment(aligned_seq_on: str, aligned_seq2: str) -> None: """ Prettifies alignment output by printing out two @@ -180,64 +231,11 @@ def check_input(*args: List[str]) -> Tuple[List[str], return seqs_list, method, seq_on -def main(*args: Tuple[Union[List[str], str], str]) -> dict: - """ - This function provides the access to the following methods: - 1. 
Local Alignment of two sequences - the last argument: 'local_alignment' - - needs at least 2 protein sequences 1-letter encoded. - When more than 2 sequences are passed, uses the first - entered sequence to align the rest on - - performs an alignment using Smith-Waterman algorithm - 2. ... - 3. ... - 4. ... - 5. ... - - Args: - *args - are supposed to be all sequences to process and the method - to process with. - The method is supposed to be the last argument. -======= - To get started choose one of the possible programms to run: - 1. Local alignment - Enter two protein sequences in 1- letter encoding. The code will return alignment scores and - sequences aligned on each other. - 2. Call method - - Returns: - function_result - result of a chosen function - """ - - seqs_list, method, seq_on = check_input(*args) - print(seqs_list, method, seq_on) - - match method: - - case 'local_alignment': - - alignment_dict: dict = {} - for seq_id, seq in enumerate(seqs_list): - function_result = local_alignment(seq_on=seq_on, - seq2=seq, - alignment_dict=alignment_dict, - seq_id=seq_id, - prettify=True) - - case '': - - pass - - case _: - - function_result = None - - return function_result - - def from_proteins_seqs_to_rna(*seqs: str) -> dict: """ - :param seqs: strings with type 'ValTyrAla','AsnAspCys'. seqs is args parameter, so - you can pass more than one sequences at the time. + :param seqs: strings with type 'ValTyrAla','AsnAspCys'. + seqs is args parameter, so you can pass more than one + sequences at the time. :return: dictionary, where [key] is your input protein sequences and values are combinations of RNA codones, which encode this protein """ @@ -276,10 +274,11 @@ def from_proteins_seqs_to_rna(*seqs: str) -> dict: def isoelectric_point_determination(*seqs: str) -> dict: """ - :param seqs: strings with type 'ValTyrAla','AsnAspCys'. seqs is args parameter, so - you can pass more than one sequences at the time. 
- :return: dictionary, when [key] is your input protein sequence and value is an isoelectric point - of your input proteins + :param seqs: strings with type 'ValTyrAla','AsnAspCys'. + seqs is args parameter, so you can pass more than one + sequences at a time. + :return: dictionary, where [key] is your input protein sequence and value + is an isoelectric point of your input proteins """ PKA_AMINOACIDS = { 'Ala': [2.34, 9.69], @@ -309,7 +308,7 @@ def isoelectric_point_determination(*seqs: str) -> dict: for aminoacids in seqs: divided_acids = [aminoacids[i:i + 3] for i in range(0, len(aminoacids), 3)] for divided_acid in divided_acids: - if not divided_acid in PKA_AMINOACIDS.keys(): + if divided_acid not in PKA_AMINOACIDS.keys(): raise ValueError('Non-protein aminoacids in sequence') isoelectric_point_mean = 0 @@ -326,4 +325,72 @@ def isoelectric_point_determination(*seqs: str) -> dict: isoelectric_point_mean = isoelectric_point_mean + PKA_AMINOACIDS[divided_acids[acid_index]][1] count_groups += 1 answer_dictionary[aminoacids] = isoelectric_point_mean / count_groups - return answer_dictionary \ No newline at end of file + return answer_dictionary + + +def main(*args: Tuple[Union[List[str], str], str]) -> dict: + """ + This function provides the access to the following methods: + + 1. Translate 1 letter to 3 letter encoding and vice versa - the last + argument: 'recode' + - needs at least 1 sequence 1- or 3- letter encoded. Can recive + more than 1 sequences + - returns a dictionary containing translations between 1- and 3- + letter codes + + 2. Local Alignment of two sequences - the last argument: 'local_alignment' + - needs at least 2 protein sequences 1-letter encoded. + When more than 2 sequences are passed, uses the first + entered sequence to align the rest on + - performs an alignment using Smith-Waterman algorithm + + 3. 
Find all possible RNA sequences for defined protein sequence - the + last argument: from_proteins_seqs_to_rna + - needs at least 1 protein sequence 3-letter encoded + - returns a dictionary, where key is your input protein sequences + and values are combinations of RNA codones, which encode this protein + + 4. Determinate isoelectric point - the last argument: + 'isoelectric_point_determination' + - needs an input containing at least 1 aminoacid. Can recive multiple + different protein sequences + - returns a dictionary, where key is your input protein sequence and + value is an isoelectric point of this protein + + 4. ... + 5. ... + + Args: + *args - are supposed to be all sequences to process and the method + to process with. + The method is supposed to be the last argument. + + Returns: + function_result - result of a chosen function + """ + + seqs_list, method, seq_on = check_input(*args) + print(seqs_list, method, seq_on) + + match method: + + case 'local_alignment': + + alignment_dict: dict = {} + for seq_id, seq in enumerate(seqs_list): + function_result = local_alignment(seq_on=seq_on, + seq2=seq, + alignment_dict=alignment_dict, + seq_id=seq_id, + prettify=True) + + case '': + + pass + + case _: + + function_result = None + + return function_result From 96e209da79167c13122aa33715976f8200d53bc2 Mon Sep 17 00:00:00 2001 From: Daria Date: Sat, 30 Sep 2023 17:28:36 +0300 Subject: [PATCH 11/30] add raise ValueError in from_proteins_seqs_to_rna function, add line breaks --- HW4_Sapozhnikov/prototool.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 4bb7547..5873d1a 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -270,6 +270,8 @@ def from_proteins_seqs_to_rna(*seqs: str) -> dict: for divided_acid in divided_acids: if divided_acid in PROTEIN_TO_RNA_COMBINATION.keys(): rna_combination += 
next(iter(PROTEIN_TO_RNA_COMBINATION[divided_acid])) + else: + raise ValueError('Non-protein aminoacids in sequence') answer_dictionary[aminoacids] = rna_combination return answer_dictionary @@ -316,14 +318,17 @@ def isoelectric_point_determination(*seqs: str) -> dict: count_groups = 0 for acid_index in range(0, len(divided_acids)): if acid_index == 0: - isoelectric_point_mean += PKA_AMINOACIDS[divided_acids[acid_index]][0] + isoelectric_point_mean\ + += PKA_AMINOACIDS[divided_acids[acid_index]][0] count_groups += 1 elif acid_index == len(divided_acids) - 1: - isoelectric_point_mean = isoelectric_point_mean + PKA_AMINOACIDS[divided_acids[acid_index]][-1] + isoelectric_point_mean = (isoelectric_point_mean + + PKA_AMINOACIDS[divided_acids[acid_index]][-1]) count_groups += 1 else: if len(PKA_AMINOACIDS[divided_acids[acid_index]]) > 2: - isoelectric_point_mean = isoelectric_point_mean + PKA_AMINOACIDS[divided_acids[acid_index]][1] + isoelectric_point_mean = (isoelectric_point_mean + + PKA_AMINOACIDS[divided_acids[acid_index]][1]) count_groups += 1 answer_dictionary[aminoacids] = isoelectric_point_mean / count_groups - return answer_dictionary \ No newline at end of file + return answer_dictionary From 703249c228a75c50c689e43877fbf46caf01a5b5 Mon Sep 17 00:00:00 2001 From: Alina Date: Sat, 30 Sep 2023 18:49:54 +0300 Subject: [PATCH 12/30] Add back_transcribe function --- HW4_Sapozhnikov/prototool.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 5873d1a..9b603d2 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -332,3 +332,21 @@ def isoelectric_point_determination(*seqs: str) -> dict: count_groups += 1 answer_dictionary[aminoacids] = isoelectric_point_mean / count_groups return answer_dictionary + +TRANSCRIBE_DICT = dict(A='A', U='T', G='G', C='C', a='a', u='t', g='g', c='c') + +def back_transcribe(*seqs: str) -> dict: + """ + :param seqs: Seqs is an 
argument of the function. It is a string without whitespace. + You can put as many arguments as you wish. + :return: THis function returns a dictonary, which [key] is inputed protein + sequence and values are DNA codons + """ + result = {} + for seq in seqs: + rna = list((from_proteins_seqs_to_rna(seq)).get(seq)) + for i in range(len(rna)): + if rna[i] in TRANSCRIBE_DICT.keys(): + rna[i] = TRANSCRIBE_DICT[rna[i]] + result[seq] = "".join(rna) + return result From 1ba062bc51518af6b70f56add5d65d3097b4483d Mon Sep 17 00:00:00 2001 From: Alina Date: Sat, 30 Sep 2023 18:57:49 +0300 Subject: [PATCH 13/30] Add gc_content function --- HW4_Sapozhnikov/prototool.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 9b603d2..65b20ec 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -350,3 +350,16 @@ def back_transcribe(*seqs: str) -> dict: rna[i] = TRANSCRIBE_DICT[rna[i]] result[seq] = "".join(rna) return result + +def count_gc_content(*seqs: str) -> dict: + ''' + :param seqs: Seqs is an argument of the function. It is a string without whitespace. + You can put as many arguments as you wish. 
+ :return: THis function returns GC-content of DNA sequence, which encodes the protein + ''' + result = {} + for seq in seqs: + dna = list((back_transcribe(seq)).get(seq)) + gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) + result[seq] = gc_content + return result From b21741c33e1c1b080070711e4d52bde7c0cc8f26 Mon Sep 17 00:00:00 2001 From: Alina Date: Sat, 30 Sep 2023 19:03:57 +0300 Subject: [PATCH 14/30] Add count_protein_molecular_weigh function --- HW4_Sapozhnikov/prototool.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 65b20ec..f6045dd 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -363,3 +363,43 @@ def count_gc_content(*seqs: str) -> dict: gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) result[seq] = gc_content return result + +MOLECULAR_WEIGHTS = { + 'Ala': 89, + 'Cys': 121, + 'Asp': 133, + 'Glu': 147, + 'Phe': 165, + 'Gly': 75, + 'His': 155, + 'Ile': 131, + 'Lys': 146, + 'Leu': 131, + 'Met': 149, + 'Asn': 132, + 'Pro': 115, + 'Gln': 146, + 'Arg': 174, + 'Ser': 105, + 'Thr': 119, + 'Val': 117, + 'Trp': 204, + 'Tyr': 181} + +def count_protein_molecular_weight(*seqs: str) -> dict: + """ + :param seqs: Seqs is an argument of the function. It is a string without whitespace + (f.g. 'AlaSer'). You can put as many arguments as you wish. + :return: This function returns molecular weight of the protein. 
+ """ + result = {} + for seq in seqs: + protein_weight = 0 + aminoacids = [seq[i:i + 3] for i in range(0, len(seq), 3)] + for i in range(len(aminoacids)): + if aminoacids[i] in MOLECULAR_WEIGHTS.keys(): + aminoacid_weight = MOLECULAR_WEIGHTS[aminoacids[i]] + protein_weight += aminoacid_weight + result[seq] = protein_weight + return result + From 463dbf3c7e5a2c8b0500250d93e1d004d33cc508 Mon Sep 17 00:00:00 2001 From: Nikita Date: Sat, 30 Sep 2023 19:50:35 +0300 Subject: [PATCH 15/30] Add recode() function --- HW4_Sapozhnikov/prototool.py | 68 +++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 1a74584..6a364f9 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -5,7 +5,7 @@ from typing import List, Optional, Tuple, Union -def recode(*seq: Union[List[str], str]) -> dict: +def recode(seq: str) -> dict: """ Translate 1-letter to 3-letter encoding if 1-letter encoded sequence is given and vice versa. 
@@ -18,38 +18,33 @@ def recode(*seq: Union[List[str], str]) -> dict: for original sequences keys """ - to_1_dictionary = { + TO_1_dict = { 'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C', 'Gln': 'Q', 'Glu': 'E', 'Gly': 'G', 'His': 'H', 'Ile': 'I', 'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P', 'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V' -} - - to_3_dictionary = {v: k for k, v in to_1_dictionary.items()} + } - function_result = {} + TO_3_dict = {v: k for k, v in TO_1_dict.items()} - for sequence in seq: - # Check if the input sequence is in 1-letter or 3-letter format - is_one_letter = all(aa.isalpha() and aa.isupper() for aa in sequence) + # Check if the input sequence is in 1-letter or 3-letter format + is_one_letter = all(aa.isalpha() and aa.isupper() for aa in seq) - if is_one_letter: - # Translate 1-letter to 3-letter coded sequence - three_letter_sequence = "" - for aa in sequence: - three_letter_code = to_3_dictionary.get(aa, aa) - three_letter_sequence += three_letter_code - function_result[sequence] = three_letter_sequence - else: - # Translate 3-letter to 1-letter coded sequence - one_letter_sequence = "" - for aa in range(0, len(sequence), 3): - amino_acid = sequence[aa:aa+3] - one_letter_sequence += to_1_dictionary.get(amino_acid, - amino_acid) - function_result[sequence] = one_letter_sequence - return function_result + if is_one_letter: + # Translate 1-letter to 3-letter coded sequence + three_letter_sequence = "" + for aa in seq: + three_letter_code = TO_3_dict.get(aa, aa) + three_letter_sequence += three_letter_code + return three_letter_sequence + # Translate 3-letter to 1-letter coded sequence + one_letter_sequence = "" + for aa in range(0, len(seq), 3): + amino_acid = seq[aa:aa+3] + one_letter_sequence += TO_1_dict.get(amino_acid, + amino_acid) + return one_letter_sequence def prettify_alignment(aligned_seq_on: str, aligned_seq2: str) -> None: @@ -219,7 +214,11 @@ def check_input(*args: List[str]) -> 
Tuple[List[str], else: # Check the last element of the input is a valid method method = args[-1] - if method not in ['local_alignment', '', '', '', '']: + if method not in ['recode', + 'local_alignment', + 'from_proteins_seqs_to_rna', + 'isoelectric_point_determination', + '']: raise ValueError(method, " is not a valid method.") else: # Form a list with sequences from the input @@ -264,14 +263,16 @@ def from_proteins_seqs_to_rna(*seqs: str) -> dict: answer_dictionary = {} for aminoacids in seqs: rna_combination = '' - divided_acids = [aminoacids[i:i + 3] for i in range(0, len(aminoacids), 3)] + divided_acids = [aminoacids[i:i + 3] for i in range(0, + len(aminoacids), + 3)] for divided_acid in divided_acids: if divided_acid in PROTEIN_TO_RNA_COMBINATION.keys(): rna_combination += next(iter(PROTEIN_TO_RNA_COMBINATION[divided_acid])) answer_dictionary[aminoacids] = rna_combination return answer_dictionary - + def isoelectric_point_determination(*seqs: str) -> dict: """ :param seqs: strings with type 'ValTyrAla','AsnAspCys'. 
@@ -371,12 +372,21 @@ def main(*args: Tuple[Union[List[str], str], str]) -> dict: """ seqs_list, method, seq_on = check_input(*args) - print(seqs_list, method, seq_on) + print(f'Your sequences are: {seqs_list}', + f'The method is: {method}', sep='\n') match method: + case 'recode': + + recode_dict: dict = {} + for seq in seqs_list: + recode_dict[seq] = recode(seq=seq) + return recode_dict + case 'local_alignment': + print('The sequence align on: ', seq_on) alignment_dict: dict = {} for seq_id, seq in enumerate(seqs_list): function_result = local_alignment(seq_on=seq_on, From a29692d281cc6ab1880d0782930ab86667454cb8 Mon Sep 17 00:00:00 2001 From: Alina Date: Sat, 30 Sep 2023 20:09:24 +0300 Subject: [PATCH 16/30] changed order of functions --- HW4_Sapozhnikov/prototool.py | 139 +++++++++++++++++------------------ 1 file changed, 69 insertions(+), 70 deletions(-) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index f6045dd..bbc4294 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -179,6 +179,75 @@ def check_input(*args: List[str]) -> Tuple[List[str], seq_on = None return seqs_list, method, seq_on +TRANSCRIBE_DICT = dict(A='A', U='T', G='G', C='C', a='a', u='t', g='g', c='c') + +def back_transcribe(*seqs: str) -> dict: + """ + :param seqs: Seqs is an argument of the function. It is a string without whitespace. + You can put as many arguments as you wish. + :return: THis function returns a dictonary, which [key] is inputed protein + sequence and values are DNA codons + """ + result = {} + for seq in seqs: + rna = list((from_proteins_seqs_to_rna(seq)).get(seq)) + for i in range(len(rna)): + if rna[i] in TRANSCRIBE_DICT.keys(): + rna[i] = TRANSCRIBE_DICT[rna[i]] + result[seq] = "".join(rna) + return result + +def count_gc_content(*seqs: str) -> dict: + ''' + :param seqs: Seqs is an argument of the function. It is a string without whitespace. + You can put as many arguments as you wish. 
+ :return: THis function returns GC-content of DNA sequence, which encodes the protein + ''' + result = {} + for seq in seqs: + dna = list((back_transcribe(seq)).get(seq)) + gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) + result[seq] = gc_content + return result + +MOLECULAR_WEIGHTS = { + 'Ala': 89, + 'Cys': 121, + 'Asp': 133, + 'Glu': 147, + 'Phe': 165, + 'Gly': 75, + 'His': 155, + 'Ile': 131, + 'Lys': 146, + 'Leu': 131, + 'Met': 149, + 'Asn': 132, + 'Pro': 115, + 'Gln': 146, + 'Arg': 174, + 'Ser': 105, + 'Thr': 119, + 'Val': 117, + 'Trp': 204, + 'Tyr': 181} + +def count_protein_molecular_weight(*seqs: str) -> dict: + """ + :param seqs: Seqs is an argument of the function. It is a string without whitespace + (f.g. 'AlaSer'). You can put as many arguments as you wish. + :return: This function returns molecular weight of the protein. + """ + result = {} + for seq in seqs: + protein_weight = 0 + aminoacids = [seq[i:i + 3] for i in range(0, len(seq), 3)] + for i in range(len(aminoacids)): + if aminoacids[i] in MOLECULAR_WEIGHTS.keys(): + aminoacid_weight = MOLECULAR_WEIGHTS[aminoacids[i]] + protein_weight += aminoacid_weight + result[seq] = protein_weight + return result def main(*args: Tuple[Union[List[str], str], str]) -> dict: """ @@ -332,74 +401,4 @@ def isoelectric_point_determination(*seqs: str) -> dict: count_groups += 1 answer_dictionary[aminoacids] = isoelectric_point_mean / count_groups return answer_dictionary - -TRANSCRIBE_DICT = dict(A='A', U='T', G='G', C='C', a='a', u='t', g='g', c='c') - -def back_transcribe(*seqs: str) -> dict: - """ - :param seqs: Seqs is an argument of the function. It is a string without whitespace. - You can put as many arguments as you wish. 
- :return: THis function returns a dictonary, which [key] is inputed protein - sequence and values are DNA codons - """ - result = {} - for seq in seqs: - rna = list((from_proteins_seqs_to_rna(seq)).get(seq)) - for i in range(len(rna)): - if rna[i] in TRANSCRIBE_DICT.keys(): - rna[i] = TRANSCRIBE_DICT[rna[i]] - result[seq] = "".join(rna) - return result - -def count_gc_content(*seqs: str) -> dict: - ''' - :param seqs: Seqs is an argument of the function. It is a string without whitespace. - You can put as many arguments as you wish. - :return: THis function returns GC-content of DNA sequence, which encodes the protein - ''' - result = {} - for seq in seqs: - dna = list((back_transcribe(seq)).get(seq)) - gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) - result[seq] = gc_content - return result - -MOLECULAR_WEIGHTS = { - 'Ala': 89, - 'Cys': 121, - 'Asp': 133, - 'Glu': 147, - 'Phe': 165, - 'Gly': 75, - 'His': 155, - 'Ile': 131, - 'Lys': 146, - 'Leu': 131, - 'Met': 149, - 'Asn': 132, - 'Pro': 115, - 'Gln': 146, - 'Arg': 174, - 'Ser': 105, - 'Thr': 119, - 'Val': 117, - 'Trp': 204, - 'Tyr': 181} - -def count_protein_molecular_weight(*seqs: str) -> dict: - """ - :param seqs: Seqs is an argument of the function. It is a string without whitespace - (f.g. 'AlaSer'). You can put as many arguments as you wish. - :return: This function returns molecular weight of the protein. 
- """ - result = {} - for seq in seqs: - protein_weight = 0 - aminoacids = [seq[i:i + 3] for i in range(0, len(seq), 3)] - for i in range(len(aminoacids)): - if aminoacids[i] in MOLECULAR_WEIGHTS.keys(): - aminoacid_weight = MOLECULAR_WEIGHTS[aminoacids[i]] - protein_weight += aminoacid_weight - result[seq] = protein_weight - return result From 6ce8cf8f601ba9291fe10cad2a1b720c9003b3df Mon Sep 17 00:00:00 2001 From: Alina Date: Sat, 30 Sep 2023 20:31:54 +0300 Subject: [PATCH 17/30] Changed order of functions --- HW4_Sapozhnikov/prototool.py | 62 ++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index bbc4294..3f42045 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -179,37 +179,6 @@ def check_input(*args: List[str]) -> Tuple[List[str], seq_on = None return seqs_list, method, seq_on -TRANSCRIBE_DICT = dict(A='A', U='T', G='G', C='C', a='a', u='t', g='g', c='c') - -def back_transcribe(*seqs: str) -> dict: - """ - :param seqs: Seqs is an argument of the function. It is a string without whitespace. - You can put as many arguments as you wish. - :return: THis function returns a dictonary, which [key] is inputed protein - sequence and values are DNA codons - """ - result = {} - for seq in seqs: - rna = list((from_proteins_seqs_to_rna(seq)).get(seq)) - for i in range(len(rna)): - if rna[i] in TRANSCRIBE_DICT.keys(): - rna[i] = TRANSCRIBE_DICT[rna[i]] - result[seq] = "".join(rna) - return result - -def count_gc_content(*seqs: str) -> dict: - ''' - :param seqs: Seqs is an argument of the function. It is a string without whitespace. - You can put as many arguments as you wish. 
- :return: THis function returns GC-content of DNA sequence, which encodes the protein - ''' - result = {} - for seq in seqs: - dna = list((back_transcribe(seq)).get(seq)) - gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) - result[seq] = gc_content - return result - MOLECULAR_WEIGHTS = { 'Ala': 89, 'Cys': 121, @@ -248,6 +217,37 @@ def count_protein_molecular_weight(*seqs: str) -> dict: protein_weight += aminoacid_weight result[seq] = protein_weight return result + +TRANSCRIBE_DICT = dict(A='A', U='T', G='G', C='C', a='a', u='t', g='g', c='c') + +def back_transcribe(*seqs: str) -> dict: + """ + :param seqs: Seqs is an argument of the function. It is a string without whitespace. + You can put as many arguments as you wish. + :return: THis function returns a dictonary, which [key] is inputed protein + sequence and values are DNA codons + """ + result = {} + for seq in seqs: + rna = list((from_proteins_seqs_to_rna(seq)).get(seq)) + for i in range(len(rna)): + if rna[i] in TRANSCRIBE_DICT.keys(): + rna[i] = TRANSCRIBE_DICT[rna[i]] + result[seq] = "".join(rna) + return result + +def count_gc_content(*seqs: str) -> dict: + ''' + :param seqs: Seqs is an argument of the function. It is a string without whitespace. + You can put as many arguments as you wish. 
+ :return: THis function returns GC-content of DNA sequence, which encodes the protein + ''' + result = {} + for seq in seqs: + dna = list((back_transcribe(seq)).get(seq)) + gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) + result[seq] = gc_content + return result def main(*args: Tuple[Union[List[str], str], str]) -> dict: """ From 5cc5a9be2b13edcdd5b83f210e0c9ff057f69e8c Mon Sep 17 00:00:00 2001 From: Alina Date: Sat, 30 Sep 2023 20:45:44 +0300 Subject: [PATCH 18/30] changed order of functions --- HW4_Sapozhnikov/prototool.py | 62 ++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 3f42045..8df43c5 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -178,6 +178,37 @@ def check_input(*args: List[str]) -> Tuple[List[str], return seqs_list, method, seq_on seq_on = None return seqs_list, method, seq_on + +TRANSCRIBE_DICT = dict(A='A', U='T', G='G', C='C', a='a', u='t', g='g', c='c') + +def back_transcribe(*seqs: str) -> dict: + """ + :param seqs: Seqs is an argument of the function. It is a string without whitespace. + You can put as many arguments as you wish. + :return: THis function returns a dictonary, which [key] is inputed protein + sequence and values are DNA codons + """ + result = {} + for seq in seqs: + rna = list((from_proteins_seqs_to_rna(seq)).get(seq)) + for i in range(len(rna)): + if rna[i] in TRANSCRIBE_DICT.keys(): + rna[i] = TRANSCRIBE_DICT[rna[i]] + result[seq] = "".join(rna) + return result + +def count_gc_content(*seqs: str) -> dict: + ''' + :param seqs: Seqs is an argument of the function. It is a string without whitespace. + You can put as many arguments as you wish. 
+ :return: THis function returns GC-content of DNA sequence, which encodes the protein + ''' + result = {} + for seq in seqs: + dna = list((back_transcribe(seq)).get(seq)) + gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) + result[seq] = gc_content + return result MOLECULAR_WEIGHTS = { 'Ala': 89, @@ -218,37 +249,6 @@ def count_protein_molecular_weight(*seqs: str) -> dict: result[seq] = protein_weight return result -TRANSCRIBE_DICT = dict(A='A', U='T', G='G', C='C', a='a', u='t', g='g', c='c') - -def back_transcribe(*seqs: str) -> dict: - """ - :param seqs: Seqs is an argument of the function. It is a string without whitespace. - You can put as many arguments as you wish. - :return: THis function returns a dictonary, which [key] is inputed protein - sequence and values are DNA codons - """ - result = {} - for seq in seqs: - rna = list((from_proteins_seqs_to_rna(seq)).get(seq)) - for i in range(len(rna)): - if rna[i] in TRANSCRIBE_DICT.keys(): - rna[i] = TRANSCRIBE_DICT[rna[i]] - result[seq] = "".join(rna) - return result - -def count_gc_content(*seqs: str) -> dict: - ''' - :param seqs: Seqs is an argument of the function. It is a string without whitespace. - You can put as many arguments as you wish. 
- :return: THis function returns GC-content of DNA sequence, which encodes the protein - ''' - result = {} - for seq in seqs: - dna = list((back_transcribe(seq)).get(seq)) - gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) - result[seq] = gc_content - return result - def main(*args: Tuple[Union[List[str], str], str]) -> dict: """ This function provides the access to the following methods: From be1abc52bb8049bfaae5b66ca37ef2398252e3dd Mon Sep 17 00:00:00 2001 From: Nikita Date: Sun, 1 Oct 2023 10:57:59 +0300 Subject: [PATCH 19/30] Major code review and merging all functions together --- HW4_Sapozhnikov/prototool.py | 546 +++++++++++++++++++---------------- 1 file changed, 290 insertions(+), 256 deletions(-) diff --git a/HW4_Sapozhnikov/prototool.py b/HW4_Sapozhnikov/prototool.py index 7fb8327..dd2d851 100644 --- a/HW4_Sapozhnikov/prototool.py +++ b/HW4_Sapozhnikov/prototool.py @@ -1,10 +1,121 @@ """ -This is a prototool. +This is a prototool. WE ARE SORRY!!! """ from typing import List, Optional, Tuple, Union +AMINOACIDS_DICT = { + 'Ala': {'TO_1': 'A', + 'PROTEIN_TO_RNA_COMBINATION': {'GCU', 'GCC', 'GCA', 'GCG'}, + 'PKA_AMINOACIDS': [2.34, 9.69], + 'MOLECULAR_WEIGHTS': 89}, + 'Arg': {'TO_1': 'R', + 'PROTEIN_TO_RNA_COMBINATION': {'CGU', 'CGC', 'CGA', 'CGG', 'AGA', + 'AGG'}, + 'PKA_AMINOACIDS': [2.17, 9.04, 12.68], + 'MOLECULAR_WEIGHTS': 174}, + 'Asn': {'TO_1': 'N', + 'PROTEIN_TO_RNA_COMBINATION': {'AAU', 'AAC'}, + 'PKA_AMINOACIDS': [1.88, 9.60, 3.65], + 'MOLECULAR_WEIGHTS': 132}, + 'Asp': {'TO_1': 'D', + 'PROTEIN_TO_RNA_COMBINATION': {'GAU', 'GAC'}, + 'PKA_AMINOACIDS': [1.88, 9.60, 3.65], + 'MOLECULAR_WEIGHTS': 133}, + 'Cys': {'TO_1': 'C', + 'PROTEIN_TO_RNA_COMBINATION': {'UGU', 'UGC'}, + 'PKA_AMINOACIDS': [1.96, 10.28, 8.18], + 'MOLECULAR_WEIGHTS': 121}, + 'Glu': {'TO_1': 'Q', + 'PROTEIN_TO_RNA_COMBINATION': {'GAA', 'GAG'}, + 'PKA_AMINOACIDS': [2.19, 9.67, 4.25], + 'MOLECULAR_WEIGHTS': 147}, + 'Gln': {'TO_1': 'E', + 'PROTEIN_TO_RNA_COMBINATION': 
{'CAA', 'CAG'}, + 'PKA_AMINOACIDS': [2.17, 9.13], + 'MOLECULAR_WEIGHTS': 146}, + 'Gly': {'TO_1': 'G', + 'PROTEIN_TO_RNA_COMBINATION': {'GGU', 'GGC', 'GGA', 'GGG'}, + 'PKA_AMINOACIDS': [2.34, 9.60], + 'MOLECULAR_WEIGHTS': 75}, + 'His': {'TO_1': 'E', + 'PROTEIN_TO_RNA_COMBINATION': {'CAU', 'CAC'}, + 'PKA_AMINOACIDS': [1.82, 9.17], + 'MOLECULAR_WEIGHTS': 155}, + 'Ile': {'TO_1': 'I', + 'PROTEIN_TO_RNA_COMBINATION': {'AUU', 'AUC', 'AUA'}, + 'PKA_AMINOACIDS': [2.36, 9.68], + 'MOLECULAR_WEIGHTS': 131}, + 'Leu': {'TO_1': 'L', + 'PROTEIN_TO_RNA_COMBINATION': {'CUU', 'CUC', 'CUA', 'CUG'}, + 'PKA_AMINOACIDS': [2.36, 9.60], + 'MOLECULAR_WEIGHTS': 131}, + 'Lys': {'TO_1': 'K', + 'PROTEIN_TO_RNA_COMBINATION': {'AAA', 'AAG'}, + 'PKA_AMINOACIDS': [2.18, 8.95, 10.53], + 'MOLECULAR_WEIGHTS': 146}, + 'Met': {'TO_1': 'M', + 'PROTEIN_TO_RNA_COMBINATION': {'AUG'}, + 'PKA_AMINOACIDS': [2.28, 9.21], + 'MOLECULAR_WEIGHTS': 149}, + 'Phe': {'TO_1': 'F', + 'PROTEIN_TO_RNA_COMBINATION': {'UUU', 'UUC'}, + 'PKA_AMINOACIDS': [2.20, 9.13], + 'MOLECULAR_WEIGHTS': 165}, + 'Pro': {'TO_1': 'P', + 'PROTEIN_TO_RNA_COMBINATION': {'CCU', 'CCC', 'CCA', 'CCG'}, + 'PKA_AMINOACIDS': [1.99, 10.96], + 'MOLECULAR_WEIGHTS': 115}, + 'Ser': {'TO_1': 'S', + 'PROTEIN_TO_RNA_COMBINATION': {'UCU', 'UCC', 'UCA', 'UCG'}, + 'PKA_AMINOACIDS': [2.21, 9.15], + 'MOLECULAR_WEIGHTS': 105}, + 'Thr': {'TO_1': 'T', + 'PROTEIN_TO_RNA_COMBINATION': {'ACU', 'ACC', 'ACA', 'ACG'}, + 'PKA_AMINOACIDS': [2.11, 9.62], + 'MOLECULAR_WEIGHTS': 119}, + 'Tyr': {'TO_1': 'W', + 'PROTEIN_TO_RNA_COMBINATION': {'UAU', 'UAC'}, + 'PKA_AMINOACIDS': [2.20, 9.11, 10.07], + 'MOLECULAR_WEIGHTS': 181}, + 'Trp': {'TO_1': 'Y', + 'PROTEIN_TO_RNA_COMBINATION': {'UGG'}, + 'PKA_AMINOACIDS': [2.38, 9.39], + 'MOLECULAR_WEIGHTS': 204}, + 'Val': {'TO_1': 'V', + 'PROTEIN_TO_RNA_COMBINATION': {'GUU', 'GUC', 'GUA', 'GUG'}, + 'PKA_AMINOACIDS': [2.32, 9.62], + 'MOLECULAR_WEIGHTS': 117}, +} + +# A dictionary where keys are 1-letter and values are 3-letters codes +TO_3_DICT 
= {nested_dict['TO_1']: key for key, + nested_dict in AMINOACIDS_DICT.items()} + +TRANSCRIBE_DICT: dict = {'A': 'A', + 'U': 'T', + 'G': 'G', + 'C': 'C', + 'a': 'a', + 'u': 't', + 'g': 'g', + 'c': 'c'} + + +def is_one_letter(seq: str) -> bool: + """ + Defines whether the sequence is 1 coded. + + Args: + - seq - sequence to check + + Returns: + - bool + """ + return all(aa.isalpha() and aa.isupper() for aa in seq) + + def recode(seq: str) -> dict: """ Translate 1-letter to 3-letter encoding if 1-letter @@ -18,32 +129,18 @@ def recode(seq: str) -> dict: for original sequences keys """ - TO_1_dict = { - 'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', - 'Cys': 'C', 'Gln': 'Q', 'Glu': 'E', 'Gly': 'G', - 'His': 'H', 'Ile': 'I', 'Leu': 'L', 'Lys': 'K', - 'Met': 'M', 'Phe': 'F', 'Pro': 'P', 'Ser': 'S', - 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V' - } - - TO_3_dict = {v: k for k, v in TO_1_dict.items()} - - # Check if the input sequence is in 1-letter or 3-letter format - is_one_letter = all(aa.isalpha() and aa.isupper() for aa in seq) - - if is_one_letter: + if is_one_letter(seq): # Translate 1-letter to 3-letter coded sequence three_letter_sequence = "" for aa in seq: - three_letter_code = TO_3_dict.get(aa, aa) + three_letter_code = TO_3_DICT.get(aa, aa) three_letter_sequence += three_letter_code return three_letter_sequence # Translate 3-letter to 1-letter coded sequence one_letter_sequence = "" for aa in range(0, len(seq), 3): amino_acid = seq[aa:aa+3] - one_letter_sequence += TO_1_dict.get(amino_acid, - amino_acid) + one_letter_sequence += AMINOACIDS_DICT[amino_acid]['TO_1'] return one_letter_sequence @@ -71,7 +168,7 @@ def prettify_alignment(aligned_seq_on: str, aligned_seq2: str) -> None: def local_alignment(seq_on: str, - seq2: Union[List[str], str], + seq2: str, alignment_dict: dict, seq_id: int, match=2, @@ -191,269 +288,178 @@ def local_alignment(seq_on: str, return alignment_dict -def check_input(*args: List[str]) -> Tuple[List[str], - str, - Optional[str]]: 
- """ - Function to check the validity of the input. - - Args: - *args - are supposed to be all sequences to process and the method to - process with. - The method is supposed to be the last argument. - - Returns: - - seqs_list - list of sequences - - method - a chosen method to use - - seq_on (optional) - in case of local_alignment method - """ - - if len(args) < 1: - # Handle the case where there are no arguments - raise ValueError("No input defined.") - else: - # Check the last element of the input is a valid method - method = args[-1] - if method not in ['recode', - 'local_alignment', - 'from_proteins_seqs_to_rna', - 'isoelectric_point_determination', - '']: - raise ValueError(method, " is not a valid method.") - else: - # Form a list with sequences from the input - seqs_list = list(args[:-1]) - if method == 'local_alignment': - seq_on = seqs_list.pop(0) - return seqs_list, method, seq_on - seq_on = None - return seqs_list, method, seq_on - -TRANSCRIBE_DICT = dict(A='A', U='T', G='G', C='C', a='a', u='t', g='g', c='c') -def back_transcribe(*seqs: str) -> dict: - """ - :param seqs: Seqs is an argument of the function. It is a string without whitespace. - You can put as many arguments as you wish. - :return: THis function returns a dictonary, which [key] is inputed protein - sequence and values are DNA codons - """ - result = {} - for seq in seqs: - rna = list((from_proteins_seqs_to_rna(seq)).get(seq)) - for i in range(len(rna)): - if rna[i] in TRANSCRIBE_DICT.keys(): - rna[i] = TRANSCRIBE_DICT[rna[i]] - result[seq] = "".join(rna) - return result - -def count_gc_content(*seqs: str) -> dict: - ''' - :param seqs: Seqs is an argument of the function. It is a string without whitespace. - You can put as many arguments as you wish. 
- :return: THis function returns GC-content of DNA sequence, which encodes the protein - ''' - result = {} - for seq in seqs: - dna = list((back_transcribe(seq)).get(seq)) - gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) - result[seq] = gc_content - return result - -MOLECULAR_WEIGHTS = { - 'Ala': 89, - 'Cys': 121, - 'Asp': 133, - 'Glu': 147, - 'Phe': 165, - 'Gly': 75, - 'His': 155, - 'Ile': 131, - 'Lys': 146, - 'Leu': 131, - 'Met': 149, - 'Asn': 132, - 'Pro': 115, - 'Gln': 146, - 'Arg': 174, - 'Ser': 105, - 'Thr': 119, - 'Val': 117, - 'Trp': 204, - 'Tyr': 181} - -def count_protein_molecular_weight(*seqs: str) -> dict: +def count_protein_molecular_weight(*seqs_list: Union[List[str], str]) -> dict: """ - :param seqs: Seqs is an argument of the function. It is a string without whitespace - (f.g. 'AlaSer'). You can put as many arguments as you wish. + :param seqs_list: seqs_list is a list of strings without whitespace + (e.g. 'AlaSer'). You can put as many sequences as you wish. :return: This function returns molecular weight of the protein. """ result = {} - for seq in seqs: + for seq in seqs_list: protein_weight = 0 aminoacids = [seq[i:i + 3] for i in range(0, len(seq), 3)] - for i in range(len(aminoacids)): - if aminoacids[i] in MOLECULAR_WEIGHTS.keys(): - aminoacid_weight = MOLECULAR_WEIGHTS[aminoacids[i]] + for i, aminoacid in enumerate(aminoacids): + if aminoacid in AMINOACIDS_DICT.keys(): + aminoacid_weight = (AMINOACIDS_DICT[aminoacid] + ['MOLECULAR_WEIGHTS']) protein_weight += aminoacid_weight result[seq] = protein_weight return result - -def main(*args: Tuple[Union[List[str], str], str]) -> dict: - """ - This function provides the access to the following methods: - 1. Local Alignment of two sequences - the last argument: 'local_alignment' - - needs at least 2 protein sequences 1-letter encoded. 
- When more than 2 sequences are passed, uses the first - entered sequence to align the rest on - - performs an alignment using Smith-Waterman algorithm - 2. ... - 3. ... - 4. ... - 5. ... - Args: - *args - are supposed to be all sequences to process and the method - to process with. - The method is supposed to be the last argument - To get started choose one of the possible programms to run: - 1. Local alignment - Enter two protein sequences in 1- letter encoding. The code will return alignment scores and - sequences aligned on each other. - 2. Call method - Returns: - function_result - result of a chosen function +def from_proteins_seqs_to_rna(*seqs_list: Union[List[str], str]) -> dict: """ - - seqs_list, method, seq_on = check_input(*args) - print(seqs_list, method, seq_on) - - match method: - - case 'local_alignment': - - alignment_dict: dict = {} - for seq_id, seq in enumerate(seqs_list): - function_result = local_alignment(seq_on=seq_on, - seq2=seq, - alignment_dict=alignment_dict, - seq_id=seq_id, - prettify=True) - - case '': - - pass - - case _: - - function_result = None - - return function_result - -def from_proteins_seqs_to_rna(*seqs: str) -> dict: - """ - :param seqs: strings with type 'ValTyrAla','AsnAspCys'. - seqs is args parameter, so you can pass more than one - sequences at the time. + :param seqs_list: a list of strings with type 'ValTyrAla','AsnAspCys'. + You can pass more than one sequence at the time. 
:return: dictionary, where [key] is your input protein sequences and values are combinations of RNA codones, which encode this protein """ - PROTEIN_TO_RNA_COMBINATION = { - 'Ala': {'GCU', 'GCC', 'GCA', 'GCG'}, - 'Arg': {'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'}, - 'Asn': {'AAU', 'AAC'}, - 'Asp': {'GAU', 'GAC'}, - 'Cys': {'UGU', 'UGC'}, - 'Glu': {'GAA', 'GAG'}, - 'Gln': {'CAA', 'CAG'}, - 'Gly': {'GGU', 'GGC', 'GGA', 'GGG'}, - 'His': {'CAU', 'CAC'}, - 'Ile': {'AUU', 'AUC', 'AUA'}, - 'Leu': {'CUU', 'CUC', 'CUA', 'CUG'}, - 'Lys': {'AAA', 'AAG'}, - 'Met': {'AUG'}, - 'Phe': {'UUU', 'UUC'}, - 'Pro': {'CCU', 'CCC', 'CCA', 'CCG'}, - 'Ser': {'UCU', 'UCC', 'UCA', 'UCG'}, - 'Thr': {'ACU', 'ACC', 'ACA', 'ACG'}, - 'Tyr': {'UAU', 'UAC'}, - 'Trp': {'UGG'}, - 'Val': {'GUU', 'GUC', 'GUA', 'GUG'}, - } + answer_dictionary = {} - for aminoacids in seqs: + for seq in seqs_list: + rna_combination = '' - divided_acids = [aminoacids[i:i + 3] for i in range(0, - len(aminoacids), - 3)] + divided_acids = [seq[i:i + 3] for i in range(0, + len(seq), + 3)] for divided_acid in divided_acids: - if divided_acid in PROTEIN_TO_RNA_COMBINATION.keys(): - rna_combination += next(iter(PROTEIN_TO_RNA_COMBINATION[divided_acid])) + + if divided_acid in AMINOACIDS_DICT.keys(): + rna_combination += next(iter(AMINOACIDS_DICT[divided_acid] + ['PROTEIN_TO_RNA_COMBINATION'])) else: raise ValueError('Non-protein aminoacids in sequence') - answer_dictionary[aminoacids] = rna_combination + answer_dictionary[seq] = rna_combination return answer_dictionary -def isoelectric_point_determination(*seqs: str) -> dict: +def isoelectric_point_determination(*seqs_list: Union[List[str], str]) -> dict: """ - :param seqs: strings with type 'ValTyrAla','AsnAspCys'. - seqs is args parameter, so you can pass more than one - sequences at a time. + :param seqs_list: a list of strings with type 'ValTyrAla','AsnAspCys'. + You can pass more than one sequence at a time. 
:return: dictionary, where [key] is your input protein sequence and value is an isoelectric point of your input proteins """ - PKA_AMINOACIDS = { - 'Ala': [2.34, 9.69], - 'Arg': [2.17, 9.04, 12.68], - 'Asn': [1.88, 9.60, 3.65], - 'Asp': [1.88, 9.60, 3.65], - 'Cys': [1.96, 10.28, 8.18], - 'Glu': [2.19, 9.67, 4.25], - 'Gln': [2.17, 9.13], - 'Gly': [2.34, 9.60], - 'His': [1.82, 9.17], - 'Ile': [2.36, 9.68], - 'Leu': [2.36, 9.60], - 'Lys': [2.18, 8.95, 10.53], - 'Met': [2.28, 9.21], - 'Phe': [2.20, 9.13], - 'Pro': [1.99, 10.96], - 'Ser': [2.21, 9.15], - 'Thr': [2.11, 9.62], - 'Tyr': [2.20, 9.11, 10.07], - 'Trp': [2.38, 9.39], - 'Val': [2.32, 9.62], - } - answer_dictionary = {} - for aminoacids in seqs: - divided_acids = [aminoacids[i:i + 3] for i in range(0, len(aminoacids), 3)] + for aminoacids in seqs_list: + divided_acids = [aminoacids[i:i + 3] for i in range(0, + len(aminoacids), + 3)] for divided_acid in divided_acids: - if divided_acid not in PKA_AMINOACIDS.keys(): + if divided_acid not in AMINOACIDS_DICT.keys(): raise ValueError('Non-protein aminoacids in sequence') isoelectric_point_mean = 0 count_groups = 0 - for acid_index in range(0, len(divided_acids)): + for acid_index, aminoacid in enumerate(divided_acids): if acid_index == 0: isoelectric_point_mean\ - += PKA_AMINOACIDS[divided_acids[acid_index]][0] + += (AMINOACIDS_DICT[aminoacid]['PKA_AMINOACIDS'][0]) count_groups += 1 elif acid_index == len(divided_acids) - 1: isoelectric_point_mean = (isoelectric_point_mean - + PKA_AMINOACIDS[divided_acids[acid_index]][-1]) + + (AMINOACIDS_DICT[aminoacid] + ['PKA_AMINOACIDS'][-1])) count_groups += 1 else: - if len(PKA_AMINOACIDS[divided_acids[acid_index]]) > 2: + if len(AMINOACIDS_DICT[aminoacid]['PKA_AMINOACIDS']) > 2: isoelectric_point_mean = (isoelectric_point_mean - + PKA_AMINOACIDS[divided_acids[acid_index]][1]) + + (AMINOACIDS_DICT[aminoacid] + ['PKA_AMINOACIDS'][1])) count_groups += 1 answer_dictionary[aminoacids] = isoelectric_point_mean / count_groups return 
answer_dictionary -def main(*args: Tuple[Union[List[str], str], str]) -> dict: + +def back_transcribe(*seqs_list: Union[List[str], str]) -> dict: + """ + :param seqs_list: is a list of strings without whitespace. + You can put as many sequences as you wish. + :return: This function returns a dictonary where key is inputed protein + sequence and values are DNA codons + """ + result = {} + for seq in seqs_list: + rna = list((from_proteins_seqs_to_rna(seq)).get(seq)) + for i in range(len(rna)): + if rna[i] in TRANSCRIBE_DICT.keys(): + rna[i] = TRANSCRIBE_DICT[rna[i]] + result[seq] = "".join(rna) + return result + + +def count_gc_content(*seqs_list: Union[List[str], str]) -> dict: + """ + :param seqs_list: is a list of strings without whitespace. + You can put as many sequences as you wish. + :return: This function returns GC-content of DNA sequence, which encodes + the protein + """ + result = {} + for seq in seqs_list: + dna = list((back_transcribe(seq)).get(seq)) + gc_content = round(100 * (dna.count('G') + dna.count('C'))/len(dna)) + result[seq] = gc_content + return result + + +def check_input(*args: Union[List[str], str], method: str) -> \ + Tuple[List[str], Optional[str]]: + """ + Function to check the validity of the input. 
+ + Args: + - *args - are supposed to be all sequences to process + - method - the method to process with method + + Returns: + - seqs_list - list of sequences + - seq_on (optional) - in case of local_alignment method + """ + + if len(args) == 0: + # Handle the case where there are no arguments + raise ValueError('No input defined.') + else: + if method not in ['recode', + 'local_alignment', + 'from_proteins_seqs_to_rna', + 'isoelectric_point_determination', + 'count_protein_molecular_weight', + 'back_transcribe', + 'count_gc_content']: + raise ValueError(method, ' is not a valid method.') + else: + # Form a list with sequences from the input + seqs_list = list(args) + if method == 'local_alignment': + if len(seqs_list) < 2: + raise IndexError('Need at least two sequences to align.') + for i, seq in enumerate(seqs_list): + if not is_one_letter(seq): + print('Warning! Function local_alignment() needs ' + '1-letter encoded sequences. Your sequence ' + 'will be mutated to a 1-letter encoding.') + seqs_list[i] = recode(seq) + print(seq, ' sequence has been mutated into: ', + seqs_list[i]) + seq_on = seqs_list.pop(0) + return seqs_list, seq_on + for i, seq in enumerate(seqs_list): + if is_one_letter(seq): + print(f'Warning! Function {method}() needs ' + '3-letter encoded sequences. Your sequence ' + 'will be mutated to a 3-letter encoding.') + seqs_list[i] = recode(seq) + print(seq, ' sequence has been mutated into: ', + seqs_list[i]) + seq_on = None + return seqs_list, seq_on + + +def main(*args: Tuple[Union[List[str], str]], + method: Optional[str] = None) -> dict: """ This function provides the access to the following methods: @@ -470,8 +476,8 @@ def main(*args: Tuple[Union[List[str], str], str]) -> dict: entered sequence to align the rest on - performs an alignment using Smith-Waterman algorithm - 3. Find all possible RNA sequences for defined protein sequence - the - last argument: from_proteins_seqs_to_rna + 3. 
Find possible RNA sequences for defined protein sequence - the + last argument: 'from_proteins_seqs_to_rna' - needs at least 1 protein sequence 3-letter encoded - returns a dictionary, where key is your input protein sequences and values are combinations of RNA codones, which encode this protein @@ -483,19 +489,37 @@ def main(*args: Tuple[Union[List[str], str], str]) -> dict: - returns a dictionary, where key is your input protein sequence and value is an isoelectric point of this protein - 4. ... - 5. ... + 5. Calculate protein molecular weight - the last argument: + 'count_protein_molecular_weight' + - Seqs is an argument of the function. It is a string without + whitespace (e.g. 'AlaSer'). You can put as many arguments as you wish. + - returns a dictionary with protein sequences as keys and their + calculated molecular weight as corresponding values + + 6. Determine possible DNA sequence from protein sequence - the last + argument: 'back_transcribe' + - needs a string without whitespaces. You can put as many arguments as + you wish. + - returns a dictonary where keys are inputed protein sequences and + corresponding values are possible DNA codons + + 7. Calculate a GC ratio in a possible DNA sequence of a given aminoacid + sequence - the last argument 'count_gc_content' + - needs a string without whitespaces. You can put as many sequences + as you wish. + - returns a dictionary where keys are inputed aminoacid sequences and + GC-content of DNA sequence, which encodes the protein are + corresponding values Args: - *args - are supposed to be all sequences to process and the method - to process with. - The method is supposed to be the last argument. + - *args - are supposed to be all sequences to process + - method is a kwarg - the method to process with. 
Returns: - function_result - result of a chosen function + function_result - a dictionary with the result of a chosen function """ - seqs_list, method, seq_on = check_input(*args) + seqs_list, seq_on = check_input(*args, method=method) print(f'Your sequences are: {seqs_list}', f'The method is: {method}', sep='\n') @@ -510,21 +534,31 @@ def main(*args: Tuple[Union[List[str], str], str]) -> dict: case 'local_alignment': - print('The sequence align on: ', seq_on) alignment_dict: dict = {} for seq_id, seq in enumerate(seqs_list): - function_result = local_alignment(seq_on=seq_on, - seq2=seq, - alignment_dict=alignment_dict, - seq_id=seq_id, - prettify=True) + local_alignment(seq_on=seq_on, + seq2=seq, + alignment_dict=alignment_dict, + seq_id=seq_id, + prettify=True) + return alignment_dict + + case 'from_proteins_seqs_to_rna': + + return from_proteins_seqs_to_rna(*seqs_list) + + case 'count_protein_molecular_weight': + + return count_protein_molecular_weight(*seqs_list) + + case 'isoelectric_point_determination': - case '': + return isoelectric_point_determination(*seqs_list) - pass + case 'back_transcribe': - case _: + return back_transcribe(*seqs_list) - function_result = None + case 'count_gc_content': - return function_result + return count_gc_content(*seqs_list) From 05caf3c398e113823b6faa4eb3513d29d2330134 Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:13:14 +0300 Subject: [PATCH 20/30] Update README.md --- HW4_Sapozhnikov/README.md | 81 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 3 deletions(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index 8a4222d..886e4e5 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -2,18 +2,93 @@ > *This is the repo for the fourth homework of the BI Python 2023 course* ### Title +'prototool.py' is a special script for working with polyaminoacid sequences -### Overview +### Overview 
+'prototool.py' includes 7 methods to treatment of polyaminoacid sequences. +'prototool.py' can be used for the next goals: +- recoding 1-letter coded polyaminoacid seqeunces into 3-letter coded and vice versa; +- polyaminoacid sequences aligment with Smith-Waterman algorithm [^1]; +- finding possinle RNA sequences for given polyaminoacid sequences; +- determining polyaminoacid isoelectric point; +- calculating polyaminoacid molecular weight; +- finding possinle DNA sequences for given polyaminoacid sequences; +- determining GC-content of a corresponding DNA sequence to a given polyaminoacid sequence ### Usage +This tool can be used both standalone and as module. +- to use 'prototools' standalone you will have to add these lines in the code + ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/5fa3cf7f-e6f3-4294-9e81-b1ebe17c8514) + - where *args are sequences you want to process and method is a specified algorithm to use + - your result will be written in a variable (test on a picture) +- to use 'prototools' as module (recomended) you should import it as any other module (check the path: prototools.py should be in the same directory as your script). Then you can freely use any of its functions (see examples). ### Options +Arguments: +- '''*args[str]''' sequences to work with. You can pass several arguments into all functions +- method - a method to use +*** +output: All functions return a '''dictionary''' , where keys are original sequenses, values are results after using a corresponding method. ### Examples +*** +def recode allows to translate 1-letter to 3-letters polyaminoacids code +- '''main('AlaValTyr', 'DNT', method = 'recode')''' +- '''recode('AlaValTyr', 'DNT')''' +- ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/117befa5-feaa-433a-9ac9-23cffe9b024f) + +*** +def local_alignmen perform a local alignment of 2 given sequences. 
Needs at least two sequences to be passed +- '''main('MetAsnTrp', 'MNT', method='local_alignment')''' +- '''local_alignmen('MetAsnTrp', 'MNT')''' +- Note that local_alignment function has a flag prettify (default = True) that prints out aligned sequences on each another +- ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/4dd36d24-a177-4419-9053-a5e2923a980c) + +*** +def from_proteins_seqs_to_rna allows to decode polyaminoacid sequences in RNA sequences +- '''main('AlaValTyr', 'DNT', method = 'from_proteins_seqs_to_rna')''' +- '''from_proteins_seqs_to_rna('AlaValTyr', 'DNT')''' +- ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/9ee92d0d-68a4-471b-b65a-2fa6b46ab844) + +*** +def isoelectric_point_determination allows to determine isoelectric point of polyaminoacid sequences +- '''main('AlaValTyr', 'DNT', method = 'isoelectric_point_determination')''' +- '''isoelectric_point_determination('AlaValTyr', 'DNT')''' +- ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/24027a07-b20b-42d4-bb10-4ca7189038d4) + +*** +def back_transcribe allows to decode polyaminoacid sequences in DNA sequences +- '''main('AlaValTyr', 'DNT', method = 'back_transcribe')''' +- '''back_transcribe('AlaValTyr', 'DNT')''' +- ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/71f07616-a37d-48da-9e63-82b81836b9d7) + +*** +def count_gc_content allows to count the ratio of GC in the entire DNA sequence +- '''main('AlaValTyr', 'DNT', method = 'count_gc_content')''' +- '''count_gc_content('AlaValTyr', 'DNT')''' +- ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/d2705714-a3e8-4054-8998-61d922a4feb6) + +*** +def count_protein_molecular_weight allows to calculate the molecular weight of the polyaminoacid +- '''main('AlaValTyr', 'DNT', method = 'count_protein_molecular_weight')''' +- '''count_protein_molecular_weight('AlaValTyr', 'DNT')''' +- 
![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/cc1eff9a-1b39-4232-98e4-80f622101083) ### Troubleshooting +If you have '''ValueError("No input defined.")''' it means, that you have an empty input. Please, enter the correct input. +*** +If you have '''ValueError(method, " is not a valid method.")''' it means, that your tool is not correct. Please, enter the right tool. +*** +If you have '''ValueError('Non-protein aminoacids in sequence')''' it means, that your sequences contain non-protein aminoacids. Please, check your sequences and enter the correct input. + +### References +[^1]: T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. -### Contacts +### Contributions and contacts Feel free to report any bugs and problems encountered. -Email: nikita.sapozhnikov1@gmail.com \ No newline at end of file +Email: nikita.sapozhnikov1@gmail.com developed recode(), prettify_alignment(), local_alignmen(), check_input() +*** +nekrasovadasha22@mail.ru developed from_proteins_seqs_to_rna(), isoelectric_point_determination() +*** +alina.potyseva@gmail.com developed back_transcribe(), count_gc_content(), count_protein_molecular_weight() From 1bdbb2f8cb8ed92223b464fe2f5dff44dc22b029 Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:16:52 +0300 Subject: [PATCH 21/30] Update README.md --- HW4_Sapozhnikov/README.md | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index 886e4e5..8c4564f 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -4,6 +4,8 @@ ### Title 'prototool.py' is a special script for working with polyaminoacid sequences +*** + ### Overview 'prototool.py' includes 7 methods to treatment of polyaminoacid sequences. 
'prototool.py' can be used for the next goals: @@ -15,6 +17,8 @@ - finding possinle DNA sequences for given polyaminoacid sequences; - determining GC-content of a corresponding DNA sequence to a given polyaminoacid sequence +*** + ### Usage This tool can be used both standalone and as module. - to use 'prototools' standalone you will have to add these lines in the code @@ -23,57 +27,63 @@ This tool can be used both standalone and as module. - your result will be written in a variable (test on a picture) - to use 'prototools' as module (recomended) you should import it as any other module (check the path: prototools.py should be in the same directory as your script). Then you can freely use any of its functions (see examples). +*** + ### Options Arguments: - '''*args[str]''' sequences to work with. You can pass several arguments into all functions - method - a method to use + +output: All functions return a dict, where keys are original sequenses, values are results after using a corresponding method. + *** -output: All functions return a '''dictionary''' , where keys are original sequenses, values are results after using a corresponding method. ### Examples -*** + def recode allows to translate 1-letter to 3-letters polyaminoacids code - '''main('AlaValTyr', 'DNT', method = 'recode')''' - '''recode('AlaValTyr', 'DNT')''' - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/117befa5-feaa-433a-9ac9-23cffe9b024f) - *** + def local_alignmen perform a local alignment of 2 given sequences. 
Needs at least two sequences to be passed - '''main('MetAsnTrp', 'MNT', method='local_alignment')''' - '''local_alignmen('MetAsnTrp', 'MNT')''' - Note that local_alignment function has a flag prettify (default = True) that prints out aligned sequences on each another - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/4dd36d24-a177-4419-9053-a5e2923a980c) - *** + def from_proteins_seqs_to_rna allows to decode polyaminoacid sequences in RNA sequences - '''main('AlaValTyr', 'DNT', method = 'from_proteins_seqs_to_rna')''' - '''from_proteins_seqs_to_rna('AlaValTyr', 'DNT')''' - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/9ee92d0d-68a4-471b-b65a-2fa6b46ab844) - *** + def isoelectric_point_determination allows to determine isoelectric point of polyaminoacid sequences - '''main('AlaValTyr', 'DNT', method = 'isoelectric_point_determination')''' - '''isoelectric_point_determination('AlaValTyr', 'DNT')''' - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/24027a07-b20b-42d4-bb10-4ca7189038d4) - *** + def back_transcribe allows to decode polyaminoacid sequences in DNA sequences - '''main('AlaValTyr', 'DNT', method = 'back_transcribe')''' - '''back_transcribe('AlaValTyr', 'DNT')''' - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/71f07616-a37d-48da-9e63-82b81836b9d7) - *** + def count_gc_content allows to count the ratio of GC in the entire DNA sequence - '''main('AlaValTyr', 'DNT', method = 'count_gc_content')''' - '''count_gc_content('AlaValTyr', 'DNT')''' - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/d2705714-a3e8-4054-8998-61d922a4feb6) - *** + def count_protein_molecular_weight allows to calculate the molecular weight of the polyaminoacid - '''main('AlaValTyr', 'DNT', method = 'count_protein_molecular_weight')''' - '''count_protein_molecular_weight('AlaValTyr', 'DNT')''' - 
![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/cc1eff9a-1b39-4232-98e4-80f622101083) +*** + ### Troubleshooting If you have '''ValueError("No input defined.")''' it means, that you have an empty input. Please, enter the correct input. *** @@ -81,9 +91,14 @@ If you have '''ValueError(method, " is not a valid method.")''' it means, that y *** If you have '''ValueError('Non-protein aminoacids in sequence')''' it means, that your sequences contain non-protein aminoacids. Please, check your sequences and enter the correct input. +*** + ### References + [^1]: T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. +*** + ### Contributions and contacts Feel free to report any bugs and problems encountered. From 3d76bb5c66660aed840b581f1df6937dc12ecb8e Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:19:41 +0300 Subject: [PATCH 22/30] Update README.md --- HW4_Sapozhnikov/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index 8c4564f..ff2f0dc 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -10,7 +10,7 @@ 'prototool.py' includes 7 methods to treatment of polyaminoacid sequences. 'prototool.py' can be used for the next goals: - recoding 1-letter coded polyaminoacid seqeunces into 3-letter coded and vice versa; -- polyaminoacid sequences aligment with Smith-Waterman algorithm [^1]; +- polyaminoacid sequences aligment with Smith-Waterman algorithm [[1]](#ref1); - finding possinle RNA sequences for given polyaminoacid sequences; - determining polyaminoacid isoelectric point; - calculating polyaminoacid molecular weight; @@ -95,8 +95,9 @@ If you have '''ValueError('Non-protein aminoacids in sequence')''' it means, tha ### References -[^1]: T.F. Smith, M.S. Waterman, (1981). 
[Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. +1. T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. +[1]: #ref1 *** ### Contributions and contacts From f32641a5420a852ca42c147463217dc5a139202a Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:21:22 +0300 Subject: [PATCH 23/30] Update README.md --- HW4_Sapozhnikov/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index ff2f0dc..59bfc6d 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -10,7 +10,7 @@ 'prototool.py' includes 7 methods to treatment of polyaminoacid sequences. 'prototool.py' can be used for the next goals: - recoding 1-letter coded polyaminoacid seqeunces into 3-letter coded and vice versa; -- polyaminoacid sequences aligment with Smith-Waterman algorithm [[1]](#ref1); +- polyaminoacid sequences aligment with Smith-Waterman algorithm [^1]; - finding possinle RNA sequences for given polyaminoacid sequences; - determining polyaminoacid isoelectric point; - calculating polyaminoacid molecular weight; @@ -95,7 +95,7 @@ If you have '''ValueError('Non-protein aminoacids in sequence')''' it means, tha ### References -1. T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. +- T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. 
[1]: #ref1 *** From 9d6f68726d5ab1bb680c9b24fe57aea86290e69e Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:22:00 +0300 Subject: [PATCH 24/30] Update README.md --- HW4_Sapozhnikov/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index 59bfc6d..e84e6af 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -97,7 +97,6 @@ If you have '''ValueError('Non-protein aminoacids in sequence')''' it means, tha - T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. -[1]: #ref1 *** ### Contributions and contacts From 9563f394772767988e4c3e77a9fe8d1cc6caa59b Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:22:39 +0300 Subject: [PATCH 25/30] Update README.md --- HW4_Sapozhnikov/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index e84e6af..8c4564f 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -95,7 +95,7 @@ If you have '''ValueError('Non-protein aminoacids in sequence')''' it means, tha ### References -- T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. +[^1]: T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. 
*** From 597f21b47492b0e145c4423058cd5549d95db293 Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:23:03 +0300 Subject: [PATCH 26/30] Update README.md --- HW4_Sapozhnikov/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index 8c4564f..e23bff4 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -93,12 +93,6 @@ If you have '''ValueError('Non-protein aminoacids in sequence')''' it means, tha *** -### References - -[^1]: T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. - -*** - ### Contributions and contacts Feel free to report any bugs and problems encountered. @@ -107,3 +101,9 @@ Email: nikita.sapozhnikov1@gmail.com developed recode(), prettify_alignment(), l nekrasovadasha22@mail.ru developed from_proteins_seqs_to_rna(), isoelectric_point_determination() *** alina.potyseva@gmail.com developed back_transcribe(), count_gc_content(), count_protein_molecular_weight() + +*** + +### References + +[^1]: T.F. Smith, M.S. Waterman, (1981). [Identification of common molecular subsequences](https://doi.org/10.1016/0022-2836(81)90087-5). Journal of Molecular Biology. From f6a34ef821052d1c790911ac5b410b5062e88bfb Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:26:37 +0300 Subject: [PATCH 27/30] Update README.md --- HW4_Sapozhnikov/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index e23bff4..21ca04e 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -31,8 +31,8 @@ This tool can be used both standalone and as module. ### Options Arguments: -- '''*args[str]''' sequences to work with. 
You can pass several arguments into all functions -- method - a method to use +- '*args[str]' sequences to work with. You can pass several arguments into all functions +- 'method' - a method to use output: All functions return a dict, where keys are original sequenses, values are results after using a corresponding method. @@ -41,8 +41,8 @@ output: All functions return a dict, where keys are original sequenses, values a ### Examples def recode allows to translate 1-letter to 3-letters polyaminoacids code -- '''main('AlaValTyr', 'DNT', method = 'recode')''' -- '''recode('AlaValTyr', 'DNT')''' +- 'main('AlaValTyr', 'DNT', method = 'recode')' +- 'recode('AlaValTyr', 'DNT')' - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/117befa5-feaa-433a-9ac9-23cffe9b024f) *** From f6e10e8622b028e9c9793f3411b90920a1b43e22 Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:30:55 +0300 Subject: [PATCH 28/30] Update README.md --- HW4_Sapozhnikov/README.md | 48 +++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index 21ca04e..0c7f9d2 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -2,13 +2,13 @@ > *This is the repo for the fourth homework of the BI Python 2023 course* ### Title -'prototool.py' is a special script for working with polyaminoacid sequences +`prototool.py` is a special script for working with polyaminoacid sequences *** ### Overview -'prototool.py' includes 7 methods to treatment of polyaminoacid sequences. -'prototool.py' can be used for the next goals: +`prototool.py` includes 7 methods for the treatment of polyaminoacid sequences.
+`prototool.py` can be used for the following goals: - recoding 1-letter coded polyaminoacid seqeunces into 3-letter coded and vice versa; - polyaminoacid sequences aligment with Smith-Waterman algorithm [^1]; - finding possinle RNA sequences for given polyaminoacid sequences; @@ -21,18 +21,18 @@ ### Usage This tool can be used both standalone and as module. -- to use 'prototools' standalone you will have to add these lines in the code +- to use `prototool` standalone you will have to add these lines in the code ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/5fa3cf7f-e6f3-4294-9e81-b1ebe17c8514) - where *args are sequences you want to process and method is a specified algorithm to use - your result will be written in a variable (test on a picture) -- to use 'prototools' as module (recomended) you should import it as any other module (check the path: prototools.py should be in the same directory as your script). Then you can freely use any of its functions (see examples). +- to use `prototool` as a module (recommended) you should import it as any other module (check the path: prototool.py should be in the same directory as your script). Then you can freely use any of its functions (see examples). *** ### Options Arguments: -- '*args[str]' sequences to work with. You can pass several arguments into all functions -- 'method' - a method to use +- `*args[str]` sequences to work with. You can pass several arguments into all functions +- `method` - a method to use output: All functions return a dict, where keys are original sequenses, values are results after using a corresponding method.
@@ -41,55 +41,55 @@ output: All functions return a dict, where keys are original sequenses, values a ### Examples def recode allows to translate 1-letter to 3-letters polyaminoacids code -- 'main('AlaValTyr', 'DNT', method = 'recode')' -- 'recode('AlaValTyr', 'DNT')' +- `main('AlaValTyr', 'DNT', method = 'recode')` +- `recode('AlaValTyr', 'DNT')` - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/117befa5-feaa-433a-9ac9-23cffe9b024f) *** def local_alignmen perform a local alignment of 2 given sequences. Needs at least two sequences to be passed -- '''main('MetAsnTrp', 'MNT', method='local_alignment')''' -- '''local_alignmen('MetAsnTrp', 'MNT')''' +- `main('MetAsnTrp', 'MNT', method='local_alignment')` +- `local_alignment('MetAsnTrp', 'MNT')` - Note that local_alignment function has a flag prettify (default = True) that prints out aligned sequences on each another - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/4dd36d24-a177-4419-9053-a5e2923a980c) *** def from_proteins_seqs_to_rna allows to decode polyaminoacid sequences in RNA sequences -- '''main('AlaValTyr', 'DNT', method = 'from_proteins_seqs_to_rna')''' -- '''from_proteins_seqs_to_rna('AlaValTyr', 'DNT')''' +- `main('AlaValTyr', 'DNT', method = 'from_proteins_seqs_to_rna')` +- `from_proteins_seqs_to_rna('AlaValTyr', 'DNT')` - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/9ee92d0d-68a4-471b-b65a-2fa6b46ab844) *** def isoelectric_point_determination allows to determine isoelectric point of polyaminoacid sequences -- '''main('AlaValTyr', 'DNT', method = 'isoelectric_point_determination')''' -- '''isoelectric_point_determination('AlaValTyr', 'DNT')''' +- `main('AlaValTyr', 'DNT', method = 'isoelectric_point_determination')` +- `isoelectric_point_determination('AlaValTyr', 'DNT')` - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/24027a07-b20b-42d4-bb10-4ca7189038d4) *** def back_transcribe allows to decode 
polyaminoacid sequences in DNA sequences -- '''main('AlaValTyr', 'DNT', method = 'back_transcribe')''' -- '''back_transcribe('AlaValTyr', 'DNT')''' +- `main('AlaValTyr', 'DNT', method = 'back_transcribe')` +- `back_transcribe('AlaValTyr', 'DNT')` - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/71f07616-a37d-48da-9e63-82b81836b9d7) *** def count_gc_content allows to count the ratio of GC in the entire DNA sequence -- '''main('AlaValTyr', 'DNT', method = 'count_gc_content')''' -- '''count_gc_content('AlaValTyr', 'DNT')''' +- `main('AlaValTyr', 'DNT', method = 'count_gc_content')` +- `count_gc_content('AlaValTyr', 'DNT')` - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/d2705714-a3e8-4054-8998-61d922a4feb6) *** def count_protein_molecular_weight allows to calculate the molecular weight of the polyaminoacid -- '''main('AlaValTyr', 'DNT', method = 'count_protein_molecular_weight')''' -- '''count_protein_molecular_weight('AlaValTyr', 'DNT')''' +- `main('AlaValTyr', 'DNT', method = 'count_protein_molecular_weight')` +- `count_protein_molecular_weight('AlaValTyr', 'DNT')` - ![image](https://github.com/NSapozhnikov/HW4_Sapozhnikov/assets/81642791/cc1eff9a-1b39-4232-98e4-80f622101083) *** ### Troubleshooting -If you have '''ValueError("No input defined.")''' it means, that you have an empty input. Please, enter the correct input. +If you have `ValueError("No input defined.")` it means, that you have an empty input. Please, enter the correct input. *** -If you have '''ValueError(method, " is not a valid method.")''' it means, that your tool is not correct. Please, enter the right tool. +If you have `ValueError(method, " is not a valid method.")` it means, that your tool is not correct. Please, enter the right tool. *** -If you have '''ValueError('Non-protein aminoacids in sequence')''' it means, that your sequences contain non-protein aminoacids. Please, check your sequences and enter the correct input. 
+If you have `ValueError('Non-protein aminoacids in sequence')` it means, that your sequences contain non-protein aminoacids. Please, check your sequences and enter the correct input. *** From fc97f7dc0e53f8a88f03d1398b9614c3b8e63a25 Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:33:14 +0300 Subject: [PATCH 29/30] Update README.md --- HW4_Sapozhnikov/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index 0c7f9d2..dce6161 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -1,7 +1,7 @@ # HW 4. Functions 2 > *This is the repo for the fourth homework of the BI Python 2023 course* -### Title +### Prototool `prototool.py` is a special script for working with polyaminoacid sequences *** From ca5cf540a197cc205f726aa34f9d6ff210f960ae Mon Sep 17 00:00:00 2001 From: Nikita <81642791+NSapozhnikov@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:48:55 +0300 Subject: [PATCH 30/30] Update HW4_Sapozhnikov/README.md Co-authored-by: Nikita Vaulin --- HW4_Sapozhnikov/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Sapozhnikov/README.md b/HW4_Sapozhnikov/README.md index dce6161..c347e60 100644 --- a/HW4_Sapozhnikov/README.md +++ b/HW4_Sapozhnikov/README.md @@ -8,7 +8,7 @@ ### Overview `prototool.py` includes 7 methods to treatment of polyaminoacid sequences. -`prototool.py` can be used for the next goals: +`prototool.py` can be used for the following purposes: - recoding 1-letter coded polyaminoacid seqeunces into 3-letter coded and vice versa; - polyaminoacid sequences aligment with Smith-Waterman algorithm [^1]; - finding possinle RNA sequences for given polyaminoacid sequences;