From 795f31a7a05bfff64c352acae10a5ca52cd1fd54 Mon Sep 17 00:00:00 2001 From: nikita Date: Tue, 26 Sep 2023 22:37:23 +0300 Subject: [PATCH 01/25] Add translation, mutations and level_of_hydrophobic --- protein_tools.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 protein_tools.py diff --git a/protein_tools.py b/protein_tools.py new file mode 100644 index 0000000..0ef6f82 --- /dev/null +++ b/protein_tools.py @@ -0,0 +1,60 @@ +def level_of_hydrophobic(protein): + gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} + + count_of_gydrophobic = 0 + if is_protein(protein): + for i in range(len(protein)): + if protein[i] in gydrophobic_aminoacids: + count_of_gydrophobic += 1 + + percentage = count_of_gydrophobic / len(protein) * 100 + + return f"Percentage of gydrophobic aminoacids in {protein} = {percentage}%." + + +def translation(seq): + """ + """ + gene_code = { + "F": ["UUC", "UUU"], "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"], + "I": ["AUU", "AUC", "AUA"], "M": ["AUG"], "V": ["GUU", "GUC", "GUA", "GUG"], + "S": ["UCU", "UCC", "UCA", "UCG"], "P": ["CCU", "CCC", "CCA", "CCG"], + "T": ["ACU", "ACC", "ACA", "ACG"], "A": ["GCU", "GCC", "GCA", "GCG"], + "Y": ["UAC", "UAU"], "*": ["UAA", "UAG", "UGA"], "H": ["CAU", "CAC"], + "Q": ["CAA", "CAG"], "N": ["AAU", "AAC"], + "K": ["AAA", "AAG"], "D": ["GAU", "GAC"], "E": ["GAA", "GAG"], + "C": ["UGU", "UGC"], "W": ["UGG"], "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], + "S": ["AGU", "AGC"], "G": ["GGU", "GGC", "GGA", "GGG"] + } + triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] + protein = [] + for triplet in triplets: + for aminoacid in gene_code.keys(): + if triplet in gene_code[aminoacid]: + protein.append(aminoacid) + + if is_protein("".join(protein)): + start = protein.index("M") + stop = protein.index("*") + return "".join(protein[start:stop + 1]) + else: + return "This sequence doesn't include the gene." 
+ + +def mutations(seq, protein): + correct_protein = translation(seq) + + if is_protein(protein): + bank_of_mutations = [] + for i in range(len(correct_protein)): + if correct_protein[i] != protein[i]: + bank_of_mutations.append(f'{protein[i]}{i + 1}') + + if len(bank_of_mutations) == 0: + return "Protein without mutations." + else: + return "Mutations:" + ", ".join(bank_of_mutations) + "." + else: + return "It isn't a protein." + + From 490b985c5749ea2806a5ef50b68127d3d51546cd Mon Sep 17 00:00:00 2001 From: Artyom Date: Wed, 27 Sep 2023 01:24:24 +0300 Subject: [PATCH 02/25] Add directory HW4_toropov --- HW4_Toropov/protein_tools.py | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 HW4_Toropov/protein_tools.py diff --git a/HW4_Toropov/protein_tools.py b/HW4_Toropov/protein_tools.py new file mode 100644 index 0000000..87bff7b --- /dev/null +++ b/HW4_Toropov/protein_tools.py @@ -0,0 +1,54 @@ +alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} +amino_acid_masses = { + 'A': 71.03711, + 'R': 156.10111, + 'N': 114.04293, + 'D': 115.02694, + 'C': 103.00919, + 'Q': 128.05858, + 'E': 129.04259, + 'G': 57.02146, + 'H': 137.05891, + 'I': 113.08406, + 'L': 113.08406, + 'K': 128.09496, + 'M': 131.04049, + 'F': 147.06841, + 'P': 97.05276, + 'S': 87.03203, + 'T': 101.04768, + 'W': 186.07931, + 'Y': 163.06333, + 'V': 99.06841 +} + + +def is_protein(seq): + unique_chars = set(seq) + return unique_chars <= alphabet_protein + + +def molecular_weight(seq): + molecular_weight = 0 + for amino_acid in seq: + molecular_weight += amino_acid_masses[amino_acid] + return round(molecular_weight, 3) + + +def run_protein_tools(*seqs_and_procedure): + procedure = seqs_and_procedure[-1] + seqs = seqs_and_procedure[:-1] + + results = [] + + for seq in seqs: + seq = seq.upper() + if is_protein(seq) is not True: + raise ValueError("Invalid alphabet") + if procedure == 'molecular_weight': 
+ results.append(molecular_weight(seq)) + + if len(results) == 1: + return results[0] + else: + return results From 4fb8e575a66f468668301e8b4aa287dce8c16fa9 Mon Sep 17 00:00:00 2001 From: Artyom Date: Wed, 27 Sep 2023 00:54:10 +0300 Subject: [PATCH 03/25] Add protein_tools.py With functions is_protein, run_protein_tools, molecular_weight --- protein_tools.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 protein_tools.py diff --git a/protein_tools.py b/protein_tools.py new file mode 100644 index 0000000..87bff7b --- /dev/null +++ b/protein_tools.py @@ -0,0 +1,54 @@ +alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} +amino_acid_masses = { + 'A': 71.03711, + 'R': 156.10111, + 'N': 114.04293, + 'D': 115.02694, + 'C': 103.00919, + 'Q': 128.05858, + 'E': 129.04259, + 'G': 57.02146, + 'H': 137.05891, + 'I': 113.08406, + 'L': 113.08406, + 'K': 128.09496, + 'M': 131.04049, + 'F': 147.06841, + 'P': 97.05276, + 'S': 87.03203, + 'T': 101.04768, + 'W': 186.07931, + 'Y': 163.06333, + 'V': 99.06841 +} + + +def is_protein(seq): + unique_chars = set(seq) + return unique_chars <= alphabet_protein + + +def molecular_weight(seq): + molecular_weight = 0 + for amino_acid in seq: + molecular_weight += amino_acid_masses[amino_acid] + return round(molecular_weight, 3) + + +def run_protein_tools(*seqs_and_procedure): + procedure = seqs_and_procedure[-1] + seqs = seqs_and_procedure[:-1] + + results = [] + + for seq in seqs: + seq = seq.upper() + if is_protein(seq) is not True: + raise ValueError("Invalid alphabet") + if procedure == 'molecular_weight': + results.append(molecular_weight(seq)) + + if len(results) == 1: + return results[0] + else: + return results From 058419501ebf7ed1c21cdaba9101a01324129c62 Mon Sep 17 00:00:00 2001 From: sofiyaga Date: Tue, 26 Sep 2023 22:48:16 +0300 Subject: [PATCH 04/25] Add function calculate_length --- protein_tools.py | 13 
+++++++++++++ 1 file changed, 13 insertions(+) diff --git a/protein_tools.py b/protein_tools.py index 87bff7b..08206f4 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -27,6 +27,19 @@ def is_protein(seq): unique_chars = set(seq) return unique_chars <= alphabet_protein +def compute_length(*seqs: str): + """ + Compute the length of the input amino acid sequence + + """ + lens = [] + for seq in seqs: + if is_protein(seq): + lens.append(len(seq)) + else: + raise ValueError('Not a protein') + return lens if len(lens) > 1 else lens[0] + def molecular_weight(seq): molecular_weight = 0 From 8ecd46d9c61e6fa3838cd9485c44e9f4426a2f2a Mon Sep 17 00:00:00 2001 From: sofiyaga Date: Tue, 26 Sep 2023 22:53:23 +0300 Subject: [PATCH 05/25] Add dictionary codon_table --- protein_tools.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/protein_tools.py b/protein_tools.py index 08206f4..f2a2a93 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -22,6 +22,28 @@ 'V': 99.06841 } +codon_table = { + 'A': ['GCT', 'GCC', 'GCA', 'GCG'], + 'C': ['TGT', 'TGC'], + 'D': ['GAT', 'GAC'], + 'E': ['GAA', 'GAG'], + 'F': ['TTT', 'TTC'], + 'G': ['GGT', 'GGC', 'GGA', 'GGG'], + 'H': ['CAT', 'CAC'], + 'I': ['ATT', 'ATC', 'ATA'], + 'K': ['AAA', 'AAG'], + 'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'], + 'M': ['ATG'], + 'N': ['AAT', 'AAC'], + 'P': ['CCT', 'CCC', 'CCA', 'CCG'], + 'Q': ['CAA', 'CAG'], + 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], + 'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'], + 'T': ['ACT', 'ACC', 'ACA', 'ACG'], + 'V': ['GTT', 'GTC', 'GTA', 'GTG'], + 'W': ['TGG'], + 'Y': ['TAT', 'TAC']} + def is_protein(seq): unique_chars = set(seq) From 1746a23be47ad4317d761bf2975a7348862c8c4b Mon Sep 17 00:00:00 2001 From: Artyom Date: Wed, 27 Sep 2023 16:58:02 +0300 Subject: [PATCH 06/25] Create global variables with alphabets --- protein_tools.py | 93 +++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 52 deletions(-) diff --git 
a/protein_tools.py b/protein_tools.py index e0fb41b..e55f70f 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -1,3 +1,30 @@ +alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} + +amino_acid_masses = { + 'A': 71.03711, + 'R': 156.10111, + 'N': 114.04293, + 'D': 115.02694, + 'C': 103.00919, + 'Q': 128.05858, + 'E': 129.04259, + 'G': 57.02146, + 'H': 137.05891, + 'I': 113.08406, + 'L': 113.08406, + 'K': 128.09496, + 'M': 131.04049, + 'F': 147.06841, + 'P': 97.05276, + 'S': 87.03203, + 'T': 101.04768, + 'W': 186.07931, + 'Y': 163.06333, + 'V': 99.06841 +} + +gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} + codon_table = { 'A': ['GCT', 'GCC', 'GCA', 'GCG'], 'C': ['TGT', 'TGC'], @@ -21,6 +48,17 @@ 'Y': ['TAT', 'TAC']} +def is_protein(seq): + unique_chars = set(seq) + return unique_chars <= alphabet_protein + + +def molecular_weight(seq): + molecular_weight = 0 + for amino_acid in seq: + molecular_weight += amino_acid_masses[amino_acid] + return round(molecular_weight, 3) + def compute_length(*seqs: str): """ @@ -37,7 +75,6 @@ def compute_length(*seqs: str): def level_of_hydrophobic(protein): - gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} count_of_gydrophobic = 0 if is_protein(protein): @@ -53,22 +90,11 @@ def level_of_hydrophobic(protein): def translation(seq): """ """ - gene_code = { - "F": ["UUC", "UUU"], "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"], - "I": ["AUU", "AUC", "AUA"], "M": ["AUG"], "V": ["GUU", "GUC", "GUA", "GUG"], - "S": ["UCU", "UCC", "UCA", "UCG"], "P": ["CCU", "CCC", "CCA", "CCG"], - "T": ["ACU", "ACC", "ACA", "ACG"], "A": ["GCU", "GCC", "GCA", "GCG"], - "Y": ["UAC", "UAU"], "*": ["UAA", "UAG", "UGA"], "H": ["CAU", "CAC"], - "Q": ["CAA", "CAG"], "N": ["AAU", "AAC"], - "K": ["AAA", "AAG"], "D": ["GAU", "GAC"], "E": ["GAA", "GAG"], - "C": ["UGU", "UGC"], "W": ["UGG"], "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], - "S": ["AGU", 
"AGC"], "G": ["GGU", "GGC", "GGA", "GGG"] - } - triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] + triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] protein = [] for triplet in triplets: - for aminoacid in gene_code.keys(): - if triplet in gene_code[aminoacid]: + for aminoacid in codon_table.keys(): + if triplet in codon_table[aminoacid]: protein.append(aminoacid) if is_protein("".join(protein)): @@ -96,43 +122,6 @@ def mutations(seq, protein): return "It isn't a protein." -alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} -amino_acid_masses = { - 'A': 71.03711, - 'R': 156.10111, - 'N': 114.04293, - 'D': 115.02694, - 'C': 103.00919, - 'Q': 128.05858, - 'E': 129.04259, - 'G': 57.02146, - 'H': 137.05891, - 'I': 113.08406, - 'L': 113.08406, - 'K': 128.09496, - 'M': 131.04049, - 'F': 147.06841, - 'P': 97.05276, - 'S': 87.03203, - 'T': 101.04768, - 'W': 186.07931, - 'Y': 163.06333, - 'V': 99.06841 -} - - -def is_protein(seq): - unique_chars = set(seq) - return unique_chars <= alphabet_protein - - -def molecular_weight(seq): - molecular_weight = 0 - for amino_acid in seq: - molecular_weight += amino_acid_masses[amino_acid] - return round(molecular_weight, 3) - - def run_protein_tools(*seqs_and_procedure): procedure = seqs_and_procedure[-1] seqs = seqs_and_procedure[:-1] From e48afa0c8da143938a7d856fbe525e09908891e4 Mon Sep 17 00:00:00 2001 From: Artyom Date: Wed, 27 Sep 2023 17:21:55 +0300 Subject: [PATCH 07/25] Remove is_protein from all functions except run_protein tools --- protein_tools.py | 63 +++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 36 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index e55f70f..3c54477 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -1,5 +1,4 @@ alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} - amino_acid_masses 
= { 'A': 71.03711, 'R': 156.10111, @@ -22,9 +21,6 @@ 'Y': 163.06333, 'V': 99.06841 } - -gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} - codon_table = { 'A': ['GCT', 'GCC', 'GCA', 'GCG'], 'C': ['TGT', 'TGC'], @@ -46,18 +42,7 @@ 'V': ['GTT', 'GTC', 'GTA', 'GTG'], 'W': ['TGG'], 'Y': ['TAT', 'TAC']} - - -def is_protein(seq): - unique_chars = set(seq) - return unique_chars <= alphabet_protein - - -def molecular_weight(seq): - molecular_weight = 0 - for amino_acid in seq: - molecular_weight += amino_acid_masses[amino_acid] - return round(molecular_weight, 3) +gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} def compute_length(*seqs: str): @@ -67,20 +52,16 @@ def compute_length(*seqs: str): """ lens = [] for seq in seqs: - if is_protein(seq): - lens.append(len(seq)) - else: - raise ValueError('Not a protein') + lens.append(len(seq)) return lens if len(lens) > 1 else lens[0] def level_of_hydrophobic(protein): count_of_gydrophobic = 0 - if is_protein(protein): - for i in range(len(protein)): - if protein[i] in gydrophobic_aminoacids: - count_of_gydrophobic += 1 + for i in range(len(protein)): + if protein[i] in gydrophobic_aminoacids: + count_of_gydrophobic += 1 percentage = count_of_gydrophobic / len(protein) * 100 @@ -90,7 +71,7 @@ def level_of_hydrophobic(protein): def translation(seq): """ """ - triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] + triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] protein = [] for triplet in triplets: for aminoacid in codon_table.keys(): @@ -108,18 +89,28 @@ def translation(seq): def mutations(seq, protein): correct_protein = translation(seq) - if is_protein(protein): - bank_of_mutations = [] - for i in range(len(correct_protein)): - if correct_protein[i] != protein[i]: - bank_of_mutations.append(f'{protein[i]}{i + 1}') + + bank_of_mutations = [] + for i in range(len(correct_protein)): + if correct_protein[i] != protein[i]: + bank_of_mutations.append(f'{protein[i]}{i + 
1}') - if len(bank_of_mutations) == 0: - return "Protein without mutations." - else: - return "Mutations:" + ", ".join(bank_of_mutations) + "." + if len(bank_of_mutations) == 0: + return "Protein without mutations." else: - return "It isn't a protein." + return "Mutations:" + ", ".join(bank_of_mutations) + "." + + +def is_protein(seq): + unique_chars = set(seq) + return unique_chars <= alphabet_protein + + +def molecular_weight(seq): + molecular_weight = 0 + for amino_acid in seq: + molecular_weight += amino_acid_masses[amino_acid] + return round(molecular_weight, 3) def run_protein_tools(*seqs_and_procedure): @@ -139,4 +130,4 @@ def run_protein_tools(*seqs_and_procedure): return results[0] else: return results - + \ No newline at end of file From 34ed7cd47a48200b00cd6af0c91e157e1fdfb5ae Mon Sep 17 00:00:00 2001 From: rereremin <114501294+rereremin@users.noreply.github.com> Date: Wed, 27 Sep 2023 18:14:29 +0300 Subject: [PATCH 08/25] Update functions and dictionaroes in protein_tools.py --- protein_tools.py | 188 +++++++++++++++++++++++++++++------------------ 1 file changed, 117 insertions(+), 71 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index 0b4de71..3fbd93d 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -1,65 +1,7 @@ +alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} -def level_of_hydrophobic(protein): - gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} - - count_of_gydrophobic = 0 - if is_protein(protein): - for i in range(len(protein)): - if protein[i] in gydrophobic_aminoacids: - count_of_gydrophobic += 1 - - percentage = count_of_gydrophobic / len(protein) * 100 - - return f"Percentage of gydrophobic aminoacids in {protein} = {percentage}%." 
- - -def translation(seq): - """ - """ - gene_code = { - "F": ["UUC", "UUU"], "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"], - "I": ["AUU", "AUC", "AUA"], "M": ["AUG"], "V": ["GUU", "GUC", "GUA", "GUG"], - "S": ["UCU", "UCC", "UCA", "UCG"], "P": ["CCU", "CCC", "CCA", "CCG"], - "T": ["ACU", "ACC", "ACA", "ACG"], "A": ["GCU", "GCC", "GCA", "GCG"], - "Y": ["UAC", "UAU"], "*": ["UAA", "UAG", "UGA"], "H": ["CAU", "CAC"], - "Q": ["CAA", "CAG"], "N": ["AAU", "AAC"], - "K": ["AAA", "AAG"], "D": ["GAU", "GAC"], "E": ["GAA", "GAG"], - "C": ["UGU", "UGC"], "W": ["UGG"], "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], - "S": ["AGU", "AGC"], "G": ["GGU", "GGC", "GGA", "GGG"] - } - triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] - protein = [] - for triplet in triplets: - for aminoacid in gene_code.keys(): - if triplet in gene_code[aminoacid]: - protein.append(aminoacid) - - if is_protein("".join(protein)): - start = protein.index("M") - stop = protein.index("*") - return "".join(protein[start:stop + 1]) - else: - return "This sequence doesn't include the gene." - - -def mutations(seq, protein): - correct_protein = translation(seq) - - if is_protein(protein): - bank_of_mutations = [] - for i in range(len(correct_protein)): - if correct_protein[i] != protein[i]: - bank_of_mutations.append(f'{protein[i]}{i + 1}') - - if len(bank_of_mutations) == 0: - return "Protein without mutations." - else: - return "Mutations:" + ", ".join(bank_of_mutations) + "." - else: - return "It isn't a protein." 
- +alphabet_rna = {'A', 'U', 'G', 'C'} -alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} amino_acid_masses = { 'A': 71.03711, 'R': 156.10111, @@ -83,34 +25,138 @@ def mutations(seq, protein): 'V': 99.06841 } +gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} + +dna_codons = { + 'A': ['GCT', 'GCC', 'GCA', 'GCG'], + 'C': ['TGT', 'TGC'], + 'D': ['GAT', 'GAC'], + 'E': ['GAA', 'GAG'], + 'F': ['TTT', 'TTC'], + 'G': ['GGT', 'GGC', 'GGA', 'GGG'], + 'H': ['CAT', 'CAC'], + 'I': ['ATT', 'ATC', 'ATA'], + 'K': ['AAA', 'AAG'], + 'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'], + 'M': ['ATG'], + 'N': ['AAT', 'AAC'], + 'P': ['CCT', 'CCC', 'CCA', 'CCG'], + 'Q': ['CAA', 'CAG'], + 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], + 'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'], + 'T': ['ACT', 'ACC', 'ACA', 'ACG'], + 'V': ['GTT', 'GTC', 'GTA', 'GTG'], + 'W': ['TGG'], + 'Y': ['TAT', 'TAC'], + '*': ["UAA", "UAG", "UGA"]} + +rna_codons = { + "F": ["UUC", "UUU"], "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"], + "I": ["AUU", "AUC", "AUA"], "M": ["AUG"], "V": ["GUU", "GUC", "GUA", "GUG"], + "S": ["UCU", "UCC", "UCA", "UCG"], "P": ["CCU", "CCC", "CCA", "CCG"], + "T": ["ACU", "ACC", "ACA", "ACG"], "A": ["GCU", "GCC", "GCA", "GCG"], + "Y": ["UAC", "UAU"], "*": ["UAA", "UAG", "UGA"], "H": ["CAU", "CAC"], + "Q": ["CAA", "CAG"], "N": ["AAU", "AAC"], + "K": ["AAA", "AAG"], "D": ["GAU", "GAC"], "E": ["GAA", "GAG"], + "C": ["UGU", "UGC"], "W": ["UGG"], "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], + "S": ["AGU", "AGC"], "G": ["GGU", "GGC", "GGA", "GGG"] + } + def is_protein(seq): - unique_chars = set(seq) + unique_chars = set(seq.upper()) return unique_chars <= alphabet_protein -def molecular_weight(seq): +def is_rna(seq): + unique_chars = set(seq.upper()) + return unique_chars <= alphabet_rna + + +def compute_molecular_weight(seq): molecular_weight = 0 for amino_acid in seq: molecular_weight += 
amino_acid_masses[amino_acid] return round(molecular_weight, 3) +def compute_length(seq: str): + """ + Compute the length of the input amino acid sequence + + """ + return len(seq) + + +def compute_hydrophobicity(protein): + + count_of_gydrophobic = 0 + if is_protein(protein): + for i in range(len(protein)): + if protein[i] in gydrophobic_aminoacids: + count_of_gydrophobic += 1 + + percentage = round(count_of_gydrophobic / len(protein) * 100, 3) + + return f"Percentage of gydrophobic aminoacids in {protein} = {percentage}%." + + +def translation(seq): + """ + """ + triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] + protein = [] + for triplet in triplets: + for aminoacid in rna_codons.keys(): + if triplet in rna_codons[aminoacid]: + protein.append(aminoacid) + + start = protein.index("M") + stop = protein.index("*") + return "".join(protein[start:stop + 1]) + + +def check_mutations(seq, protein): + + if is_protein(protein[:-1]) is not True: + raise ValueError("Invalid protein sequence") + if is_rna(seq) is not True: + raise ValueError("Invalid RNA sequence") + + correct_protein = translation(seq) + bank_of_mutations = [] + + for i in range(len(correct_protein)): + if correct_protein[i] != protein[i]: + bank_of_mutations.append(f'{protein[i]}{i + 1}') + + if len(bank_of_mutations) == 0: + return "Protein without mutations." + else: + return "Mutations:" + ", ".join(bank_of_mutations) + "." 
+ + def run_protein_tools(*seqs_and_procedure): procedure = seqs_and_procedure[-1] seqs = seqs_and_procedure[:-1] results = [] - - for seq in seqs: - seq = seq.upper() - if is_protein(seq) is not True: - raise ValueError("Invalid alphabet") - if procedure == 'molecular_weight': - results.append(molecular_weight(seq)) - + if procedure == 'check_mutations': + results.append(check_mutations(seqs[0], seqs[1])) + + else: + for seq in seqs: + seq = seq.upper() + if is_protein(seq) is not True: + raise ValueError("Invalid protein sequence") + if procedure == 'compute_molecular_weight': + results.append(molecular_weight(seq)) + elif procedure == 'compute_length': + results.append(compute_length(seq)) + elif procedure == 'compute_hydrophobicity': + results.append(compute_hydrophobicity(seq)) if len(results) == 1: return results[0] else: return results - + From 21c12af1695547bf7aa5fd7180cf5f3a02f7ea79 Mon Sep 17 00:00:00 2001 From: Artyom Date: Wed, 27 Sep 2023 19:31:09 +0300 Subject: [PATCH 09/25] Change functions names and extend run_protein_tools --- protein_tools.py | 117 +++++++++++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 44 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index 3c54477..6fb0943 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -1,4 +1,7 @@ alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} + +alphabet_rna = {'A', 'U', 'G', 'C'} + amino_acid_masses = { 'A': 71.03711, 'R': 156.10111, @@ -21,7 +24,10 @@ 'Y': 163.06333, 'V': 99.06841 } -codon_table = { + +gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} + +dna_codons = { 'A': ['GCT', 'GCC', 'GCA', 'GCG'], 'C': ['TGT', 'TGC'], 'D': ['GAT', 'GAC'], @@ -41,29 +47,56 @@ 'T': ['ACT', 'ACC', 'ACA', 'ACG'], 'V': ['GTT', 'GTC', 'GTA', 'GTG'], 'W': ['TGG'], - 'Y': ['TAT', 'TAC']} -gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} + 'Y': ['TAT', 'TAC'], + '*': ["UAA", 
"UAG", "UGA"]} + +rna_codons = { + "F": ["UUC", "UUU"], "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"], + "I": ["AUU", "AUC", "AUA"], "M": ["AUG"], "V": ["GUU", "GUC", "GUA", "GUG"], + "S": ["UCU", "UCC", "UCA", "UCG"], "P": ["CCU", "CCC", "CCA", "CCG"], + "T": ["ACU", "ACC", "ACA", "ACG"], "A": ["GCU", "GCC", "GCA", "GCG"], + "Y": ["UAC", "UAU"], "*": ["UAA", "UAG", "UGA"], "H": ["CAU", "CAC"], + "Q": ["CAA", "CAG"], "N": ["AAU", "AAC"], + "K": ["AAA", "AAG"], "D": ["GAU", "GAC"], "E": ["GAA", "GAG"], + "C": ["UGU", "UGC"], "W": ["UGG"], "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], + "S": ["AGU", "AGC"], "G": ["GGU", "GGC", "GGA", "GGG"] + } + + +def is_protein(seq): + unique_chars = set(seq.upper()) + return unique_chars <= alphabet_protein -def compute_length(*seqs: str): +def is_rna(seq): + unique_chars = set(seq.upper()) + return unique_chars <= alphabet_rna + + +def compute_molecular_weight(seq): + molecular_weight = 0 + for amino_acid in seq: + molecular_weight += amino_acid_masses[amino_acid] + return round(molecular_weight, 3) + + +def compute_length(seq: str): """ Compute the length of the input amino acid sequence """ - lens = [] - for seq in seqs: - lens.append(len(seq)) - return lens if len(lens) > 1 else lens[0] + return len(seq) -def level_of_hydrophobic(protein): +def compute_hydrophobicity(protein): count_of_gydrophobic = 0 - for i in range(len(protein)): - if protein[i] in gydrophobic_aminoacids: - count_of_gydrophobic += 1 + if is_protein(protein): + for i in range(len(protein)): + if protein[i] in gydrophobic_aminoacids: + count_of_gydrophobic += 1 - percentage = count_of_gydrophobic / len(protein) * 100 + percentage = round(count_of_gydrophobic / len(protein) * 100, 3) return f"Percentage of gydrophobic aminoacids in {protein} = {percentage}%." 
@@ -74,23 +107,25 @@ def translation(seq): triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] protein = [] for triplet in triplets: - for aminoacid in codon_table.keys(): - if triplet in codon_table[aminoacid]: + for aminoacid in rna_codons.keys(): + if triplet in rna_codons[aminoacid]: protein.append(aminoacid) - if is_protein("".join(protein)): - start = protein.index("M") - stop = protein.index("*") - return "".join(protein[start:stop + 1]) - else: - return "This sequence doesn't include the gene." + start = protein.index("M") + stop = protein.index("*") + return "".join(protein[start:stop + 1]) -def mutations(seq, protein): - correct_protein = translation(seq) +def check_mutations(seq, protein): - + if is_protein(protein[:-1]) is not True: + raise ValueError("Invalid protein sequence") + if is_rna(seq) is not True: + raise ValueError("Invalid RNA sequence") + + correct_protein = translation(seq) bank_of_mutations = [] + for i in range(len(correct_protein)): if correct_protein[i] != protein[i]: bank_of_mutations.append(f'{protein[i]}{i + 1}') @@ -101,31 +136,25 @@ def mutations(seq, protein): return "Mutations:" + ", ".join(bank_of_mutations) + "." 
-def is_protein(seq): - unique_chars = set(seq) - return unique_chars <= alphabet_protein - - -def molecular_weight(seq): - molecular_weight = 0 - for amino_acid in seq: - molecular_weight += amino_acid_masses[amino_acid] - return round(molecular_weight, 3) - - def run_protein_tools(*seqs_and_procedure): procedure = seqs_and_procedure[-1] seqs = seqs_and_procedure[:-1] results = [] - - for seq in seqs: - seq = seq.upper() - if is_protein(seq) is not True: - raise ValueError("Invalid alphabet") - if procedure == 'molecular_weight': - results.append(molecular_weight(seq)) - + if procedure == 'check_mutations': + results.append(check_mutations(seqs[0], seqs[1])) + + else: + for seq in seqs: + seq = seq.upper() + if is_protein(seq) is not True: + raise ValueError("Invalid protein sequence") + if procedure == 'compute_molecular_weight': + results.append(molecular_weight(seq)) + elif procedure == 'compute_length': + results.append(compute_length(seq)) + elif procedure == 'compute_hydrophobicity': + results.append(compute_hydrophobicity(seq)) if len(results) == 1: return results[0] else: From aa4618c0480c8f91f75014675f9bd1384c3d6f63 Mon Sep 17 00:00:00 2001 From: Artyom Date: Fri, 29 Sep 2023 02:22:49 +0300 Subject: [PATCH 10/25] Add docstings and revise run_protein_tools function --- protein_tools.py | 78 ++++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index 6fb0943..0881b2a 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -63,45 +63,54 @@ } -def is_protein(seq): +def is_protein(seq:str): + """ + Check the existence of a protein sequence, return boolean. + """ unique_chars = set(seq.upper()) return unique_chars <= alphabet_protein -def is_rna(seq): +def is_rna(seq:str): + """ + Check the existence of a RNA sequence, return boolean. 
+ """ unique_chars = set(seq.upper()) return unique_chars <= alphabet_rna -def compute_molecular_weight(seq): +def compute_molecular_weight(seq:str): + """ + Compute molecular weight (g/mol) of protein sequence. + """ molecular_weight = 0 - for amino_acid in seq: + for amino_acid in seq.upper(): molecular_weight += amino_acid_masses[amino_acid] return round(molecular_weight, 3) -def compute_length(seq: str): +def compute_length(seq:str): """ - Compute the length of the input amino acid sequence - + Compute the length of protein sequence. """ return len(seq) -def compute_hydrophobicity(protein): - +def compute_hydrophobicity(protein:str): + """ + Compute the percentage of gydrophobic aminoacids in protein sequence. + """ count_of_gydrophobic = 0 - if is_protein(protein): - for i in range(len(protein)): - if protein[i] in gydrophobic_aminoacids: - count_of_gydrophobic += 1 + for i in range(len(protein)): + if protein[i] in gydrophobic_aminoacids: + count_of_gydrophobic += 1 percentage = round(count_of_gydrophobic / len(protein) * 100, 3) return f"Percentage of gydrophobic aminoacids in {protein} = {percentage}%." -def translation(seq): +def translation(seq:str): """ """ triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] @@ -116,8 +125,9 @@ def translation(seq): return "".join(protein[start:stop + 1]) -def check_mutations(seq, protein): - +def check_mutations(seq:str, protein:str): + """ + """ if is_protein(protein[:-1]) is not True: raise ValueError("Invalid protein sequence") if is_rna(seq) is not True: @@ -136,27 +146,37 @@ def check_mutations(seq, protein): return "Mutations:" + ", ".join(bank_of_mutations) + "." -def run_protein_tools(*seqs_and_procedure): - procedure = seqs_and_procedure[-1] - seqs = seqs_and_procedure[:-1] - +def run_protein_tools(*args:str): + """ + Function containing methods for protein analysis. 
+ + Takes arbitrary number of arguments with protein sequencies + and the name of the procedure to be performed (always the + last argument). Returns the result of the procedure as string + if one sequnce is submitted or list if several. + + If procedure 'check_mutations' is used then input must be only three + arguments: RNA sequence, protein sequence and the name of procedure + itself. + """ + *seqs, procedure = args results = [] + d_of_functions = {'compute_molecular_weight': compute_molecular_weight, + 'compute_length': compute_length, + 'compute_hydrophobicity': compute_hydrophobicity, + } if procedure == 'check_mutations': results.append(check_mutations(seqs[0], seqs[1])) - else: for seq in seqs: - seq = seq.upper() if is_protein(seq) is not True: raise ValueError("Invalid protein sequence") - if procedure == 'compute_molecular_weight': - results.append(molecular_weight(seq)) - elif procedure == 'compute_length': - results.append(compute_length(seq)) - elif procedure == 'compute_hydrophobicity': - results.append(compute_hydrophobicity(seq)) + if procedure not in d_of_functions: + raise ValueError("Wrong procedure name") + else: + results.append(d_of_functions[procedure](seq)) if len(results) == 1: return results[0] else: return results - \ No newline at end of file + \ No newline at end of file From ed17f8a4c8c6254a208a8b04237632e2467c74fb Mon Sep 17 00:00:00 2001 From: Artem Toropov <144557024+artyomtorr@users.noreply.github.com> Date: Fri, 29 Sep 2023 00:35:50 +0300 Subject: [PATCH 11/25] Update README.md --- README.md | 80 +++++++++++++++++-------------------------------------- 1 file changed, 25 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index f918170..a93e4cf 100644 --- a/README.md +++ b/README.md @@ -1,65 +1,35 @@ -# HW 4. 
Functions 2 -> *This is the repo for the fourth homework of the BI Python 2023 course* +# protein_tools.py -### Homework description +**protein_tools.py** - is a tool which allows the performing of various procedures for a user entered protein sequences. -На прошлой неделе вы делали утилиту для работы с последовательностями нуклеиновых кислот (с весьма строгим ТЗ). Пришло время для чего-то более самостоятельного. +### Usage -#### Основное задание +The tool works by calling the function `run_protein_tools`, which takes arbitrary number of arguments with protein sequencies (*str*) and the name of the procedure to be performed (always the last argument, *str*, see the usage examples below). The output is the result of the procedure as *string* if one sequence is submitted or *list* if several. +**NOTE:** For the procedure `check_mutations` a fixed number of string arguments are used: one RNA sequence, one protein sequence and the name of procedure itself. -Напишите утилиту для работы с последовательностями белков. Там должно быть минимум 5 различных операций, должна быть какая-то точка входа через которую пользователь будет всё это дело использовать. На этом, по сути, всё. Всё целиком зависит от вашей фантазии и креативности. Можете опираться на ДЗ №2 и №3. +### Procedures -Самая главная часть задания - это файл `README.md`. Сделайте краткое введение, напишите описание тула, приведите документацию по использованию со списком аргументов. Добавьте примеры использования. Возможно, вы захотите сделать секцию Troubleshooting. ***Почему это нужно?*** В этот раз проверяющий не будет знать того, как должен работать ваш тул. Это ваш авторский код. Даже самая прекрасная функциональность, не будучи отраженной в README, скорее всего останется незамеченной. README - это ваш способ познакомить пользователя с тулом, показать всё лучше и обосновать, почему именно ваша команда должна получить наивысший балл. 
+- `compute_molecular_weight` — computes molecular weight of protein sequence in g/mol +- `compute_length` — computes the number of amino acids in protein sequence +- `compute_hydrophobicity` — computes the percentage of gydrophobic aminoacids in protein sequence +- `check_mutations` — -Есть люди которые, любят писать документации, а есть те - кто не любит. Найдите в вашей команде того, кто любит. И в будущем в своих рабочих проектах всегда держите рядом такого человек (или будьте им). +### Examples +```python +run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'GSCKRGPRT', 'compute_length') # [10, 18, 9] +run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'GSCKRGPRT', 'compute_molecular_weight') # [1055.496, 1886.872, 942.482] +run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'GSCKRGPRT', 'compute_hydrophobicity') # [50.0, 27.778, 11.111] -Примеры некоторых README, которыми можно вдохновляться: +``` + +### Additional information +- The program works **only** with protein or RNA sequences. If any of the entered sequences contain inappropriate characters or cannot exist, the program will display an error. Sequences can contain characters of any case. -- [MetaFX](https://github.com/ctlab/metafx), тул Артёма Иванова. Там еще и [wiki](https://github.com/ctlab/metafx/wiki) крутое. -- [samovar](https://github.com/nvaulin/samovar) -- [MetaGEM](https://github.com/franciscozorrilla/metaGEM) -- [Pharokka](https://github.com/gbouras13/pharokka) +```python +run_protein_tools('ATA', 'DefinitelyNotDNA', 'transcribe') # ValueError: Invalid alpabet +run_protein_tools('ATGU', 'reverse') # ValueError: Invalid alpabet +``` -Типовые секции, на которые стоит обратить внимание: Title, Overview, Usage, Options, Examples, Troubleshooting, Contacts. - -**Tехническое требование к заданию.** - -Это задание будет выполняться в командах по 3 человека. Каждый из членов команды должен внести ***как минимум*** 2 функции. 
Каждое внесение функции должно сопровождаться коммитом с осмысленным описанием коммита. Ниже приведена последовательность действий для успешного выполнения задания (аналогично ДЗ №2): - -1. Посмотрите состав своей команды здесь ([**ССЫЛКА**](https://docs.google.com/spreadsheets/d/1KMBBBu8LqauRpDJb0v1ldPwpvzNn8-KakcHexAcqLsE/edit?usp=sharing)). -2. Тимлид делает форк данного репозитория. **В форке создает ветку `HW4_`, в ветке создает папку `HW4_`, в этой папке вы всё делаете.** -3. Члены команды могут либо делать свои форки, либо работать в репозитории тимлида в качестве колабораторов ("contributors"). В любом случае делаете клоны => пишите код локально => пушите. -4. В конце тимлид делайет pull-request из `HW4_` своего репозитория в `main` этого. - - -А также: -- Сопроводите программу лучшим `README.md` файлом в вашей жизни (на английском языке). -- В этом ДЗ проблемы с качеством кода (нейминги, пустые строки, анноатции типов, док.стринги, пробелы) могут привести к снижению балла. Воспользуйтесь линтерами чтобы себя обезопасить. IDE по типу PyCharm или VSCode имеют фунцонал по авто-исправлению многих проблем такого рода. - -Автотестов на GitHub в этом ДЗ нет, но вы можете прогнать линтеры на качество кода локально (как в ДЗ №3, подробнее читайте [тут](https://plausible-cannon-091.notion.site/Code-auto-checks-02b2ea69c1d545fca07b50ce5933ed5f?pvs=4)). - -- Программа должна сохранять регистр символов. -- Программа должна работать только с последовательностями белков. -- Запрещается использование сторонних модулей. - - -### Форма сдачи - -Прикрепите ссылку на pull-request тимлида в Google Class (можете сделать от лица каждого члена команды, но это не обязательно). - - -### Pазбалловка - -- За каждую из 5 операций - максимум **1.5 балла** -- За README - максимум **2.5 балла** -- Если вы не внесли как минимум 2 функции от себя, вы получаете 0 баллов (на баллы остальных членов команды это не влияет). -- За фото созвона в README можно получить 0.2 доп. 
балла (но не более 10 баллов суммарно) - - - -### **Предполагаемый учебный результат** - -Это задание позволит вам проявить креативность и учиться быть не только кодером, но и автором. Также это задание поможет окончательно закрепить материал по функциям который мы прошли. - -Удачи! ✨✨ +### Contacts +Author contributions: From a25248ca9525748800ee83cf47362291fbf797a0 Mon Sep 17 00:00:00 2001 From: Artem Toropov <144557024+artyomtorr@users.noreply.github.com> Date: Fri, 29 Sep 2023 16:52:17 +0300 Subject: [PATCH 12/25] Add contacts to README.md --- README.md | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a93e4cf..241b744 100644 --- a/README.md +++ b/README.md @@ -13,23 +13,34 @@ The tool works by calling the function `run_protein_tools`, which takes arbitrar - `compute_molecular_weight` — computes molecular weight of protein sequence in g/mol - `compute_length` — computes the number of amino acids in protein sequence - `compute_hydrophobicity` — computes the percentage of gydrophobic aminoacids in protein sequence -- `check_mutations` — +- `check_mutations` — +- ### Examples ```python run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'GSCKRGPRT', 'compute_length') # [10, 18, 9] run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'GSCKRGPRT', 'compute_molecular_weight') # [1055.496, 1886.872, 942.482] run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'GSCKRGPRT', 'compute_hydrophobicity') # [50.0, 27.778, 11.111] - +run_protein_tools('AUGGAUCAUcAAUAA', 'MDKL*', 'check_mutations') #'Mutations:K3, L4.' ``` ### Additional information -- The program works **only** with protein or RNA sequences. If any of the entered sequences contain inappropriate characters or cannot exist, the program will display an error. Sequences can contain characters of any case. +- The program works **only** with protein and RNA sequences. 
If any of the entered sequences contain inappropriate characters or cannot exist, the program will display an error. Sequences can contain characters of any case. ```python -run_protein_tools('ATA', 'DefinitelyNotDNA', 'transcribe') # ValueError: Invalid alpabet -run_protein_tools('ATGU', 'reverse') # ValueError: Invalid alpabet +run_protein_tools('PROTEIN', 'compute_molecular_weight') # ValueError: Invalid protein sequence +run_protein_tools('AUGGAU_AUcAAUAA', 'MDKL*', 'check_mutations')# ValueError: Invalid RNA sequence ``` ### Contacts -Author contributions: +Please use contacts below to reach out with any comments, concerns, or discussions regarding **protein_tools.py.**
+- Artyom Toropov ([@artyomtorr](github.com/artyomtorr))
+- Sofiya Vinogradova ([@sofiyaga57](github.com/sofiyaga57))
+- Nikita Zherko ([@rereremin](github.com/rereremin))
+![изображение](https://github.com/artyomtorr/HW4_Functions2/assets/144557024/88f1c523-711a-40d7-9134-30c6b6639037) + + +*Author contributions:*
+Artyom Toropov (teamlead): functions *is_protein*, *molecular_weight*, *run_protein_tools*
+Sofiya Vinogradova: functions ...,
+Nikita Zherko: functions *compute_hydrophobicity*, *check_mutations*. From 7d1e07ef80d07b7cba878b5f22434e6e4ee17814 Mon Sep 17 00:00:00 2001 From: Artem Toropov <144557024+artyomtorr@users.noreply.github.com> Date: Fri, 29 Sep 2023 17:15:27 +0300 Subject: [PATCH 13/25] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 241b744..b6b838a 100644 --- a/README.md +++ b/README.md @@ -34,13 +34,13 @@ run_protein_tools('AUGGAU_AUcAAUAA', 'MDKL*', 'check_mutations')# ValueError: In ### Contacts Please use contacts below to reach out with any comments, concerns, or discussions regarding **protein_tools.py.**
-- Artyom Toropov ([@artyomtorr](github.com/artyomtorr))
-- Sofiya Vinogradova ([@sofiyaga57](github.com/sofiyaga57))
-- Nikita Zherko ([@rereremin](github.com/rereremin))
+- Artyom Toropov ([@artyomtorr](https://github.com/artyomtorr/))
+- Sofiya Vinogradova ([@sofiyaga57](https://github.com/sofiyaga57/))
+- Nikita Zherko ([@rereremin](https://github.com/rereremin/))
![изображение](https://github.com/artyomtorr/HW4_Functions2/assets/144557024/88f1c523-711a-40d7-9134-30c6b6639037) *Author contributions:*
-Artyom Toropov (teamlead): functions *is_protein*, *molecular_weight*, *run_protein_tools*
+Artyom Toropov (teamlead): functions `is_protein`, `compute_molecular_weight`, `run_protein_tools`
Sofiya Vinogradova: functions ...,
-Nikita Zherko: functions *compute_hydrophobicity*, *check_mutations*. +Nikita Zherko: functions `compute_hydrophobicity`, `check_mutations` From 140222d3a18f8bc0826dfb25ba5ff68d107d5420 Mon Sep 17 00:00:00 2001 From: rereremin <114501294+rereremin@users.noreply.github.com> Date: Fri, 29 Sep 2023 22:01:14 +0300 Subject: [PATCH 14/25] Add docstings protein_tools.py --- protein_tools.py | 78 ++++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index 3fbd93d..0452f71 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -63,45 +63,54 @@ } -def is_protein(seq): +def is_protein(seq:str): + """ + Check the existence of a protein sequence, return boolean. + """ unique_chars = set(seq.upper()) return unique_chars <= alphabet_protein -def is_rna(seq): +def is_rna(seq:str): + """ + Check the existence of a RNA sequence, return boolean. + """ unique_chars = set(seq.upper()) return unique_chars <= alphabet_rna -def compute_molecular_weight(seq): +def compute_molecular_weight(seq:str): + """ + Compute molecular weight (g/mol) of protein sequence. + """ molecular_weight = 0 - for amino_acid in seq: + for amino_acid in seq.upper(): molecular_weight += amino_acid_masses[amino_acid] return round(molecular_weight, 3) -def compute_length(seq: str): +def compute_length(seq:str): """ - Compute the length of the input amino acid sequence - + Compute the length of protein sequence. """ return len(seq) -def compute_hydrophobicity(protein): - +def compute_hydrophobicity(protein:str): + """ + Compute the percentage of gydrophobic aminoacids in protein sequence. 
+ """ count_of_gydrophobic = 0 - if is_protein(protein): - for i in range(len(protein)): - if protein[i] in gydrophobic_aminoacids: - count_of_gydrophobic += 1 + for i in range(len(protein)): + if protein[i] in gydrophobic_aminoacids: + count_of_gydrophobic += 1 percentage = round(count_of_gydrophobic / len(protein) * 100, 3) return f"Percentage of gydrophobic aminoacids in {protein} = {percentage}%." -def translation(seq): +def translation(seq:str): """ """ triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] @@ -116,8 +125,9 @@ def translation(seq): return "".join(protein[start:stop + 1]) -def check_mutations(seq, protein): - +def check_mutations(seq:str, protein:str): + """ + """ if is_protein(protein[:-1]) is not True: raise ValueError("Invalid protein sequence") if is_rna(seq) is not True: @@ -136,27 +146,37 @@ def check_mutations(seq, protein): return "Mutations:" + ", ".join(bank_of_mutations) + "." -def run_protein_tools(*seqs_and_procedure): - procedure = seqs_and_procedure[-1] - seqs = seqs_and_procedure[:-1] - +def run_protein_tools(*args:str): + """ + Function containing methods for protein analysis. + + Takes arbitrary number of arguments with protein sequencies + and the name of the procedure to be performed (always the + last argument). Returns the result of the procedure as string + if one sequnce is submitted or list if several. + + If procedure 'check_mutations' is used then input must be only three + arguments: RNA sequence, protein sequence and the name of procedure + itself. 
+ """ + *seqs, procedure = args results = [] + d_of_functions = {'compute_molecular_weight': compute_molecular_weight, + 'compute_length': compute_length, + 'compute_hydrophobicity': compute_hydrophobicity, + } if procedure == 'check_mutations': results.append(check_mutations(seqs[0], seqs[1])) - else: for seq in seqs: - seq = seq.upper() if is_protein(seq) is not True: raise ValueError("Invalid protein sequence") - if procedure == 'compute_molecular_weight': - results.append(molecular_weight(seq)) - elif procedure == 'compute_length': - results.append(compute_length(seq)) - elif procedure == 'compute_hydrophobicity': - results.append(compute_hydrophobicity(seq)) + if procedure not in d_of_functions: + raise ValueError("Wrong procedure name") + else: + results.append(d_of_functions[procedure](seq)) if len(results) == 1: return results[0] else: return results - + From f85159cde2a2e073ec17771d1d0ec4552fa7f7fe Mon Sep 17 00:00:00 2001 From: nikita Date: Fri, 29 Sep 2023 23:59:53 +0300 Subject: [PATCH 15/25] Update translation and check_mutatoins with raise and add docstrings --- protein_tools.py | 57 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index 0452f71..9f4e7dc 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -96,9 +96,16 @@ def compute_length(seq:str): return len(seq) -def compute_hydrophobicity(protein:str): +def compute_hydrophobicity(protein:str) -> str: """ Compute the percentage of gydrophobic aminoacids in protein sequence. + + Argument: + - protein (str): protein sequence. Include hydrophobic + and hydrophilic aminoacids. + + Return: + - str, result of computation percentage of gydrophobic aminoacids. """ count_of_gydrophobic = 0 for i in range(len(protein)): @@ -110,8 +117,16 @@ def compute_hydrophobicity(protein:str): return f"Percentage of gydrophobic aminoacids in {protein} = {percentage}%." 
-def translation(seq:str): +def translation(seq:str) -> str: """ + Realize the translation mRNA into protein sequence. + + Argument: + - seq (str): mRNA sequence + + Return: + - str, protein after translation + Remark: Correct protein sequence starts with "M" and ends with "*". """ triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] protein = [] @@ -120,18 +135,50 @@ def translation(seq:str): if triplet in rna_codons[aminoacid]: protein.append(aminoacid) + if protein[-1] != "*": + raise ValueError("Stop-codon (*) is absent in mRNA") + if protein[0] != "M": + raise ValueError("Start-codon (M) is absent in mRNA") + start = protein.index("M") stop = protein.index("*") return "".join(protein[start:stop + 1]) -def check_mutations(seq:str, protein:str): +def check_mutations(seq:str, protein:str) -> str: """ + Check mutations in the protein sequence after translation. + + Use additional function "translation(seq)". + This function doesn't show mutations, which don't lead to + change aminoacids in protein sequence. + + Arguments: + - seq (str): translation sequence of mRNA with/without mutations + - protein (str): protein for comparison with protein after translation. + Every protein starts with "M" and ends with "*" (stop-codon). + Remark: is_protein(seq) doesn't see "*", but it's used in the other part of function. + + Return: + - str, if mRNA without mutations return "Protein without mutations." + If some mutations in protein, return aminoacid(s) and their position(s) + + Examples: + - "AUGGUAGGGAAAUUUUGA", "MVGKF*" -> "Protein without mutations." + - "AUGGUAGGGAAAUUUUGA", "MGGKF*" -> "Mutations:G2." + - "AUGGUAGGGAAAUUUUGA", "MGGVF*" -> "Mutations:G2, V4." 
+ - "AUGGUAGGGAAAUUUUGA", "MGGKF" –> ValueError: Stop (*) is absent" + - "AUGGUAGGGAAAUUUUGA", "GGKF*" –> ValueError: Start (M) is absent" + """ if is_protein(protein[:-1]) is not True: raise ValueError("Invalid protein sequence") if is_rna(seq) is not True: raise ValueError("Invalid RNA sequence") + if protein[-1] != "*": + raise ValueError("Stop (*) is absent") + if protein[0] != "M": + raise ValueError("Start (M) is absent") correct_protein = translation(seq) bank_of_mutations = [] @@ -180,3 +227,7 @@ def run_protein_tools(*args:str): else: return results +print(run_protein_tools("AUGGUAGGGAAAUUUUGA", "MGGKF*", "check_mutations")) +print(run_protein_tools("GUAGGGAAAUUUUgA", "MGVKF*", "check_mutations")) +#print(translation("AUGGUAGGGAAAUUUUGA")) + From d7575e2d8f8d077980c78b4f4a4772f6a12e947b Mon Sep 17 00:00:00 2001 From: rereremin <114501294+rereremin@users.noreply.github.com> Date: Sat, 30 Sep 2023 01:02:18 +0300 Subject: [PATCH 16/25] Add raise in check_mutations and change return in compute_hydrophobicity --- protein_tools.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index ab0e53a..5dc669a 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -96,9 +96,16 @@ def compute_length(seq:str): return len(seq) -def compute_hydrophobicity(protein:str) -> str: +def compute_hydrophobicity(protein:str) -> tuple: """ Compute the percentage of gydrophobic aminoacids in protein sequence. + + Argument: + - protein (str): protein sequence. Include hydrophobic + and hydrophilic aminoacids. + + Return: + - tuple, result of computation percentage of gydrophobic aminoacids. """ count_of_gydrophobic = 0 for i in range(len(protein)): @@ -107,7 +114,7 @@ def compute_hydrophobicity(protein:str) -> str: percentage = round(count_of_gydrophobic / len(protein) * 100, 3) - return f"Percentage of gydrophobic aminoacids in {protein} = {percentage}%." 
+ return protein, percentage def translate_rna(seq:str) -> str: @@ -160,9 +167,9 @@ def check_mutations(seq:str, protein:str) -> str: Examples: - "AUGGUAGGGAAAUUUUGA", "MVGKF*" -> "Protein without mutations." - "AUGGUAGGGAAAUUUUGA", "MGGVF*" -> "Mutations:G2, V4." - - "AUGGUAGGGAAAUUUUGA", "MGGKF" –> ValueError: Stop (*) is absent" - - "AUGGUAGGGAAAUUUUGA", "GGKF*" –> ValueError: Start (M) is absent" - + - "AUGGUAGGGAAAUUUUGA", "MGGKF" –> "ValueError: Stop (*) is absent" + - "AUGGUAGGGAAAUUUUGA", "GGKF*" –> "ValueError: Start (M) is absent" + - "AUGAAAAAAUGA", "MK*" -> "ValueError: Different length of translated protein and protein" """ correct_protein = translation(seq) @@ -176,6 +183,8 @@ def check_mutations(seq:str, protein:str) -> str: raise ValueError("Stop (*) is absent") if protein[0] != "M": raise ValueError("Start (M) is absent") + if len(protein) != len(seq)/3: + raise ValueError("Different length of translated protein and protein") for i in range(len(correct_protein)): if correct_protein[i] != protein[i]: From 5fddd4fb15f1b6a70756ac02dd5e0decb20c7e5a Mon Sep 17 00:00:00 2001 From: Artyom Date: Sun, 1 Oct 2023 04:35:55 +0300 Subject: [PATCH 17/25] Edit docstrings --- protein_tools.py | 61 +++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index 5dc669a..68e7bb8 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -79,21 +79,34 @@ def is_rna(seq:str): return unique_chars <= alphabet_rna -def compute_molecular_weight(seq:str): +def compute_molecular_weight(seq:str) -> tuple: """ Compute molecular weight (g/mol) of protein sequence. + + Argument: + - protein (str): protein sequence. + + Return: + - tuple with protein sequence and computed molecular + weight (float rounded to 3 decimal places). 
""" molecular_weight = 0 for amino_acid in seq.upper(): molecular_weight += amino_acid_masses[amino_acid] - return round(molecular_weight, 3) + return seq, round(molecular_weight, 3) -def compute_length(seq:str): +def compute_length(seq:str) -> tuple: """ Compute the length of protein sequence. + + Argument: + - protein (str): protein sequence. + + Return: + - tuple with protein sequence and computed length. """ - return len(seq) + return seq, len(seq) def compute_hydrophobicity(protein:str) -> tuple: @@ -101,11 +114,11 @@ def compute_hydrophobicity(protein:str) -> tuple: Compute the percentage of gydrophobic aminoacids in protein sequence. Argument: - - protein (str): protein sequence. Include hydrophobic - and hydrophilic aminoacids. + - protein (str): protein sequence. Return: - - tuple, result of computation percentage of gydrophobic aminoacids. + - tuple with protein sequence and computed percentage + of gydrophobic aminoacids. """ count_of_gydrophobic = 0 for i in range(len(protein)): @@ -148,21 +161,21 @@ def translate_rna(seq:str) -> str: def check_mutations(seq:str, protein:str) -> str: """ - Check mutations in the protein sequence after translation. + Check missense mutations in the protein sequence after translation. - Use additional function "translation(seq)". - This function doesn't show mutations, which don't lead to - change aminoacids in protein sequence. + Uses additional function "translate_rna(seq)". Arguments: - - seq (str): translation sequence of mRNA with/without mutations - - protein (str): protein for comparison with protein after translation. - Every protein starts with "M" and ends with "*" (stop-codon). - Remark: is_protein(seq) doesn't see "*", but it's used in the other part of function. + - seq (str): sequence of mRNA with/without mutations. + Must contain start-codon and one of the stop-codons. + - protein (str): protein sequence translated from mRNA. + Must start with "M" and ends with "*" (stop-codon). 
+ + Note: is_protein(seq) doesn't see "*", but it's used in the other part of function. Return: - str, if mRNA without mutations return "Protein without mutations." - If some mutations in protein, return aminoacid(s) and their position(s) + If there are mutations in protein, returns aminoacid(s) and their position(s) Examples: - "AUGGUAGGGAAAUUUUGA", "MVGKF*" -> "Protein without mutations." @@ -172,7 +185,7 @@ def check_mutations(seq:str, protein:str) -> str: - "AUGAAAAAAUGA", "MK*" -> "ValueError: Different length of translated protein and protein" """ - correct_protein = translation(seq) + correct_protein = translate_rna(seq) bank_of_mutations = [] if is_protein(protein[:-1]) is not True: @@ -193,7 +206,7 @@ def check_mutations(seq:str, protein:str) -> str: if len(bank_of_mutations) == 0: return "Protein without mutations." else: - return "Mutations:" + ", ".join(bank_of_mutations) + "." + return "Mutations: " + ", ".join(bank_of_mutations) + "." def run_protein_tools(*args:str): @@ -202,12 +215,12 @@ def run_protein_tools(*args:str): Takes arbitrary number of arguments with protein sequencies and the name of the procedure to be performed (always the - last argument). Returns the result of the procedure as string - if one sequnce is submitted or list if several. + last argument). Returns the result of the procedure as tuple + if one sequnce is submitted or list of tuples if several. - If procedure 'check_mutations' is used then input must be only three - arguments: RNA sequence, protein sequence and the name of procedure - itself. + Note: if procedure 'check_mutations' is used then input must + contain only three arguments: RNA sequence, protein sequence + and the name of procedure itself. 
""" *seqs, procedure = args results = [] @@ -229,4 +242,4 @@ def run_protein_tools(*args:str): return results[0] else: return results - + \ No newline at end of file From 54171eb733b9c84fda2f0b36597c0e0a2cbc3140 Mon Sep 17 00:00:00 2001 From: Artem Toropov <144557024+artyomtorr@users.noreply.github.com> Date: Sun, 1 Oct 2023 01:46:52 +0300 Subject: [PATCH 18/25] Update README.md --- README.md | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index b6b838a..f8b086e 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ### Usage -The tool works by calling the function `run_protein_tools`, which takes arbitrary number of arguments with protein sequencies (*str*) and the name of the procedure to be performed (always the last argument, *str*, see the usage examples below). The output is the result of the procedure as *string* if one sequence is submitted or *list* if several. +The tool works by calling the function `run_protein_tools`, which takes arbitrary number of arguments with protein sequencies (*str*) and the name of the procedure to be performed (always the last argument, *str*, see the usage examples below). The output is the result of the procedure as *string, tuple* or *dictionary* if one sequence is submitted or *list* if several. **NOTE:** For the procedure `check_mutations` a fixed number of string arguments are used: one RNA sequence, one protein sequence and the name of procedure itself. 
@@ -13,25 +13,45 @@ The tool works by calling the function `run_protein_tools`, which takes arbitrar - `compute_molecular_weight` — computes molecular weight of protein sequence in g/mol - `compute_length` — computes the number of amino acids in protein sequence - `compute_hydrophobicity` — computes the percentage of gydrophobic aminoacids in protein sequence -- `check_mutations` — -- +- `check_mutations` — checks missense mutations in the protein sequence after translation +- `protein_to_nucleic_acid`- returns possible variants of DNAs for a given protein sequence +- `count_amino_acids` - calculates the number of each aminoacid in protein sequence ### Examples ```python -run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'GSCKRGPRT', 'compute_length') # [10, 18, 9] -run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'GSCKRGPRT', 'compute_molecular_weight') # [1055.496, 1886.872, 942.482] -run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'GSCKRGPRT', 'compute_hydrophobicity') # [50.0, 27.778, 11.111] -run_protein_tools('AUGGAUCAUcAAUAA', 'MDKL*', 'check_mutations') #'Mutations:K3, L4.' +run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'compute_length') +#[('MAEGEITNLP', 10), ('tGQYLAMDTSgLLYGSQT', 18)] + +run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'compute_molecular_weight') +#[('MAEGEITNLP', 1055.496), ('tGQYLAMDTSgLLYGSQT', 1886.872)] + +run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'compute_hydrophobicity') +#[('MAEGEITNLP', 50.0), ('tGQYLAMDTSgLLYGSQT', 27.778)] + +run_protein_tools('AUGGAUCAUcAAUAA', 'MDKL*', 'check_mutations') +#'Mutations: K3, L4.' 
+ +run_protein_tools('MAEGLP', 'LYGSQT','protein_to_nucleic_acid') +#['ATG GCT/GCC/GCA/GCG GAA/GAG GGT/GGC/GGA/GGG TTA/TTG/CTT/CTC/CTA/CTG CCT/CCC/CCA/CCG', +#'TTA/TTG/CTT/CTC/CTA/CTG TAT/TAC GGT/GGC/GGA/GGG TCT/TCC/TCA/TCG/AGT/AGC CAA/CAG ACT/ACC/ACA/ACG'] + +run_protein_tools('MAEGLP', 'LYGSQT','count_amino_acids') +#[{'M': 1, 'A': 1, 'E': 1, 'G': 1, 'L': 1, 'P': 1}, +#{'L': 1, 'Y': 1, 'G': 1, 'S': 1, 'Q': 1, 'T': 1}] ``` ### Additional information - The program works **only** with protein and RNA sequences. If any of the entered sequences contain inappropriate characters or cannot exist, the program will display an error. Sequences can contain characters of any case. ```python -run_protein_tools('PROTEIN', 'compute_molecular_weight') # ValueError: Invalid protein sequence -run_protein_tools('AUGGAU_AUcAAUAA', 'MDKL*', 'check_mutations')# ValueError: Invalid RNA sequence +run_protein_tools('PROTEIN', 'compute_molecular_weight') #ValueError: Invalid protein sequence +run_protein_tools('AUGGAU_AUcAAUAA', 'MDKL*', 'check_mutations') #ValueError: Invalid RNA sequence +``` +- For the procedure `check_mutations` there are extra requirements for RNA and protein sequences: mRNA sequences must contain **start-codon** and **one of the stop-codons**, protein sequences must start with **"M"** and end with **"*"** (stop-codon). +```python +run_protein_tools("AUGGUAGGGAAAUUUUGA", "MGGKF", 'check_mutations') #ValueError: Stop (*) is absent +run_protein_tools("AUGGUAGGGAAAUUUUGA", "GGKF*", 'check_mutations') #ValueError: Start (M) is absent ``` - ### Contacts Please use contacts below to reach out with any comments, concerns, or discussions regarding **protein_tools.py.**
- Artyom Toropov ([@artyomtorr](https://github.com/artyomtorr/))
@@ -41,6 +61,6 @@ Please use contacts below to reach out with any comments, concerns, or discussio *Author contributions:*
-Artyom Toropov (teamlead): functions `is_protein`, `compute_molecular_weight`, `run_protein_tools`
-Sofiya Vinogradova: functions ...,
-Nikita Zherko: functions `compute_hydrophobicity`, `check_mutations` +Artyom Toropov (teamlead): functions `is_protein`, `is_rna`, `compute_molecular_weight`, `run_protein_tools`
+Sofiya Vinogradova: functions `compute_length`, `count_amino_acids`, `protein_to_nucleic_acid`
+Nikita Zherko: functions `compute_hydrophobicity`, `translate_rna`, `check_mutations` From 7acecfbcabe59a712dbf26fdf1e8929940703388 Mon Sep 17 00:00:00 2001 From: sofiyaga Date: Sun, 1 Oct 2023 10:54:36 +0300 Subject: [PATCH 19/25] Update file from HW4_Toropov branch --- protein_tools.py | 385 ++++++++++++++++++++++++++++------------------- 1 file changed, 232 insertions(+), 153 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index e0fb41b..5dc669a 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -1,153 +1,232 @@ -codon_table = { - 'A': ['GCT', 'GCC', 'GCA', 'GCG'], - 'C': ['TGT', 'TGC'], - 'D': ['GAT', 'GAC'], - 'E': ['GAA', 'GAG'], - 'F': ['TTT', 'TTC'], - 'G': ['GGT', 'GGC', 'GGA', 'GGG'], - 'H': ['CAT', 'CAC'], - 'I': ['ATT', 'ATC', 'ATA'], - 'K': ['AAA', 'AAG'], - 'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'], - 'M': ['ATG'], - 'N': ['AAT', 'AAC'], - 'P': ['CCT', 'CCC', 'CCA', 'CCG'], - 'Q': ['CAA', 'CAG'], - 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], - 'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'], - 'T': ['ACT', 'ACC', 'ACA', 'ACG'], - 'V': ['GTT', 'GTC', 'GTA', 'GTG'], - 'W': ['TGG'], - 'Y': ['TAT', 'TAC']} - - - -def compute_length(*seqs: str): - """ - Compute the length of the input amino acid sequence - - """ - lens = [] - for seq in seqs: - if is_protein(seq): - lens.append(len(seq)) - else: - raise ValueError('Not a protein') - return lens if len(lens) > 1 else lens[0] - - -def level_of_hydrophobic(protein): - gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} - - count_of_gydrophobic = 0 - if is_protein(protein): - for i in range(len(protein)): - if protein[i] in gydrophobic_aminoacids: - count_of_gydrophobic += 1 - - percentage = count_of_gydrophobic / len(protein) * 100 - - return f"Percentage of gydrophobic aminoacids in {protein} = {percentage}%." 
- - -def translation(seq): - """ - """ - gene_code = { - "F": ["UUC", "UUU"], "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"], - "I": ["AUU", "AUC", "AUA"], "M": ["AUG"], "V": ["GUU", "GUC", "GUA", "GUG"], - "S": ["UCU", "UCC", "UCA", "UCG"], "P": ["CCU", "CCC", "CCA", "CCG"], - "T": ["ACU", "ACC", "ACA", "ACG"], "A": ["GCU", "GCC", "GCA", "GCG"], - "Y": ["UAC", "UAU"], "*": ["UAA", "UAG", "UGA"], "H": ["CAU", "CAC"], - "Q": ["CAA", "CAG"], "N": ["AAU", "AAC"], - "K": ["AAA", "AAG"], "D": ["GAU", "GAC"], "E": ["GAA", "GAG"], - "C": ["UGU", "UGC"], "W": ["UGG"], "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], - "S": ["AGU", "AGC"], "G": ["GGU", "GGC", "GGA", "GGG"] - } - triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] - protein = [] - for triplet in triplets: - for aminoacid in gene_code.keys(): - if triplet in gene_code[aminoacid]: - protein.append(aminoacid) - - if is_protein("".join(protein)): - start = protein.index("M") - stop = protein.index("*") - return "".join(protein[start:stop + 1]) - else: - return "This sequence doesn't include the gene." - - -def mutations(seq, protein): - correct_protein = translation(seq) - - if is_protein(protein): - bank_of_mutations = [] - for i in range(len(correct_protein)): - if correct_protein[i] != protein[i]: - bank_of_mutations.append(f'{protein[i]}{i + 1}') - - if len(bank_of_mutations) == 0: - return "Protein without mutations." - else: - return "Mutations:" + ", ".join(bank_of_mutations) + "." - else: - return "It isn't a protein." 
- - -alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} -amino_acid_masses = { - 'A': 71.03711, - 'R': 156.10111, - 'N': 114.04293, - 'D': 115.02694, - 'C': 103.00919, - 'Q': 128.05858, - 'E': 129.04259, - 'G': 57.02146, - 'H': 137.05891, - 'I': 113.08406, - 'L': 113.08406, - 'K': 128.09496, - 'M': 131.04049, - 'F': 147.06841, - 'P': 97.05276, - 'S': 87.03203, - 'T': 101.04768, - 'W': 186.07931, - 'Y': 163.06333, - 'V': 99.06841 -} - - -def is_protein(seq): - unique_chars = set(seq) - return unique_chars <= alphabet_protein - - -def molecular_weight(seq): - molecular_weight = 0 - for amino_acid in seq: - molecular_weight += amino_acid_masses[amino_acid] - return round(molecular_weight, 3) - - -def run_protein_tools(*seqs_and_procedure): - procedure = seqs_and_procedure[-1] - seqs = seqs_and_procedure[:-1] - - results = [] - - for seq in seqs: - seq = seq.upper() - if is_protein(seq) is not True: - raise ValueError("Invalid alphabet") - if procedure == 'molecular_weight': - results.append(molecular_weight(seq)) - - if len(results) == 1: - return results[0] - else: - return results - +alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} + +alphabet_rna = {'A', 'U', 'G', 'C'} + +amino_acid_masses = { + 'A': 71.03711, + 'R': 156.10111, + 'N': 114.04293, + 'D': 115.02694, + 'C': 103.00919, + 'Q': 128.05858, + 'E': 129.04259, + 'G': 57.02146, + 'H': 137.05891, + 'I': 113.08406, + 'L': 113.08406, + 'K': 128.09496, + 'M': 131.04049, + 'F': 147.06841, + 'P': 97.05276, + 'S': 87.03203, + 'T': 101.04768, + 'W': 186.07931, + 'Y': 163.06333, + 'V': 99.06841 +} + +gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} + +dna_codons = { + 'A': ['GCT', 'GCC', 'GCA', 'GCG'], + 'C': ['TGT', 'TGC'], + 'D': ['GAT', 'GAC'], + 'E': ['GAA', 'GAG'], + 'F': ['TTT', 'TTC'], + 'G': ['GGT', 'GGC', 'GGA', 'GGG'], + 'H': ['CAT', 'CAC'], + 'I': 
['ATT', 'ATC', 'ATA'], + 'K': ['AAA', 'AAG'], + 'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'], + 'M': ['ATG'], + 'N': ['AAT', 'AAC'], + 'P': ['CCT', 'CCC', 'CCA', 'CCG'], + 'Q': ['CAA', 'CAG'], + 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], + 'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'], + 'T': ['ACT', 'ACC', 'ACA', 'ACG'], + 'V': ['GTT', 'GTC', 'GTA', 'GTG'], + 'W': ['TGG'], + 'Y': ['TAT', 'TAC'], + '*': ["UAA", "UAG", "UGA"]} + +rna_codons = { + "F": ["UUC", "UUU"], "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"], + "I": ["AUU", "AUC", "AUA"], "M": ["AUG"], "V": ["GUU", "GUC", "GUA", "GUG"], + "S": ["UCU", "UCC", "UCA", "UCG"], "P": ["CCU", "CCC", "CCA", "CCG"], + "T": ["ACU", "ACC", "ACA", "ACG"], "A": ["GCU", "GCC", "GCA", "GCG"], + "Y": ["UAC", "UAU"], "*": ["UAA", "UAG", "UGA"], "H": ["CAU", "CAC"], + "Q": ["CAA", "CAG"], "N": ["AAU", "AAC"], + "K": ["AAA", "AAG"], "D": ["GAU", "GAC"], "E": ["GAA", "GAG"], + "C": ["UGU", "UGC"], "W": ["UGG"], "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], + "S": ["AGU", "AGC"], "G": ["GGU", "GGC", "GGA", "GGG"] + } + + +def is_protein(seq:str): + """ + Check the existence of a protein sequence, return boolean. + """ + unique_chars = set(seq.upper()) + return unique_chars <= alphabet_protein + + +def is_rna(seq:str): + """ + Check the existence of a RNA sequence, return boolean. + """ + unique_chars = set(seq.upper()) + return unique_chars <= alphabet_rna + + +def compute_molecular_weight(seq:str): + """ + Compute molecular weight (g/mol) of protein sequence. + """ + molecular_weight = 0 + for amino_acid in seq.upper(): + molecular_weight += amino_acid_masses[amino_acid] + return round(molecular_weight, 3) + + +def compute_length(seq:str): + """ + Compute the length of protein sequence. + """ + return len(seq) + + +def compute_hydrophobicity(protein:str) -> tuple: + """ + Compute the percentage of gydrophobic aminoacids in protein sequence. + + Argument: + - protein (str): protein sequence. 
Include hydrophobic + and hydrophilic aminoacids. + + Return: + - tuple, result of computation percentage of gydrophobic aminoacids. + """ + count_of_gydrophobic = 0 + for i in range(len(protein)): + if protein[i] in gydrophobic_aminoacids: + count_of_gydrophobic += 1 + + percentage = round(count_of_gydrophobic / len(protein) * 100, 3) + + return protein, percentage + + +def translate_rna(seq:str) -> str: + """ + Perform the translation of mRNA seguence into protein sequence. + + Argument: + - seq (str): mRNA sequence. Must contain start-codon and one of + the stop-codons. + + Return: + - str, protein sequence after translation. + Always starts with "M" and ends with "*". + """ + triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] + protein = [] + for triplet in triplets: + for aminoacid in rna_codons.keys(): + if triplet in rna_codons[aminoacid]: + protein.append(aminoacid) + + if protein[-1] != "*": + raise ValueError("Stop-codon (*) is absent in mRNA") + if protein[0] != "M": + raise ValueError("Start-codon (M) is absent in mRNA") + + start = protein.index("M") + stop = protein.index("*") + return "".join(protein[start:stop + 1]) + + +def check_mutations(seq:str, protein:str) -> str: + """ + Check mutations in the protein sequence after translation. + + Use additional function "translation(seq)". + This function doesn't show mutations, which don't lead to + change aminoacids in protein sequence. + + Arguments: + - seq (str): translation sequence of mRNA with/without mutations + - protein (str): protein for comparison with protein after translation. + Every protein starts with "M" and ends with "*" (stop-codon). + Remark: is_protein(seq) doesn't see "*", but it's used in the other part of function. + + Return: + - str, if mRNA without mutations return "Protein without mutations." + If some mutations in protein, return aminoacid(s) and their position(s) + + Examples: + - "AUGGUAGGGAAAUUUUGA", "MVGKF*" -> "Protein without mutations." 
+ - "AUGGUAGGGAAAUUUUGA", "MGGVF*" -> "Mutations:G2, V4." + - "AUGGUAGGGAAAUUUUGA", "MGGKF" –> "ValueError: Stop (*) is absent" + - "AUGGUAGGGAAAUUUUGA", "GGKF*" –> "ValueError: Start (M) is absent" + - "AUGAAAAAAUGA", "MK*" -> "ValueError: Different length of translated protein and protein" + """ + + correct_protein = translation(seq) + bank_of_mutations = [] + + if is_protein(protein[:-1]) is not True: + raise ValueError("Invalid protein sequence") + if is_rna(seq) is not True: + raise ValueError("Invalid RNA sequence") + if protein[-1] != "*": + raise ValueError("Stop (*) is absent") + if protein[0] != "M": + raise ValueError("Start (M) is absent") + if len(protein) != len(seq)/3: + raise ValueError("Different length of translated protein and protein") + + for i in range(len(correct_protein)): + if correct_protein[i] != protein[i]: + bank_of_mutations.append(f'{protein[i]}{i + 1}') + + if len(bank_of_mutations) == 0: + return "Protein without mutations." + else: + return "Mutations:" + ", ".join(bank_of_mutations) + "." + + +def run_protein_tools(*args:str): + """ + Function containing methods for protein analysis. + + Takes arbitrary number of arguments with protein sequencies + and the name of the procedure to be performed (always the + last argument). Returns the result of the procedure as string + if one sequnce is submitted or list if several. + + If procedure 'check_mutations' is used then input must be only three + arguments: RNA sequence, protein sequence and the name of procedure + itself. 
+ """ + *seqs, procedure = args + results = [] + d_of_functions = {'compute_molecular_weight': compute_molecular_weight, + 'compute_length': compute_length, + 'compute_hydrophobicity': compute_hydrophobicity, + } + if procedure == 'check_mutations': + results.append(check_mutations(seqs[0], seqs[1])) + else: + for seq in seqs: + if is_protein(seq) is not True: + raise ValueError("Invalid protein sequence") + if procedure not in d_of_functions: + raise ValueError("Wrong procedure name") + else: + results.append(d_of_functions[procedure](seq)) + if len(results) == 1: + return results[0] + else: + return results + From a2022bc6de1e1bafe51b9522c7ab089a72507c1f Mon Sep 17 00:00:00 2001 From: sofiyaga Date: Sun, 1 Oct 2023 10:57:18 +0300 Subject: [PATCH 20/25] Add docstrings and typing in function compute_length --- protein_tools.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/protein_tools.py b/protein_tools.py index 5dc669a..4e131ec 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -89,11 +89,21 @@ def compute_molecular_weight(seq:str): return round(molecular_weight, 3) -def compute_length(seq:str): +def compute_length(protein: str) -> int: """ - Compute the length of protein sequence. + Compute the length of the input protein sequence. + + Argument: + - protein (str): protein sequence. + + Return: + - string, length of the input protein sequence. 
+ + Example: + + 'MGHIKCE' -> 7 """ - return len(seq) + return len(protein) def compute_hydrophobicity(protein:str) -> tuple: From b89318081fc287652beb7adc795d199c921f8b2c Mon Sep 17 00:00:00 2001 From: sofiyaga Date: Sun, 1 Oct 2023 10:59:32 +0300 Subject: [PATCH 21/25] Add function protein_to_dna --- protein_tools.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/protein_tools.py b/protein_tools.py index 4e131ec..fc6e23d 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -103,7 +103,36 @@ def compute_length(protein: str) -> int: 'MGHIKCE' -> 7 """ - return len(protein) + return len(protein) + +def protein_to_dna(protein: str) -> str: + + """ + Returns possible variants of DNAs for a given protein sequence. + + Argument: + - protein (str): protein sequence. + + Return: + - string, variants of nucleic acids. + If several codons correspond to a given amino acid they are displayed with a '/'. + + Does not distinguish between lowercase and uppercase letters. 
+ + Examples: + + -'MACDRS' -> 'ATG GCT/GCC/GCA/GCG TGT/TGC GAT/GAC CGT/CGC/CGA/CGG/AGA/AGG TCT/TCC/TCA/TCG/AGT/AGC' + -'MaCdrS' -> 'ATG GCT/GCC/GCA/GCG TGT/TGC GAT/GAC CGT/CGC/CGA/CGG/AGA/AGG TCT/TCC/TCA/TCG/AGT/AGC' + + """ + nucleic_acid_seq = '' + + for aa in protein.upper(): + codons = dna_codons.get(aa) + nucleic_acid_seq += '/'.join(codons) + ' ' + + + return nucleic_acid_seq.replace(' ', '', -1) def compute_hydrophobicity(protein:str) -> tuple: From 3742858a22ce2314df332a72da9c3169835a85e1 Mon Sep 17 00:00:00 2001 From: sofiyaga Date: Sun, 1 Oct 2023 11:00:05 +0300 Subject: [PATCH 22/25] def count_amino_acids(protein: str) -> dict: --- protein_tools.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/protein_tools.py b/protein_tools.py index fc6e23d..5b88f98 100644 --- a/protein_tools.py +++ b/protein_tools.py @@ -134,6 +134,33 @@ def protein_to_dna(protein: str) -> str: return nucleic_acid_seq.replace(' ', '', -1) +def count_amino_acids(protein: str) -> dict: + + """ + Calculates the number of each aminoacid in a given protein sequence. + + Argument: + - protein (str): protein sequence. + + Return: + - dictionary, where a key is the aminoacid letter and value is number of this aminoacid. + + Does not distinguish between lowercase and uppercase letters. 
+ + Examples: + + -'MACDRS' -> {'M': 1, 'A': 1, 'C': 1, 'D': 1, 'R': 1, 'S': 1} + -'MaCdrS' -> {'M': 1, 'A': 1, 'C': 1, 'D': 1, 'R': 1, 'S': 1} + + """ + + amino_acids_dict = {} + for aa in protein.upper(): + if aa in amino_acids_dict: + amino_acids_dict[aa] += 1 + else: + amino_acids_dict[aa] = 1 + return amino_acids_dict def compute_hydrophobicity(protein:str) -> tuple: """ From 161edf180f250ca3aaf30d84adc494b1ec681dcb Mon Sep 17 00:00:00 2001 From: Artem Toropov <144557024+artyomtorr@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:04:32 +0100 Subject: [PATCH 23/25] Fix typos in README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f8b086e..734790f 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The tool works by calling the function `run_protein_tools`, which takes arbitrar - `compute_length` — computes the number of amino acids in protein sequence - `compute_hydrophobicity` — computes the percentage of gydrophobic aminoacids in protein sequence - `check_mutations` — checks missense mutations in the protein sequence after translation -- `protein_to_nucleic_acid`- returns possible variants of DNAs for a given protein sequence +- `protein_to_dna`- returns possible variants of DNAs for a given protein sequence - `count_amino_acids` - calculates the number of each aminoacid in protein sequence ### Examples @@ -31,7 +31,7 @@ run_protein_tools('MAEGEITNLP', 'tGQYLAMDTSgLLYGSQT', 'compute_hydrophobicity') run_protein_tools('AUGGAUCAUcAAUAA', 'MDKL*', 'check_mutations') #'Mutations: K3, L4.' 
-run_protein_tools('MAEGLP', 'LYGSQT','protein_to_nucleic_acid') +run_protein_tools('MAEGLP', 'LYGSQT','protein_to_dna') #['ATG GCT/GCC/GCA/GCG GAA/GAG GGT/GGC/GGA/GGG TTA/TTG/CTT/CTC/CTA/CTG CCT/CCC/CCA/CCG', #'TTA/TTG/CTT/CTC/CTA/CTG TAT/TAC GGT/GGC/GGA/GGG TCT/TCC/TCA/TCG/AGT/AGC CAA/CAG ACT/ACC/ACA/ACG'] @@ -62,5 +62,5 @@ Please use contacts below to reach out with any comments, concerns, or discussio *Author contributions:*
Artyom Toropov (teamlead): functions `is_protein`, `is_rna`, `compute_molecular_weight`, `run_protein_tools`
-Sofiya Vinogradova: functions `compute_length`, `count_amino_acids`, `protein_to_nucleic_acid`
+Sofiya Vinogradova: functions `compute_length`, `count_amino_acids`, `protein_to_dna`
Nikita Zherko: functions `compute_hydrophobicity`, `translate_rna`, `check_mutations` From c3b14775483432ab2ee509e539b4f14b027be163 Mon Sep 17 00:00:00 2001 From: Artyom Date: Sun, 1 Oct 2023 13:18:06 +0100 Subject: [PATCH 24/25] Fix run_protein_tools function and format PEP8 --- HW4_Toropov/protein_tools.py | 390 ++++++++++++++++++++++++++++++----- 1 file changed, 336 insertions(+), 54 deletions(-) diff --git a/HW4_Toropov/protein_tools.py b/HW4_Toropov/protein_tools.py index 87bff7b..50b670f 100644 --- a/HW4_Toropov/protein_tools.py +++ b/HW4_Toropov/protein_tools.py @@ -1,54 +1,336 @@ -alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} -amino_acid_masses = { - 'A': 71.03711, - 'R': 156.10111, - 'N': 114.04293, - 'D': 115.02694, - 'C': 103.00919, - 'Q': 128.05858, - 'E': 129.04259, - 'G': 57.02146, - 'H': 137.05891, - 'I': 113.08406, - 'L': 113.08406, - 'K': 128.09496, - 'M': 131.04049, - 'F': 147.06841, - 'P': 97.05276, - 'S': 87.03203, - 'T': 101.04768, - 'W': 186.07931, - 'Y': 163.06333, - 'V': 99.06841 -} - - -def is_protein(seq): - unique_chars = set(seq) - return unique_chars <= alphabet_protein - - -def molecular_weight(seq): - molecular_weight = 0 - for amino_acid in seq: - molecular_weight += amino_acid_masses[amino_acid] - return round(molecular_weight, 3) - - -def run_protein_tools(*seqs_and_procedure): - procedure = seqs_and_procedure[-1] - seqs = seqs_and_procedure[:-1] - - results = [] - - for seq in seqs: - seq = seq.upper() - if is_protein(seq) is not True: - raise ValueError("Invalid alphabet") - if procedure == 'molecular_weight': - results.append(molecular_weight(seq)) - - if len(results) == 1: - return results[0] - else: - return results +alphabet_protein = { + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", +} + +alphabet_rna = {"A", "U", "G", "C"} + +amino_acid_masses = { + "A": 
71.03711, + "R": 156.10111, + "N": 114.04293, + "D": 115.02694, + "C": 103.00919, + "Q": 128.05858, + "E": 129.04259, + "G": 57.02146, + "H": 137.05891, + "I": 113.08406, + "L": 113.08406, + "K": 128.09496, + "M": 131.04049, + "F": 147.06841, + "P": 97.05276, + "S": 87.03203, + "T": 101.04768, + "W": 186.07931, + "Y": 163.06333, + "V": 99.06841, +} + +gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} + +dna_codons = { + "A": ["GCT", "GCC", "GCA", "GCG"], + "C": ["TGT", "TGC"], + "D": ["GAT", "GAC"], + "E": ["GAA", "GAG"], + "F": ["TTT", "TTC"], + "G": ["GGT", "GGC", "GGA", "GGG"], + "H": ["CAT", "CAC"], + "I": ["ATT", "ATC", "ATA"], + "K": ["AAA", "AAG"], + "L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"], + "M": ["ATG"], + "N": ["AAT", "AAC"], + "P": ["CCT", "CCC", "CCA", "CCG"], + "Q": ["CAA", "CAG"], + "R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"], + "S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"], + "T": ["ACT", "ACC", "ACA", "ACG"], + "V": ["GTT", "GTC", "GTA", "GTG"], + "W": ["TGG"], + "Y": ["TAT", "TAC"], + "*": ["UAA", "UAG", "UGA"], +} + +rna_codons = { + "F": ["UUC", "UUU"], + "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"], + "I": ["AUU", "AUC", "AUA"], + "M": ["AUG"], + "V": ["GUU", "GUC", "GUA", "GUG"], + "S": ["UCU", "UCC", "UCA", "UCG"], + "P": ["CCU", "CCC", "CCA", "CCG"], + "T": ["ACU", "ACC", "ACA", "ACG"], + "A": ["GCU", "GCC", "GCA", "GCG"], + "Y": ["UAC", "UAU"], + "*": ["UAA", "UAG", "UGA"], + "H": ["CAU", "CAC"], + "Q": ["CAA", "CAG"], + "N": ["AAU", "AAC"], + "K": ["AAA", "AAG"], + "D": ["GAU", "GAC"], + "E": ["GAA", "GAG"], + "C": ["UGU", "UGC"], + "W": ["UGG"], + "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], + "S": ["AGU", "AGC"], + "G": ["GGU", "GGC", "GGA", "GGG"], +} + + +def is_protein(seq: str): + """ + Check the existence of a protein sequence, return boolean. 
+ """ + unique_chars = set(seq.upper()) + return unique_chars <= alphabet_protein + + +def is_rna(seq: str): + """ + Check the existence of a RNA sequence, return boolean. + """ + unique_chars = set(seq.upper()) + return unique_chars <= alphabet_rna + + +def compute_molecular_weight(protein: str) -> tuple: + """ + Compute molecular weight (g/mol) of protein sequence. + + Argument: + - protein (str): protein sequence. + + Return: + - tuple with protein sequence and computed molecular + weight (float rounded to 3 decimal places). + """ + molecular_weight = 0 + for amino_acid in protein.upper(): + molecular_weight += amino_acid_masses[amino_acid] + return protein, round(molecular_weight, 3) + + +def compute_length(protein: str) -> tuple: + """ + Compute the length of the input protein sequence. + + Argument: + - protein (str): protein sequence. + + Return: + - tuple with protein sequence and computed length. + """ + return protein, len(protein) + + +def protein_to_dna(protein: str) -> str: + """ + Returns possible variants of DNAs for a given protein sequence. + + Argument: + - protein (str): protein sequence. + + Return: + - string, variants of nucleic acids. + If several codons correspond to a given amino acid they are displayed with a '/'. + + Does not distinguish between lowercase and uppercase letters. + + Examples: + + -'MACDRS' -> 'ATG GCT/GCC/GCA/GCG TGT/TGC GAT/GAC CGT/CGC/CGA/CGG/AGA/AGG TCT/TCC/TCA/TCG/AGT/AGC' + -'MaCdrS' -> 'ATG GCT/GCC/GCA/GCG TGT/TGC GAT/GAC CGT/CGC/CGA/CGG/AGA/AGG TCT/TCC/TCA/TCG/AGT/AGC' + """ + nucleic_acid_seq = "" + + for aa in protein.upper(): + codons = dna_codons.get(aa) + nucleic_acid_seq += "/".join(codons) + " " + + return nucleic_acid_seq[:-1] + + +def count_amino_acids(protein: str) -> dict: + """ + Calculates the number of each aminoacid in a given protein sequence. + + Argument: + - protein (str): protein sequence. + + Return: + - dictionary, where a key is the aminoacid letter and value is number of this aminoacid. 
+ + Does not distinguish between lowercase and uppercase letters. + + Examples: + + -'MACDRS' -> {'M': 1, 'A': 1, 'C': 1, 'D': 1, 'R': 1, 'S': 1} + -'MaCdrS' -> {'M': 1, 'A': 1, 'C': 1, 'D': 1, 'R': 1, 'S': 1} + """ + amino_acids_dict = {} + for aa in protein.upper(): + if aa in amino_acids_dict: + amino_acids_dict[aa] += 1 + else: + amino_acids_dict[aa] = 1 + return amino_acids_dict + + +def compute_hydrophobicity(protein: str) -> tuple: + """ + Compute the percentage of gydrophobic aminoacids in protein sequence. + + Argument: + - protein (str): protein sequence. Includes hydrophobic + and hydrophilic aminoacids. + + Return: + - tuple with protein sequence and computed percentage + of gydrophobic aminoacids. + """ + count_of_gydrophobic = 0 + for i in range(len(protein)): + if protein[i] in gydrophobic_aminoacids: + count_of_gydrophobic += 1 + + percentage = round(count_of_gydrophobic / len(protein) * 100, 3) + + return protein, percentage + + +def translate_rna(rna: str) -> str: + """ + Perform the translation of mRNA seguence into protein sequence. + + Argument: + - rna (str): mRNA sequence. Must contain start-codon and one of + the stop-codons. + + Return: + - str, protein sequence after translation. + Always starts with "M" and ends with "*". + """ + triplets = [rna[i : i + 3].upper() for i in range(0, len(rna), 3)] + protein = [] + for triplet in triplets: + for aminoacid in rna_codons.keys(): + if triplet in rna_codons[aminoacid]: + protein.append(aminoacid) + + if protein[-1] != "*": + raise ValueError("Stop-codon (*) is absent in mRNA") + if protein[0] != "M": + raise ValueError("Start-codon (M) is absent in mRNA") + + start = protein.index("M") + stop = protein.index("*") + return "".join(protein[start : stop + 1]) + + +def check_mutations(rna: str, protein: str) -> str: + """ + Check missense mutations in the protein sequence after translation. + + Uses additional function "translate_rna(seq)". 
+ + Arguments: + - rna (str): sequence of mRNA with/without mutations. + Must contain start-codon and one of the stop-codons. + - protein (str): protein sequence translated from mRNA. + Must start with "M" and ends with "*" (stop-codon). + + Note: is_protein(seq) doesn't see "*", but it's used in the other part of function. + + Return: + - str, if mRNA without mutations return "Protein without mutations." + If there are mutations in protein, returns aminoacid(s) and their position(s) + + Examples: + - "AUGGUAGGGAAAUUUUGA", "MVGKF*" -> "Protein without mutations." + - "AUGGUAGGGAAAUUUUGA", "MGGVF*" -> "Mutations:G2, V4." + - "AUGGUAGGGAAAUUUUGA", "MGGKF" –> "ValueError: Stop (*) is absent" + - "AUGGUAGGGAAAUUUUGA", "GGKF*" –> "ValueError: Start (M) is absent" + - "AUGAAAAAAUGA", "MK*" -> "ValueError: Different length of translated protein and protein" + """ + correct_protein = translate_rna(rna) + bank_of_mutations = [] + + if is_protein(protein[:-1]) is not True: + raise ValueError("Invalid protein sequence") + if is_rna(rna) is not True: + raise ValueError("Invalid RNA sequence") + if protein[-1] != "*": + raise ValueError("Stop (*) is absent") + if protein[0] != "M": + raise ValueError("Start (M) is absent") + if len(protein) != len(rna) / 3: + raise ValueError("Different length of translated protein and protein") + + for i in range(len(correct_protein)): + if correct_protein[i] != protein[i]: + bank_of_mutations.append(f"{protein[i]}{i + 1}") + + if len(bank_of_mutations) == 0: + return "Protein without mutations." + else: + return "Mutations: " + ", ".join(bank_of_mutations) + "." + + +def run_protein_tools(*args: str): + """ + Function containing methods for protein analysis. + + Takes arbitrary number of arguments with protein sequencies + and the name of the procedure to be performed (always the last + argument). Returns the result of the procedure as string, tuple + or dictionary if one sequnce is submitted or list if several. 
+ + Note: if procedure 'check_mutations' is used then input must + contain only three arguments: RNA sequence, protein sequence + and the name of procedure itself. + """ + *seqs, procedure = args + results = [] + d_of_functions = { + "compute_molecular_weight": compute_molecular_weight, + "compute_length": compute_length, + "compute_hydrophobicity": compute_hydrophobicity, + "count_amino_acids": count_amino_acids, + "protein_to_dna": protein_to_dna + + } + if procedure == "check_mutations": + results.append(check_mutations(seqs[0], seqs[1])) + else: + for seq in seqs: + if is_protein(seq) is not True: + raise ValueError("Invalid protein sequence") + if procedure not in d_of_functions: + raise ValueError("Wrong procedure name") + else: + results.append(d_of_functions[procedure](seq)) + if len(results) == 1: + return results[0] + else: + return results From b2a9f556b1e2f8cee0fee5a38dab1b2dd486a780 Mon Sep 17 00:00:00 2001 From: Artem Toropov <144557024+artyomtorr@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:23:39 +0100 Subject: [PATCH 25/25] Delete extra protein_tools.py --- protein_tools.py | 301 ----------------------------------------------- 1 file changed, 301 deletions(-) delete mode 100644 protein_tools.py diff --git a/protein_tools.py b/protein_tools.py deleted file mode 100644 index 2b1f826..0000000 --- a/protein_tools.py +++ /dev/null @@ -1,301 +0,0 @@ -alphabet_protein = {'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'} - -alphabet_rna = {'A', 'U', 'G', 'C'} - -amino_acid_masses = { - 'A': 71.03711, - 'R': 156.10111, - 'N': 114.04293, - 'D': 115.02694, - 'C': 103.00919, - 'Q': 128.05858, - 'E': 129.04259, - 'G': 57.02146, - 'H': 137.05891, - 'I': 113.08406, - 'L': 113.08406, - 'K': 128.09496, - 'M': 131.04049, - 'F': 147.06841, - 'P': 97.05276, - 'S': 87.03203, - 'T': 101.04768, - 'W': 186.07931, - 'Y': 163.06333, - 'V': 99.06841 -} - -gydrophobic_aminoacids = {"A", "V", "L", "I", "P", "F", "W", "M"} 
- -dna_codons = { - 'A': ['GCT', 'GCC', 'GCA', 'GCG'], - 'C': ['TGT', 'TGC'], - 'D': ['GAT', 'GAC'], - 'E': ['GAA', 'GAG'], - 'F': ['TTT', 'TTC'], - 'G': ['GGT', 'GGC', 'GGA', 'GGG'], - 'H': ['CAT', 'CAC'], - 'I': ['ATT', 'ATC', 'ATA'], - 'K': ['AAA', 'AAG'], - 'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'], - 'M': ['ATG'], - 'N': ['AAT', 'AAC'], - 'P': ['CCT', 'CCC', 'CCA', 'CCG'], - 'Q': ['CAA', 'CAG'], - 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], - 'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'], - 'T': ['ACT', 'ACC', 'ACA', 'ACG'], - 'V': ['GTT', 'GTC', 'GTA', 'GTG'], - 'W': ['TGG'], - 'Y': ['TAT', 'TAC'], - '*': ["UAA", "UAG", "UGA"]} - -rna_codons = { - "F": ["UUC", "UUU"], "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"], - "I": ["AUU", "AUC", "AUA"], "M": ["AUG"], "V": ["GUU", "GUC", "GUA", "GUG"], - "S": ["UCU", "UCC", "UCA", "UCG"], "P": ["CCU", "CCC", "CCA", "CCG"], - "T": ["ACU", "ACC", "ACA", "ACG"], "A": ["GCU", "GCC", "GCA", "GCG"], - "Y": ["UAC", "UAU"], "*": ["UAA", "UAG", "UGA"], "H": ["CAU", "CAC"], - "Q": ["CAA", "CAG"], "N": ["AAU", "AAC"], - "K": ["AAA", "AAG"], "D": ["GAU", "GAC"], "E": ["GAA", "GAG"], - "C": ["UGU", "UGC"], "W": ["UGG"], "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], - "S": ["AGU", "AGC"], "G": ["GGU", "GGC", "GGA", "GGG"] - } - - -def is_protein(seq:str): - """ - Check the existence of a protein sequence, return boolean. - """ - unique_chars = set(seq.upper()) - return unique_chars <= alphabet_protein - - -def is_rna(seq:str): - """ - Check the existence of a RNA sequence, return boolean. - """ - unique_chars = set(seq.upper()) - return unique_chars <= alphabet_rna - - -def compute_molecular_weight(seq:str) -> tuple: - """ - Compute molecular weight (g/mol) of protein sequence. - - Argument: - - protein (str): protein sequence. - - Return: - - tuple with protein sequence and computed molecular - weight (float rounded to 3 decimal places). 
- """ - molecular_weight = 0 - for amino_acid in seq.upper(): - molecular_weight += amino_acid_masses[amino_acid] - return seq, round(molecular_weight, 3) - - -def compute_length(protein: str) -> tuple: - """ - Compute the length of the input protein sequence. - - Argument: - - protein (str): protein sequence. - - Return: - - tuple with protein sequence and computed length. - """ - return seq, len(seq) - -def protein_to_dna(protein: str) -> str: - - """ - Returns possible variants of DNAs for a given protein sequence. - - Argument: - - protein (str): protein sequence. - - Return: - - string, variants of nucleic acids. - If several codons correspond to a given amino acid they are displayed with a '/'. - - Does not distinguish between lowercase and uppercase letters. - - Examples: - - -'MACDRS' -> 'ATG GCT/GCC/GCA/GCG TGT/TGC GAT/GAC CGT/CGC/CGA/CGG/AGA/AGG TCT/TCC/TCA/TCG/AGT/AGC' - -'MaCdrS' -> 'ATG GCT/GCC/GCA/GCG TGT/TGC GAT/GAC CGT/CGC/CGA/CGG/AGA/AGG TCT/TCC/TCA/TCG/AGT/AGC' - - """ - nucleic_acid_seq = '' - - for aa in protein.upper(): - codons = dna_codons.get(aa) - nucleic_acid_seq += '/'.join(codons) + ' ' - - return nucleic_acid_seq.replace(' ', '', -1) - - -def count_amino_acids(protein: str) -> dict: - - """ - Calculates the number of each aminoacid in a given protein sequence. - - Argument: - - protein (str): protein sequence. - - Return: - - dictionary, where a key is the aminoacid letter and value is number of this aminoacid. - - Does not distinguish between lowercase and uppercase letters. - - Examples: - - -'MACDRS' -> {'M': 1, 'A': 1, 'C': 1, 'D': 1, 'R': 1, 'S': 1} - -'MaCdrS' -> {'M': 1, 'A': 1, 'C': 1, 'D': 1, 'R': 1, 'S': 1} - - """ - - amino_acids_dict = {} - for aa in protein.upper(): - if aa in amino_acids_dict: - amino_acids_dict[aa] += 1 - else: - amino_acids_dict[aa] = 1 - return amino_acids_dict - - -def compute_hydrophobicity(protein:str) -> tuple: - """ - Compute the percentage of gydrophobic aminoacids in protein sequence. 
- - Argument: - - protein (str): protein sequence. Includes hydrophobic - and hydrophilic aminoacids. - - Return: - - tuple with protein sequence and computed percentage - of gydrophobic aminoacids. - """ - count_of_gydrophobic = 0 - for i in range(len(protein)): - if protein[i] in gydrophobic_aminoacids: - count_of_gydrophobic += 1 - - percentage = round(count_of_gydrophobic / len(protein) * 100, 3) - - return protein, percentage - - -def translate_rna(seq:str) -> str: - """ - Perform the translation of mRNA seguence into protein sequence. - - Argument: - - seq (str): mRNA sequence. Must contain start-codon and one of - the stop-codons. - - Return: - - str, protein sequence after translation. - Always starts with "M" and ends with "*". - """ - triplets = [seq[i:i + 3].upper() for i in range(0, len(seq), 3)] - protein = [] - for triplet in triplets: - for aminoacid in rna_codons.keys(): - if triplet in rna_codons[aminoacid]: - protein.append(aminoacid) - - if protein[-1] != "*": - raise ValueError("Stop-codon (*) is absent in mRNA") - if protein[0] != "M": - raise ValueError("Start-codon (M) is absent in mRNA") - - start = protein.index("M") - stop = protein.index("*") - return "".join(protein[start:stop + 1]) - - -def check_mutations(seq:str, protein:str) -> str: - """ - Check missense mutations in the protein sequence after translation. - - Uses additional function "translate_rna(seq)". - - Arguments: - - seq (str): sequence of mRNA with/without mutations. - Must contain start-codon and one of the stop-codons. - - protein (str): protein sequence translated from mRNA. - Must start with "M" and ends with "*" (stop-codon). - - Note: is_protein(seq) doesn't see "*", but it's used in the other part of function. - - Return: - - str, if mRNA without mutations return "Protein without mutations." - If there are mutations in protein, returns aminoacid(s) and their position(s) - - Examples: - - "AUGGUAGGGAAAUUUUGA", "MVGKF*" -> "Protein without mutations." 
- - "AUGGUAGGGAAAUUUUGA", "MGGVF*" -> "Mutations:G2, V4." - - "AUGGUAGGGAAAUUUUGA", "MGGKF" –> "ValueError: Stop (*) is absent" - - "AUGGUAGGGAAAUUUUGA", "GGKF*" –> "ValueError: Start (M) is absent" - - "AUGAAAAAAUGA", "MK*" -> "ValueError: Different length of translated protein and protein" - """ - correct_protein = translate_rna(seq) - bank_of_mutations = [] - - if is_protein(protein[:-1]) is not True: - raise ValueError("Invalid protein sequence") - if is_rna(seq) is not True: - raise ValueError("Invalid RNA sequence") - if protein[-1] != "*": - raise ValueError("Stop (*) is absent") - if protein[0] != "M": - raise ValueError("Start (M) is absent") - if len(protein) != len(seq)/3: - raise ValueError("Different length of translated protein and protein") - - for i in range(len(correct_protein)): - if correct_protein[i] != protein[i]: - bank_of_mutations.append(f'{protein[i]}{i + 1}') - - if len(bank_of_mutations) == 0: - return "Protein without mutations." - else: - return "Mutations: " + ", ".join(bank_of_mutations) + "." - - -def run_protein_tools(*args:str): - """ - Function containing methods for protein analysis. - - Takes arbitrary number of arguments with protein sequencies - and the name of the procedure to be performed (always the last - argument). Returns the result of the procedure as string, tuple - or dictionary if one sequnce is submitted or list if several. - - Note: if procedure 'check_mutations' is used then input must - contain only three arguments: RNA sequence, protein sequence - and the name of procedure itself. 
- """ - *seqs, procedure = args - results = [] - d_of_functions = {'compute_molecular_weight': compute_molecular_weight, - 'compute_length': compute_length, - 'compute_hydrophobicity': compute_hydrophobicity, - } - if procedure == 'check_mutations': - results.append(check_mutations(seqs[0], seqs[1])) - else: - for seq in seqs: - if is_protein(seq) is not True: - raise ValueError("Invalid protein sequence") - if procedure not in d_of_functions: - raise ValueError("Wrong procedure name") - else: - results.append(d_of_functions[procedure](seq)) - if len(results) == 1: - return results[0] - else: - return results