-
Notifications
You must be signed in to change notification settings - Fork 45
HW4_Toropov #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
HW4_Toropov #2
Changes from all commits
795f31a
d2db32d
8ecd46d
715793a
3b28338
4fb8e57
0584195
490b985
f58a53a
1746a23
e48afa0
34ed7cd
21c12af
aa4618c
ed17f8a
a25248c
7d1e07e
140222d
f85159c
21a57d9
e0280bf
d7575e2
e1bc19e
5fddd4f
54171eb
7acecfb
a2022bc
b893180
3742858
71535e5
dc1022f
161edf1
c3b1477
b2a9f55
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,336 @@ | ||||||||||
# One-letter codes accepted as protein residues (20 letters).
alphabet_protein = set("ACDEFGHIKLMNPQRSTVWY")
|
||||||||||
# Letters accepted in an RNA sequence.
alphabet_rna = set("AUGC")
|
||||||||||
# Mass contributed by each residue, keyed by its one-letter code.
# NOTE(review): the values look like monoisotopic residue masses
# (no terminal water included) - confirm against the source table.
amino_acid_masses = dict(
    A=71.03711, R=156.10111, N=114.04293, D=115.02694,
    C=103.00919, Q=128.05858, E=129.04259, G=57.02146,
    H=137.05891, I=113.08406, L=113.08406, K=128.09496,
    M=131.04049, F=147.06841, P=97.05276, S=87.03203,
    T=101.04768, W=186.07931, Y=163.06333, V=99.06841,
)
|
||||||||||
# Residues counted as hydrophobic. The identifier keeps its original
# spelling ("gydrophobic") because other code refers to it by this name.
gydrophobic_aminoacids = set("AVLIPFWM")
|
||||||||||
# DNA codons for every one-letter residue code; "*" marks the stop codons.
# All codons use the DNA alphabet (A, C, G, T), including the stop codons.
dna_codons = {
    "A": ["GCT", "GCC", "GCA", "GCG"],
    "C": ["TGT", "TGC"],
    "D": ["GAT", "GAC"],
    "E": ["GAA", "GAG"],
    "F": ["TTT", "TTC"],
    "G": ["GGT", "GGC", "GGA", "GGG"],
    "H": ["CAT", "CAC"],
    "I": ["ATT", "ATC", "ATA"],
    "K": ["AAA", "AAG"],
    "L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
    "M": ["ATG"],
    "N": ["AAT", "AAC"],
    "P": ["CCT", "CCC", "CCA", "CCG"],
    "Q": ["CAA", "CAG"],
    "R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
    "T": ["ACT", "ACC", "ACA", "ACG"],
    "V": ["GTT", "GTC", "GTA", "GTG"],
    "W": ["TGG"],
    "Y": ["TAT", "TAC"],
    "*": ["TAA", "TAG", "TGA"],
}
|
||||||||||
# RNA codons for every one-letter residue code; "*" marks the stop codons.
# Every key appears exactly once: a repeated key in a dict literal silently
# replaces the earlier value, so all six serine codons live in one entry.
rna_codons = {
    "F": ["UUC", "UUU"],
    "L": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"],
    "I": ["AUU", "AUC", "AUA"],
    "M": ["AUG"],
    "V": ["GUU", "GUC", "GUA", "GUG"],
    "S": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],
    "P": ["CCU", "CCC", "CCA", "CCG"],
    "T": ["ACU", "ACC", "ACA", "ACG"],
    "A": ["GCU", "GCC", "GCA", "GCG"],
    "Y": ["UAC", "UAU"],
    "*": ["UAA", "UAG", "UGA"],
    "H": ["CAU", "CAC"],
    "Q": ["CAA", "CAG"],
    "N": ["AAU", "AAC"],
    "K": ["AAA", "AAG"],
    "D": ["GAU", "GAC"],
    "E": ["GAA", "GAG"],
    "C": ["UGU", "UGC"],
    "W": ["UGG"],
    "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "G": ["GGU", "GGC", "GGA", "GGG"],
}
|
||||||||||
|
||||||||||
def is_protein(seq: str) -> bool:
    """
    Tell whether a sequence is written only with protein alphabet letters.

    Argument:
    - seq (str): sequence to check. Letter case is ignored.

    Return:
    - bool, True if every character of the sequence belongs to
    `alphabet_protein` (an empty sequence also gives True).
    """
    return alphabet_protein.issuperset(seq.upper())
|
||||||||||
|
||||||||||
def is_rna(seq: str) -> bool:
    """
    Tell whether a sequence is written only with RNA alphabet letters.

    Argument:
    - seq (str): sequence to check. Letter case is ignored.

    Return:
    - bool, True if every character of the sequence belongs to
    `alphabet_rna` (an empty sequence also gives True).
    """
    return alphabet_rna.issuperset(seq.upper())
|
||||||||||
|
||||||||||
def compute_molecular_weight(protein: str) -> tuple:
    """
    Sum the per-residue masses of a protein sequence (g/mol).

    The result is the plain sum of the `amino_acid_masses` values for the
    residues of the sequence; nothing else is added to it.

    Argument:
    - protein (str): protein sequence. Letter case is ignored.

    Return:
    - tuple of the input sequence (unchanged) and the summed mass
    rounded to 3 decimal places.
    """
    residue_masses = [amino_acid_masses[residue] for residue in protein.upper()]
    total_mass = 0
    # Plain left-to-right accumulation keeps the float result reproducible.
    for mass in residue_masses:
        total_mass = total_mass + mass
    return protein, round(total_mass, 3)
Comment on lines
+117
to
+131
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Хорошая докстринга, здорово что упомянули размерность. |
||||||||||
|
||||||||||
|
||||||||||
def compute_length(protein: str) -> tuple:
    """
    Report how many characters the protein sequence contains.

    Argument:
    - protein (str): protein sequence.

    Return:
    - tuple of the input sequence and its length.
    """
    residue_count = len(protein)
    return (protein, residue_count)
Comment on lines
+134
to
+144
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Та же история про возвращение кортежа. |
||||||||||
|
||||||||||
|
||||||||||
def protein_to_dna(protein: str) -> str:
    """
    Returns possible variants of DNAs for a given protein sequence.

    Argument:
    - protein (str): protein sequence.

    Return:
    - string, variants of nucleic acids. Codon groups of consecutive
    residues are separated by a space. If several codons correspond to
    a given amino acid they are displayed with a '/'.
    An empty sequence gives an empty string.

    Raise:
    - ValueError: if the sequence contains a symbol that has no entry
    in `dna_codons`.

    Does not distinguish between lowercase and uppercase letters.

    Examples:

    -'MACDRS' -> 'ATG GCT/GCC/GCA/GCG TGT/TGC GAT/GAC CGT/CGC/CGA/CGG/AGA/AGG TCT/TCC/TCA/TCG/AGT/AGC'
    -'MaCdrS' -> 'ATG GCT/GCC/GCA/GCG TGT/TGC GAT/GAC CGT/CGC/CGA/CGG/AGA/AGG TCT/TCC/TCA/TCG/AGT/AGC'
    """
    codon_groups = []
    for aa in protein.upper():
        try:
            codons = dna_codons[aa]
        except KeyError:
            # Fail with a clear message instead of an opaque join() error.
            raise ValueError("Invalid protein sequence") from None
        codon_groups.append("/".join(codons))

    # Joining once avoids repeated string concatenation and leaves no
    # trailing separator to trim.
    return " ".join(codon_groups)
Comment on lines
+169
to
+171
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Я так понимаю, [:-1] в конце нужно, чтобы убрать последний пробел, который добавляется в цикле через + " "? В целом ок, но, возможно, чуть питонестее было бы использовать тут 2 join. Первый как у вас и есть, по "/", а второй соединяет в себе каждый набор кодонов, и он уже будет с " ". Возможно, вычислительно это даже будет чуть-чуть затратнее, но зато прям очень чисто с точки зрения логики.
|
||||||||||
|
||||||||||
|
||||||||||
def count_amino_acids(protein: str) -> dict:
    """
    Tally how many times each amino acid letter occurs in a protein sequence.

    Argument:
    - protein (str): protein sequence.

    Return:
    - dictionary mapping each uppercase letter found in the sequence to its
    number of occurrences, in order of first appearance.

    Does not distinguish between lowercase and uppercase letters.

    Examples:

    -'MACDRS' -> {'M': 1, 'A': 1, 'C': 1, 'D': 1, 'R': 1, 'S': 1}
    -'MaCdrS' -> {'M': 1, 'A': 1, 'C': 1, 'D': 1, 'R': 1, 'S': 1}
    """
    tally = {}
    # Single pass over the sequence; unseen letters start from zero.
    for residue in protein.upper():
        tally[residue] = tally.get(residue, 0) + 1
    return tally
Comment on lines
+192
to
+197
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Йоу, здорово, что сделали не через count! Так у вас получается всего один прогон по белку, красота:) |
||||||||||
|
||||||||||
|
||||||||||
def compute_hydrophobicity(protein: str) -> tuple:
    """
    Compute the percentage of hydrophobic aminoacids in protein sequence.

    A residue is counted as hydrophobic when its uppercase letter is in
    `gydrophobic_aminoacids`.

    Argument:
    - protein (str): protein sequence. Includes hydrophobic
    and hydrophilic aminoacids.

    Return:
    - tuple with protein sequence and computed percentage
    of hydrophobic aminoacids (rounded to 3 decimal places).
    An empty sequence gives a percentage of 0.0.

    Does not distinguish between lowercase and uppercase letters.
    """
    if not protein:
        # Nothing to count; avoids dividing by a zero length.
        return protein, 0.0

    # Booleans add up as 0/1, so the sum is the number of hydrophobic residues.
    hydrophobic_count = sum(
        aa in gydrophobic_aminoacids for aa in protein.upper()
    )
    percentage = round(hydrophobic_count / len(protein) * 100, 3)

    return protein, percentage
|
||||||||||
|
||||||||||
def translate_rna(rna: str) -> str:
    """
    Perform the translation of mRNA sequence into protein sequence.

    The sequence is read from its first character in consecutive triplets.

    Argument:
    - rna (str): mRNA sequence. Must begin with the start-codon and its
    last codon must be one of the stop-codons. Letter case is ignored.

    Return:
    - str, protein sequence after translation, cut right after the first
    stop-codon. Always starts with "M" and ends with "*".

    Raise:
    - ValueError: if a triplet (including an incomplete trailing one) has
    no entry in `rna_codons`, if the last codon is not a stop-codon
    (also for an empty sequence), or if the first codon is not
    the start-codon.
    """
    # Reverse lookup (codon -> residue) so each triplet is resolved directly.
    codon_to_aminoacid = {
        codon: aminoacid
        for aminoacid, codons in rna_codons.items()
        for codon in codons
    }

    residues = []
    for i in range(0, len(rna), 3):
        triplet = rna[i : i + 3].upper()
        if triplet not in codon_to_aminoacid:
            # Skipping the triplet would shift every later residue position.
            raise ValueError(f"Unknown codon '{triplet}' in mRNA")
        residues.append(codon_to_aminoacid[triplet])

    if not residues or residues[-1] != "*":
        raise ValueError("Stop-codon (*) is absent in mRNA")
    if residues[0] != "M":
        raise ValueError("Start-codon (M) is absent in mRNA")

    # Translation ends at the first stop-codon, even if more codons follow.
    stop = residues.index("*")
    return "".join(residues[: stop + 1])
|
||||||||||
|
||||||||||
def check_mutations(rna: str, protein: str) -> str:
    """
    Check missense mutations in the protein sequence after translation.

    Uses additional function "translate_rna(seq)". Both inputs are
    validated before the mRNA is translated.

    Arguments:
    - rna (str): sequence of mRNA with/without mutations.
    Must contain start-codon and one of the stop-codons.
    - protein (str): protein sequence translated from mRNA.
    Must start with "M" and ends with "*" (stop-codon).

    Note: is_protein(seq) doesn't see "*", so the last character of the
    protein is left out of the alphabet check and tested separately.

    Return:
    - str, if mRNA without mutations return "Protein without mutations."
    If there are mutations in protein, returns aminoacid(s) of the given
    protein that differ from the translation and their position(s),
    counted from 1.

    Raise:
    - ValueError: if the protein or mRNA contains invalid symbols, the
    protein does not start with "M" or end with "*", or the protein
    length does not equal the number of mRNA triplets.

    Examples:
    - "AUGGUAGGGAAAUUUUGA", "MVGKF*" -> "Protein without mutations."
    - "AUGGUAGGGAAAUUUUGA", "MGGVF*" -> "Mutations: G2, V4."
    - "AUGGUAGGGAAAUUUUGA", "MGGKF" -> "ValueError: Stop (*) is absent"
    - "AUGGUAGGGAAAUUUUGA", "GGKF*" -> "ValueError: Start (M) is absent"
    - "AUGAAAAAAUGA", "MK*" -> "ValueError: Different length of translated protein and protein"
    """
    if not is_protein(protein[:-1]):
        raise ValueError("Invalid protein sequence")
    if not is_rna(rna):
        raise ValueError("Invalid RNA sequence")
    if not protein or protein[-1] != "*":
        raise ValueError("Stop (*) is absent")
    if protein[0] != "M":
        raise ValueError("Start (M) is absent")
    # Integer comparison: one residue (or stop) per complete mRNA triplet.
    if len(rna) != 3 * len(protein):
        raise ValueError("Different length of translated protein and protein")

    correct_protein = translate_rna(rna)

    # zip() stops at the end of the translated protein, which can be shorter
    # than `protein` when the mRNA holds an early stop-codon.
    bank_of_mutations = [
        f"{observed}{position}"
        for position, (expected, observed) in enumerate(
            zip(correct_protein, protein), start=1
        )
        if expected != observed
    ]

    if not bank_of_mutations:
        return "Protein without mutations."
    return "Mutations: " + ", ".join(bank_of_mutations) + "."
|
||||||||||
|
||||||||||
def run_protein_tools(*args: str):
    """
    Function containing methods for protein analysis.

    Takes arbitrary number of arguments with protein sequences
    and the name of the procedure to be performed (always the last
    argument). Returns the result of the procedure as string, tuple
    or dictionary if one sequence is submitted or list if several
    (an empty list if no sequence is submitted).

    Note: if procedure 'check_mutations' is used then input must
    contain only three arguments: RNA sequence, protein sequence
    and the name of procedure itself.

    Raise:
    - ValueError: if the procedure name is unknown, if a sequence is not
    a valid protein, or if 'check_mutations' is not given exactly
    two sequences.
    """
    *seqs, procedure = args

    if procedure == "check_mutations":
        if len(seqs) != 2:
            raise ValueError(
                "check_mutations requires exactly two sequences: RNA and protein"
            )
        rna, protein = seqs
        return check_mutations(rna, protein)

    procedures = {
        "compute_molecular_weight": compute_molecular_weight,
        "compute_length": compute_length,
        "compute_hydrophobicity": compute_hydrophobicity,
        "count_amino_acids": count_amino_acids,
        "protein_to_dna": protein_to_dna,
    }
    # Checked before the loop so a wrong name is reported even when no
    # sequences are passed.
    if procedure not in procedures:
        raise ValueError("Wrong procedure name")
    selected_procedure = procedures[procedure]

    results = []
    for seq in seqs:
        if not is_protein(seq):
            raise ValueError("Invalid protein sequence")
        results.append(selected_procedure(seq))

    # A single sequence gives a bare result rather than a one-item list.
    if len(results) == 1:
        return results[0]
    return results
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Круто что вы вынесли эти переменные отдельно вначале вне функций. Несколько моментов:
Тут название понятное, но, кажется, можно придумать что-то получше. Что-то типа `proteinogenic_aminoacids`.
Не надо было тут выносить каждую букву на отдельную строку... Понятно, что не все в одну строку, но по несколько АК на строке.
Аналогично про все константы ниже