From ce4da7d336cc2c084da70f5278537b4386c4b754 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Mon, 25 Sep 2023 13:53:29 +0300 Subject: [PATCH 01/88] Create protein_analysis_tool.py Added new empty file protein_analysis_tool.py --- protein_analysis_tool.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 protein_analysis_tool.py diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/protein_analysis_tool.py @@ -0,0 +1 @@ + From db55ca6434c818f42fe2ad350b5e40113e69fe94 Mon Sep 17 00:00:00 2001 From: Dasha Date: Thu, 28 Sep 2023 22:03:33 +0300 Subject: [PATCH 02/88] Add function get_amino_acid_sum --- protein_analysis_tool.py | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 8b13789..798a5b4 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -1 +1,42 @@ +def get_amino_acid_sum(protein_sequences): + for protein_sequence in range(len(protein_sequences)): + dictionary = {'A': 0, 'C': 0, 'D': 0, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, + 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, + 'T': 0, 'V': 0, 'W': 0, 'Y': 0} + for amino_acid in protein_sequences[protein_sequence]: + dictionary[amino_acid] += 1 + clone = { + 'Аланин': dictionary['A'], + 'Цистеин': dictionary['C'], + 'Аспарг кислота': dictionary['D'], + 'Глутаминовая кислота': dictionary['E'], + 'Фенилаланин': dictionary['F'], + 'Глицин': dictionary['G'], + 'Гистидин': dictionary['H'], + 'Изолейцин': dictionary['I'], + 'Лизин': dictionary['K'], + 'Лейцин': dictionary['L'], + 'Метионин': dictionary['M'], + 'Аспаргин': dictionary['N'], + 'Пролин': dictionary['P'], + 'Глутамин': dictionary['Q'], + 'Аргинин': dictionary['R'], + 'Серин': dictionary['S'], + 'Трианин': dictionary['T'], + 'Валин': dictionary['V'], + 'Триптофан': dictionary['W'], + 'Тирозин': 
dictionary['Y'] + } + print('количество аминокислот в последовательности ', protein_sequence + 1, ':') + for key, value in clone.items(): + print(key,value) + + + + + + + + +get_amino_acid_sum(['MSRQEADLKVSIKKACSTEEAAPKRKHVRACIVFTWDHRSSKAFYNGLRLLPIQNDEIPLFKSLITIHKVLQEGHPSAIKEGIKNRDWIQSLGHVFPGDGMKRYGRLIREYDRYLIRKIDFHNSHKGFNGTFEYEEYVSLKTVSDPNEGYEAIMDLMVLQDSINDLQRLLFASIDSSSHSELKISALVPLIAESYGIFKF']) From b6ae4de2836ee409b428a89d4990101e9a2d46cc Mon Sep 17 00:00:00 2001 From: Dasha Date: Fri, 29 Sep 2023 20:29:38 +0300 Subject: [PATCH 03/88] Add function codon_optimization --- protein_analysis_tool.py | 42 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 798a5b4..78c7756 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -34,9 +34,49 @@ def get_amino_acid_sum(protein_sequences): +def beautiful_print(codon_optimization_list): + for nucleotide_sequence in range(len(codon_optimization_list)): + print('sequence ', nucleotide_sequence + 1) + print(codon_optimization_list[nucleotide_sequence]) +def codon_optimization(protein_sequences_of_cell_type): + cell_type = protein_sequences_of_cell_type[-1] + protein_sequences = protein_sequences_of_cell_type[0:len(protein_sequences_of_cell_type)-1] + if len(protein_sequences_of_cell_type) < 2: + print('Ошибка вы не ввели вид клеток для оптимизации кодонов') + exit() + if cell_type == 'Esherichia coli' or 'E.coli': + codon_optimization_Ecoli = [] + Ecoli_triplets = {'A': 'GCG', 'C': 'TGC', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGC', + 'H': 'CAT', 'I': 'ATT', 'K': 'AAA', 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', + 'P': 'CCG', 'Q': 'CAG', 'R': 'CGT', 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', + 'W': 'TGG', 'Y': 'TAT'} + replacer_Ecoli = Ecoli_triplets.get + for amino_acid in range(len(protein_sequences)): + codon_optimization_Ecoli += [''.join([replacer_Ecoli(n, n) for n in protein_sequences[amino_acid]])] + 
beautiful_print(codon_optimization_Ecoli) + exit() + if cell_type == 'Pichia pastoris' or 'P.pastoris': + codon_optimization_Ppastoris = [] + Ppastoris_triplets = {'A': 'GCT', 'C': 'TGT', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGT', + 'H': 'CAT', 'I': 'ATT', 'K': 'AAG', 'L': 'TTG', 'M': 'ATG', 'N': 'AAC', + 'P': 'CCA', 'Q': 'CAA', 'R': 'AGA', 'S': 'TCT', 'T': 'ACT', 'V': 'GTT', + 'W': 'TGG', 'Y': 'TAC'} + replacer_Ppastoris = Ppastoris_triplets.get + for amino_acid in range(len(protein_sequences)): + codon_optimization_Ppastoris += [''.join([replacer_Ppastoris(n, n) for n in protein_sequences[amino_acid]])] + beautiful_print(codon_optimization_Ppastoris) + exit() -get_amino_acid_sum(['MSRQEADLKVSIKKACSTEEAAPKRKHVRACIVFTWDHRSSKAFYNGLRLLPIQNDEIPLFKSLITIHKVLQEGHPSAIKEGIKNRDWIQSLGHVFPGDGMKRYGRLIREYDRYLIRKIDFHNSHKGFNGTFEYEEYVSLKTVSDPNEGYEAIMDLMVLQDSINDLQRLLFASIDSSSHSELKISALVPLIAESYGIFKF']) + + + +b = codon_optimization(['MSRQEADLKVSIKKACSTEEAAPK','RKHVRACIVFTWDHRSSKAFYNGLRLL', 'P.pastoris']) +#print(b) + +#for i in range(len(b)): + # print('sequence ', i+1) + # print(b[i]) From 9f5363653404d543b52394ed94ee7aa5ad19f142 Mon Sep 17 00:00:00 2001 From: Yulia Volkova Date: Fri, 29 Sep 2023 19:47:02 +0200 Subject: [PATCH 04/88] Add main protein function, validate, molecular weight and one letter to three code functions --- protein_analysis_tool.py | 112 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 8b13789..3f274b0 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -1 +1,113 @@ +def protein(*args: Tuple[str]) -> List: + """ + Function protein does: + -calculate predicted molecular weight of amino acid (aa) sequences in kDa (procedure name: molecular_weight) + -translate aa sequences from one-letter to three-letter code + - + - + - + - + + Arguments: + - + - + + Return: + - list, the result of the operation + """ + aa_seqs = [] + procedure = args[-1] + 
procedures = ('molecular_weight', 'one_letter_to_three') + + for index in range(len(args)-1): + aa_seqs.append(args[index]) + + for aa_seq in aa_seqs: + validate(aa_seq) + + if procedure not in procedures: + raise ValueError('Requested procedure is not defined') + + if procedure == 'molecular_weight': + return molecular_weight(aa_seqs) + + if procedure == 'one_letter_to_three': + return one_letter_to_three(aa_seqs) + + +def validate(aa_seq: str) -> None: + """Validates if aa sequence consists of only amino acid characters""" + aa_seq_set = set(aa_seq.upper()) + all_aa = {'A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'} + difference = aa_seq_set.difference(all_aa) + if len(difference) > 0: + raise ValueError('Invalid alphabet, please use only single letter amino acid code') + + +def molecular_weight(aa_seqs: List[str]) -> List[float]: + """Calculates predicated molecular weight of aa sequences. Returns list of floats""" + aa_weights = { + 'A': 89.09, + 'R': 174.20, + 'N': 132.12, + 'D': 133.10, + 'C': 121.16, + 'E': 147.13, + 'Q': 146.15, + 'G': 75.07, + 'H': 155.16, + 'I': 131.18, + 'L': 131.18, + 'K': 146.19, + 'M': 149.21, + 'F': 165.19, + 'P': 115.13, + 'S': 105.09, + 'T': 119.12, + 'W': 204.23, + 'Y': 181.19, + 'V': 117.15 + } + molecular_weights = [] + for seq in aa_seqs: + total_weight = 0 + for aa in seq: + aa = aa.upper() + total_weight += aa_weights[aa] + molecular_weights.append(round(total_weight/1000, 2)) + return molecular_weights + + +def one_letter_to_three(aa_seqs: List[str]) -> List[str]: + """Translates one letter coded aa sequences to three letter coded""" + three_letter_codes = { + 'A': 'Ala', + 'R': 'Arg', + 'N': 'Asn', + 'D': 'Asp', + 'C': 'Cys', + 'E': 'Glu', + 'Q': 'Gln', + 'G': 'Gly', + 'H': 'His', + 'I': 'Ile', + 'L': 'Leu', + 'K': 'Lys', + 'M': 'Met', + 'F': 'Phe', + 'P': 'Pro', + 'S': 'Ser', + 'T': 'Thr', + 'W': 'Trp', + 'Y': 'Tyr', + 'V': 'Val' + } + three_letters_seqs = [] + for seq in 
aa_seqs: + three_letters_seq = [] + for aa in seq: + aa = aa.upper() + three_letters_seq.append(three_letter_codes[aa]) + three_letters_seqs.append(''.join(three_letters_seq)) + return three_letters_seqs From 5888227009d49b7d712ed7efef85b026b7b45f7f Mon Sep 17 00:00:00 2001 From: Yulia Volkova Date: Fri, 29 Sep 2023 20:22:11 +0200 Subject: [PATCH 05/88] Add readme draft --- README.md | 72 +++++++++++++++---------------------------------------- 1 file changed, 19 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index f918170..dd13064 100644 --- a/README.md +++ b/README.md @@ -1,65 +1,31 @@ -# HW 4. Functions 2 -> *This is the repo for the fourth homework of the BI Python 2023 course* +# Protein Info -### Homework description +This project consists of one function "protein" that helps user to predict molecular weight of amino acid (aa) sequences, translate aa sequences from one-letter to three-letter code etc. Sequences are accepted as single-letter code: 20 aa without stop codon (A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V). -На прошлой неделе вы делали утилиту для работы с последовательностями нуклеиновых кислот (с весьма строгим ТЗ). Пришло время для чего-то более самостоятельного. +## Technology: -#### Основное задание +python +## How to use: -Напишите утилиту для работы с последовательностями белков. Там должно быть минимум 5 различных операций, должна быть какая-то точка входа через которую пользователь будет всё это дело использовать. На этом, по сути, всё. Всё целиком зависит от вашей фантазии и креативности. Можете опираться на ДЗ №2 и №3. +This function accepts arguments as a list of strings. Last argument in the list should be a procedure that should be applied to the sequences. -Самая главная часть задания - это файл `README.md`. Сделайте краткое введение, напишите описание тула, приведите документацию по использованию со списком аргументов. Добавьте примеры использования. Возможно, вы захотите сделать секцию Troubleshooting. 
***Почему это нужно?*** В этот раз проверяющий не будет знать того, как должен работать ваш тул. Это ваш авторский код. Даже самая прекрасная функциональность, не будучи отраженной в README, скорее всего останется незамеченной. README - это ваш способ познакомить пользователя с тулом, показать всё лучше и обосновать, почему именно ваша команда должна получить наивысший балл. +## List of procedures: -Есть люди которые, любят писать документации, а есть те - кто не любит. Найдите в вашей команде того, кто любит. И в будущем в своих рабочих проектах всегда держите рядом такого человек (или будьте им). +- `molecular_weight` — returns list of float values, that indicate predicted molecular weights of given aa sequences (in kDa) +- `one_letter_to_three` — will return list of strings, containing the same sequences written in three-letter code +- +- +- +- -Примеры некоторых README, которыми можно вдохновляться: +## Example of use: -- [MetaFX](https://github.com/ctlab/metafx), тул Артёма Иванова. Там еще и [wiki](https://github.com/ctlab/metafx/wiki) крутое. -- [samovar](https://github.com/nvaulin/samovar) -- [MetaGEM](https://github.com/franciscozorrilla/metaGEM) -- [Pharokka](https://github.com/gbouras13/pharokka) +> protein("ACD", "AD", "one_letter_to_three") # ['AlaCysAsp', 'AlaAsp'] +> protein("ACD", "AD", "molecular_weight") # [0.34, 0.22] -Типовые секции, на которые стоит обратить внимание: Title, Overview, Usage, Options, Examples, Troubleshooting, Contacts. +## Possible erros: +> `ValueError`('Invalid alphabet, please use only single letter amino acid code') # Will occure if character other than A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V are used. +> `ValueError`('Requested procedure is not defined') # Will occure if last argument does not correspond to any listed procedure (see List of procedures). -**Tехническое требование к заданию.** -Это задание будет выполняться в командах по 3 человека. Каждый из членов команды должен внести ***как минимум*** 2 функции. 
Каждое внесение функции должно сопровождаться коммитом с осмысленным описанием коммита. Ниже приведена последовательность действий для успешного выполнения задания (аналогично ДЗ №2): - -1. Посмотрите состав своей команды здесь ([**ССЫЛКА**](https://docs.google.com/spreadsheets/d/1KMBBBu8LqauRpDJb0v1ldPwpvzNn8-KakcHexAcqLsE/edit?usp=sharing)). -2. Тимлид делает форк данного репозитория. **В форке создает ветку `HW4_`, в ветке создает папку `HW4_`, в этой папке вы всё делаете.** -3. Члены команды могут либо делать свои форки, либо работать в репозитории тимлида в качестве колабораторов ("contributors"). В любом случае делаете клоны => пишите код локально => пушите. -4. В конце тимлид делайет pull-request из `HW4_` своего репозитория в `main` этого. - - -А также: -- Сопроводите программу лучшим `README.md` файлом в вашей жизни (на английском языке). -- В этом ДЗ проблемы с качеством кода (нейминги, пустые строки, анноатции типов, док.стринги, пробелы) могут привести к снижению балла. Воспользуйтесь линтерами чтобы себя обезопасить. IDE по типу PyCharm или VSCode имеют фунцонал по авто-исправлению многих проблем такого рода. - -Автотестов на GitHub в этом ДЗ нет, но вы можете прогнать линтеры на качество кода локально (как в ДЗ №3, подробнее читайте [тут](https://plausible-cannon-091.notion.site/Code-auto-checks-02b2ea69c1d545fca07b50ce5933ed5f?pvs=4)). - -- Программа должна сохранять регистр символов. -- Программа должна работать только с последовательностями белков. -- Запрещается использование сторонних модулей. - - -### Форма сдачи - -Прикрепите ссылку на pull-request тимлида в Google Class (можете сделать от лица каждого члена команды, но это не обязательно). - - -### Pазбалловка - -- За каждую из 5 операций - максимум **1.5 балла** -- За README - максимум **2.5 балла** -- Если вы не внесли как минимум 2 функции от себя, вы получаете 0 баллов (на баллы остальных членов команды это не влияет). -- За фото созвона в README можно получить 0.2 доп. 
балла (но не более 10 баллов суммарно) - - - -### **Предполагаемый учебный результат** - -Это задание позволит вам проявить креативность и учиться быть не только кодером, но и автором. Также это задание поможет окончательно закрепить материал по функциям который мы прошли. - -Удачи! ✨✨ From 75e9f4f1c5071fbf85d5190ba5a39bc4699cc935 Mon Sep 17 00:00:00 2001 From: Yulia Volkova Date: Fri, 29 Sep 2023 20:31:30 +0200 Subject: [PATCH 06/88] Correct input and output data types of functions --- protein_analysis_tool.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 3f274b0..99649e1 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -1,4 +1,4 @@ -def protein(*args: Tuple[str]) -> List: +def protein(*args: list) -> list: """ Function protein does: -calculate predicted molecular weight of amino acid (aa) sequences in kDa (procedure name: molecular_weight) @@ -44,7 +44,7 @@ def validate(aa_seq: str) -> None: raise ValueError('Invalid alphabet, please use only single letter amino acid code') -def molecular_weight(aa_seqs: List[str]) -> List[float]: +def molecular_weight(aa_seqs: list) -> list: """Calculates predicated molecular weight of aa sequences. 
Returns list of floats""" aa_weights = { 'A': 89.09, @@ -78,7 +78,7 @@ def molecular_weight(aa_seqs: List[str]) -> List[float]: return molecular_weights -def one_letter_to_three(aa_seqs: List[str]) -> List[str]: +def one_letter_to_three(aa_seqs: list) -> list: """Translates one letter coded aa sequences to three letter coded""" three_letter_codes = { 'A': 'Ala', @@ -111,3 +111,4 @@ def one_letter_to_three(aa_seqs: List[str]) -> List[str]: three_letters_seqs.append(''.join(three_letters_seq)) return three_letters_seqs + From 56717ee1a5ebaf2e7768ab7d37b2b67784bc8aff Mon Sep 17 00:00:00 2001 From: Yulia Volkova Date: Fri, 29 Sep 2023 20:37:04 +0200 Subject: [PATCH 07/88] Fixed new line issue --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index dd13064..395812d 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,13 @@ This function accepts arguments as a list of strings. Last argument in the list ## Example of use: -> protein("ACD", "AD", "one_letter_to_three") # ['AlaCysAsp', 'AlaAsp'] -> protein("ACD", "AD", "molecular_weight") # [0.34, 0.22] +> protein("ACD", "AD", "one_letter_to_three") # ['AlaCysAsp', 'AlaAsp'] \ +> protein("ACD", "AD", "molecular_weight") # [0.34, 0.22] \ ## Possible erros: -> `ValueError`('Invalid alphabet, please use only single letter amino acid code') # Will occure if character other than A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V are used. -> `ValueError`('Requested procedure is not defined') # Will occure if last argument does not correspond to any listed procedure (see List of procedures). +> `ValueError`('Invalid alphabet, please use only single letter amino acid code') # Will occure if character other than A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V are used. \ +> `ValueError`('Requested procedure is not defined') # Will occure if last argument does not correspond to any listed procedure (see List of procedures). 
\ + + From 0bd7c37dc12290a03e855a07b869f42b1297365e Mon Sep 17 00:00:00 2001 From: Dasha Date: Fri, 29 Sep 2023 22:26:41 +0300 Subject: [PATCH 08/88] Add input of cell type for user --- protein_analysis_tool.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 78c7756..86da395 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -41,14 +41,10 @@ def beautiful_print(codon_optimization_list): -def codon_optimization(protein_sequences_of_cell_type): - cell_type = protein_sequences_of_cell_type[-1] - protein_sequences = protein_sequences_of_cell_type[0:len(protein_sequences_of_cell_type)-1] - if len(protein_sequences_of_cell_type) < 2: - print('Ошибка вы не ввели вид клеток для оптимизации кодонов') - exit() +def codon_optimization(protein_sequences): + cell_type = input('Введите вид организма для оптимизации кодонов: ') - if cell_type == 'Esherichia coli' or 'E.coli': + if cell_type == 'Esherichia coli' or cell_type == 'E.coli': codon_optimization_Ecoli = [] Ecoli_triplets = {'A': 'GCG', 'C': 'TGC', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGC', 'H': 'CAT', 'I': 'ATT', 'K': 'AAA', 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', @@ -58,8 +54,8 @@ def codon_optimization(protein_sequences_of_cell_type): for amino_acid in range(len(protein_sequences)): codon_optimization_Ecoli += [''.join([replacer_Ecoli(n, n) for n in protein_sequences[amino_acid]])] beautiful_print(codon_optimization_Ecoli) - exit() - if cell_type == 'Pichia pastoris' or 'P.pastoris': + + if cell_type == 'Pichia pastoris' or cell_type == 'P.pastoris': codon_optimization_Ppastoris = [] Ppastoris_triplets = {'A': 'GCT', 'C': 'TGT', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGT', 'H': 'CAT', 'I': 'ATT', 'K': 'AAG', 'L': 'TTG', 'M': 'ATG', 'N': 'AAC', @@ -69,12 +65,13 @@ def codon_optimization(protein_sequences_of_cell_type): for amino_acid in range(len(protein_sequences)): codon_optimization_Ppastoris += 
[''.join([replacer_Ppastoris(n, n) for n in protein_sequences[amino_acid]])] beautiful_print(codon_optimization_Ppastoris) - exit() + else: + print('Для оптимизации кодонов доступны следующие виды организмов:') -b = codon_optimization(['MSRQEADLKVSIKKACSTEEAAPK','RKHVRACIVFTWDHRSSKAFYNGLRLL', 'P.pastoris']) +b = codon_optimization(['MSRQEADLKVSIKKACSTEEAAPK','RKHVRACIVFTWDHRSSKAFYNGLRLL']) #print(b) #for i in range(len(b)): From 90d9af76f46a5822dffc050eb1f13c1dfade789a Mon Sep 17 00:00:00 2001 From: Dasha Date: Sat, 30 Sep 2023 08:26:16 +0300 Subject: [PATCH 09/88] Fixed a bug in the function codon_optimization and add a mouse for optimization --- protein_analysis_tool.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 86da395..30f6885 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -65,13 +65,24 @@ def codon_optimization(protein_sequences): for amino_acid in range(len(protein_sequences)): codon_optimization_Ppastoris += [''.join([replacer_Ppastoris(n, n) for n in protein_sequences[amino_acid]])] beautiful_print(codon_optimization_Ppastoris) + + if cell_type == 'Mouse' or cell_type == 'mouse': + codon_optimization_Mouse = [] + Mouse_triplets = {'A': 'GCC', 'C': 'TGC', 'D': 'GAC', 'E': 'GAG', 'F': 'TTC', 'G': 'GGC', + 'H': 'CAC', 'I': 'ATC', 'K': 'AAG', 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', + 'P': 'CCC', 'Q': 'CAG', 'R': 'CGG', 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', + 'W': 'TGG', 'Y': 'TAC'} + replacer_Mouse = Mouse_triplets.get + for amino_acid in range(len(protein_sequences)): + codon_optimization_Mouse += [''.join([replacer_Mouse(n, n) for n in protein_sequences[amino_acid]])] + beautiful_print(codon_optimization_Mouse) else: - print('Для оптимизации кодонов доступны следующие виды организмов:') + print('Для оптимизации кодонов доступны следующие виды организмов: Esherichia coli, Pichia pastoris, Mouse') -b = 
codon_optimization(['MSRQEADLKVSIKKACSTEEAAPK','RKHVRACIVFTWDHRSSKAFYNGLRLL']) +b = get_amino_acid_sum(['MSRQEADLKVSIKKACSTEEAAPK','RKHVRACIVFTWDHRSSKAFYNGLRLL']) #print(b) #for i in range(len(b)): From 3d91794a45c3a9978818511461c8d88323e0962c Mon Sep 17 00:00:00 2001 From: Dasha Date: Sat, 30 Sep 2023 17:29:27 +0300 Subject: [PATCH 10/88] Add docstrings and typing to functions --- protein_analysis_tool.py | 78 ++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 30f6885..0c682b1 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -1,5 +1,14 @@ +def get_amino_acid_sum(protein_sequences: list) -> None: + """ + Counts the amount of each amino acid in the injected protein sequences -def get_amino_acid_sum(protein_sequences): + Arguments: + - protein_sequences (list): list of injected protein sequence + + Return: + - None + - Only print the amount of each amino acid in the injected protein sequences + """ for protein_sequence in range(len(protein_sequences)): dictionary = {'A': 0, 'C': 0, 'D': 0, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, @@ -30,61 +39,78 @@ def get_amino_acid_sum(protein_sequences): } print('количество аминокислот в последовательности ', protein_sequence + 1, ':') for key, value in clone.items(): - print(key,value) + print(key, value) + +def beautiful_print(codon_optimization_list: list) -> None: + """ + Makes a user-friendly output of a codon-optimized DNA sequence + Arguments: + - codon_optimization_list (list): list of codon-optimized protein sequence -def beautiful_print(codon_optimization_list): + Return: + - None + - Only print the number of the introduced protein sequence and the codon-optimized DNA sequence + """ for nucleotide_sequence in range(len(codon_optimization_list)): print('sequence ', nucleotide_sequence + 1) 
print(codon_optimization_list[nucleotide_sequence]) +def codon_optimization(protein_sequences, cell_type): + """ + Makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: + Esherichia coli, Pichia pastoris, Mouse + + Arguments: + - protein_sequences (list): list of injected protein sequence + - cell_type (str): user-entered cell type for codon optimization -def codon_optimization(protein_sequences): - cell_type = input('Введите вид организма для оптимизации кодонов: ') + Return: + - None + - Only print the number of the introduced protein sequence and the codon-optimized DNA sequence + """ if cell_type == 'Esherichia coli' or cell_type == 'E.coli': codon_optimization_Ecoli = [] - Ecoli_triplets = {'A': 'GCG', 'C': 'TGC', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGC', + ecoli_triplets = {'A': 'GCG', 'C': 'TGC', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGC', 'H': 'CAT', 'I': 'ATT', 'K': 'AAA', 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', 'P': 'CCG', 'Q': 'CAG', 'R': 'CGT', 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', 'W': 'TGG', 'Y': 'TAT'} - replacer_Ecoli = Ecoli_triplets.get + replacer_ecoli = ecoli_triplets.get for amino_acid in range(len(protein_sequences)): - codon_optimization_Ecoli += [''.join([replacer_Ecoli(n, n) for n in protein_sequences[amino_acid]])] - beautiful_print(codon_optimization_Ecoli) + codon_optimization_Ecoli += [''.join([replacer_ecoli(n, n) for n in protein_sequences[amino_acid]])] + return beautiful_print(codon_optimization_Ecoli) if cell_type == 'Pichia pastoris' or cell_type == 'P.pastoris': - codon_optimization_Ppastoris = [] - Ppastoris_triplets = {'A': 'GCT', 'C': 'TGT', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGT', + codon_optimization_ppastoris = [] + ppastoris_triplets = {'A': 'GCT', 'C': 'TGT', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGT', 'H': 'CAT', 'I': 'ATT', 'K': 'AAG', 'L': 'TTG', 'M': 'ATG', 'N': 'AAC', 'P': 'CCA', 'Q': 'CAA', 'R': 'AGA', 'S': 'TCT', 'T': 'ACT', 'V': 'GTT', 'W': 'TGG', 'Y': 'TAC'} - 
replacer_Ppastoris = Ppastoris_triplets.get + replacer_ppastoris = ppastoris_triplets.get for amino_acid in range(len(protein_sequences)): - codon_optimization_Ppastoris += [''.join([replacer_Ppastoris(n, n) for n in protein_sequences[amino_acid]])] - beautiful_print(codon_optimization_Ppastoris) + codon_optimization_ppastoris += [''.join([replacer_ppastoris(n, n) for n in protein_sequences[amino_acid]])] + return beautiful_print(codon_optimization_ppastoris) if cell_type == 'Mouse' or cell_type == 'mouse': - codon_optimization_Mouse = [] - Mouse_triplets = {'A': 'GCC', 'C': 'TGC', 'D': 'GAC', 'E': 'GAG', 'F': 'TTC', 'G': 'GGC', + codon_optimization_mouse = [] + mouse_triplets = {'A': 'GCC', 'C': 'TGC', 'D': 'GAC', 'E': 'GAG', 'F': 'TTC', 'G': 'GGC', 'H': 'CAC', 'I': 'ATC', 'K': 'AAG', 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', 'P': 'CCC', 'Q': 'CAG', 'R': 'CGG', 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', 'W': 'TGG', 'Y': 'TAC'} - replacer_Mouse = Mouse_triplets.get + replacer_Mouse = mouse_triplets.get for amino_acid in range(len(protein_sequences)): - codon_optimization_Mouse += [''.join([replacer_Mouse(n, n) for n in protein_sequences[amino_acid]])] - beautiful_print(codon_optimization_Mouse) + codon_optimization_mouse += [''.join([replacer_Mouse(n, n) for n in protein_sequences[amino_acid]])] + return beautiful_print(codon_optimization_mouse) else: print('Для оптимизации кодонов доступны следующие виды организмов: Esherichia coli, Pichia pastoris, Mouse') +codon_optimization(['MSRQEADLKVSIKKACSTEEAAPK', 'RKHVRACIVFTWDHRSSKAFYNGLRLL'], 'E.coli') +# print(b) - -b = get_amino_acid_sum(['MSRQEADLKVSIKKACSTEEAAPK','RKHVRACIVFTWDHRSSKAFYNGLRLL']) -#print(b) - -#for i in range(len(b)): - # print('sequence ', i+1) - # print(b[i]) +# for i in range(len(b)): +# print('sequence ', i+1) +# print(b[i]) From a1412f7cccbc28e7dede1dac498a4d0e6b9ad5da Mon Sep 17 00:00:00 2001 From: Dasha Date: Sat, 30 Sep 2023 17:59:49 +0300 Subject: [PATCH 11/88] Added an English translation --- 
protein_analysis_tool.py | 51 ++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 0c682b1..72501a4 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -16,28 +16,28 @@ def get_amino_acid_sum(protein_sequences: list) -> None: for amino_acid in protein_sequences[protein_sequence]: dictionary[amino_acid] += 1 clone = { - 'Аланин': dictionary['A'], - 'Цистеин': dictionary['C'], - 'Аспарг кислота': dictionary['D'], - 'Глутаминовая кислота': dictionary['E'], - 'Фенилаланин': dictionary['F'], - 'Глицин': dictionary['G'], - 'Гистидин': dictionary['H'], - 'Изолейцин': dictionary['I'], - 'Лизин': dictionary['K'], - 'Лейцин': dictionary['L'], - 'Метионин': dictionary['M'], - 'Аспаргин': dictionary['N'], - 'Пролин': dictionary['P'], - 'Глутамин': dictionary['Q'], - 'Аргинин': dictionary['R'], - 'Серин': dictionary['S'], - 'Трианин': dictionary['T'], - 'Валин': dictionary['V'], - 'Триптофан': dictionary['W'], - 'Тирозин': dictionary['Y'] + 'Alanin': dictionary['A'], + 'Cysteine': dictionary['C'], + 'Aspartic acid': dictionary['D'], + 'Glutamic acid': dictionary['E'], + 'Phenylalanine': dictionary['F'], + 'Glycine': dictionary['G'], + 'Histidine': dictionary['H'], + 'Isoleucine': dictionary['I'], + 'Lysine': dictionary['K'], + 'Leucine': dictionary['L'], + 'Methionine': dictionary['M'], + 'Aspargin': dictionary['N'], + 'Proline': dictionary['P'], + 'Glutamine': dictionary['Q'], + 'Arginine': dictionary['R'], + 'Serin': dictionary['S'], + 'Threonine': dictionary['T'], + 'Valin': dictionary['V'], + 'Tryptophan': dictionary['W'], + 'Tyrosine': dictionary['Y'] } - print('количество аминокислот в последовательности ', protein_sequence + 1, ':') + print('The number of amino acids in the sequence ', protein_sequence + 1, ':') for key, value in clone.items(): print(key, value) @@ -54,11 +54,11 @@ def beautiful_print(codon_optimization_list: list) -> None: - 
Only print the number of the introduced protein sequence and the codon-optimized DNA sequence """ for nucleotide_sequence in range(len(codon_optimization_list)): - print('sequence ', nucleotide_sequence + 1) + print('Sequence ', nucleotide_sequence + 1) print(codon_optimization_list[nucleotide_sequence]) -def codon_optimization(protein_sequences, cell_type): +def codon_optimization(protein_sequences, cell_type=None): """ Makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse @@ -105,10 +105,11 @@ def codon_optimization(protein_sequences, cell_type): codon_optimization_mouse += [''.join([replacer_Mouse(n, n) for n in protein_sequences[amino_acid]])] return beautiful_print(codon_optimization_mouse) else: - print('Для оптимизации кодонов доступны следующие виды организмов: Esherichia coli, Pichia pastoris, Mouse') + print('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' + 'Mouse') -codon_optimization(['MSRQEADLKVSIKKACSTEEAAPK', 'RKHVRACIVFTWDHRSSKAFYNGLRLL'], 'E.coli') +codon_optimization(['MSRQEADLKVSIKKACSTEEAAPK', 'RKHVRACIVFTWDHRSSKAFYNGLRLL'], 'jkjh') # print(b) # for i in range(len(b)): From f2c122027911abd8ded25f9abaf6d2f63a59ba95 Mon Sep 17 00:00:00 2001 From: Dasha Date: Sat, 30 Sep 2023 18:07:00 +0300 Subject: [PATCH 12/88] Remove all unnecessary --- protein_analysis_tool.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 72501a4..1372867 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -109,9 +109,4 @@ def codon_optimization(protein_sequences, cell_type=None): 'Mouse') -codon_optimization(['MSRQEADLKVSIKKACSTEEAAPK', 'RKHVRACIVFTWDHRSSKAFYNGLRLL'], 'jkjh') -# print(b) -# for i in range(len(b)): -# print('sequence ', i+1) -# print(b[i]) From a6ca7f11b98e63fbd88fed7361c6b596ee8b69c7 Mon Sep 17 00:00:00 2001 From: Ivan Date: Sat, 30 Sep 
2023 18:18:34 +0300 Subject: [PATCH 13/88] Added functions: lenght, reverse, name_transform, check_amino_acid --- protein_analysis_tool.py | 77 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 8b13789..1dbb633 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -1 +1,78 @@ +amino_short_names_dic = {'A':'Ala', 'R':'Arg', 'N':'Asn', + 'D':'Asp', 'V':'Val', 'H':'His', + 'G':'Gly', 'Q':'Gln', 'E':'Glu', + 'I':'Ile', 'L':'Leu', 'K':'Lys', + 'M':'Met', 'P':'Pro', 'S':'Ser', + 'Y':'Tyr', 'T':'Thr', 'W':'Trp', + 'F':'Phe', 'C':'Cys'} +amino_names_dic = {'ala': 'A', 'arg': 'R', 'asn': 'N', + 'asp': 'D', 'val': 'V', 'his': 'H', + 'gly': 'G', 'gln': 'Q', 'glu': 'E', + 'ile': 'I', 'leu': 'L', 'lys': 'K', + 'met': 'M', 'pro': 'P', 'ser': 'S', + 'tyr': 'Y', 'thr': 'T', 'trp': 'W', + 'phe': 'F', 'cys': 'C'} +amino_names_dic_reverse = {'Ala': 'A', 'Arg': 'R', 'Asn': 'N', + 'Asp': 'D', 'Val': 'V', 'His': 'H', + 'Gly': 'G', 'Gln': 'Q', 'Glu': 'E', + 'Ile': 'I', 'Leu': 'L', 'Lys': 'K', + 'Met': 'M', 'Pro': 'P', 'Ser': 'S', + 'Tyr': 'Y', 'Thr': 'T', 'Trp': 'W', + 'Phe': 'F', 'Cys': 'C'} + +def lenght(seqs): + result = [len(seq) for seq in seqs] + print(result) + return result + + +def reverse(seqs): + result = [seq[::-1] for seq in seqs] + return result +def name_transform(seqs, format): + result = [] + print(seqs) + if format == 1: + for seq in seqs: + seq = seq.upper() + for letter in seq: + if check_amino_acid(letter): + pass + result.append(seq) + return result + elif format == 3: + for seq in seqs: + seq = seq.lower() + seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] + for triplet in seq3: + if check_amino_acid(triplet): + pass + else: return False + seq_transformed = "".join([amino_names_dic.get(seq) for seq in seq3]) + result.append(seq_transformed) + return result + + else: + print('Error unsupported format. 
Only formats 1 and 3 are supported') + return False + + +def check_amino_acid(input): + if len(input) == 1: + letter = input + if letter not in amino_short_names_dic.keys(): + print(f'Error {letter} is not an amino acid. Correct your input') + return False + else: return True + elif len(input) == 3: + triplet = input + if triplet not in amino_names_dic.keys(): + print(f'Error {triplet} is not an amino acid. Correct your input') + return False + else: return True + else: + print(f'Error {input} is incorrect form of amino acid notation. Correct your input') + return False + + \ No newline at end of file From 7d4824713790c727c21f25c38d870740218f3248 Mon Sep 17 00:00:00 2001 From: Yulia Volkova Date: Sat, 30 Sep 2023 17:29:29 +0200 Subject: [PATCH 14/88] Add functions get_amino_acid_sum and codon_optimization to main --- protein_analysis_tool.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 99649e1..9102ba2 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -1,4 +1,4 @@ -def protein(*args: list) -> list: +def protein(*args: list, operator: str, cell_type=None) -> list: """ Function protein does: -calculate predicted molecular weight of amino acid (aa) sequences in kDa (procedure name: molecular_weight) @@ -16,10 +16,10 @@ def protein(*args: list) -> list: - list, the result of the operation """ aa_seqs = [] - procedure = args[-1] - procedures = ('molecular_weight', 'one_letter_to_three') + procedure = operator + procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization') - for index in range(len(args)-1): + for index in range(len(args)): aa_seqs.append(args[index]) for aa_seq in aa_seqs: @@ -34,6 +34,11 @@ def protein(*args: list) -> list: if procedure == 'one_letter_to_three': return one_letter_to_three(aa_seqs) + if procedure == 'get_amino_acid_sum': + return get_amino_acid_sum(aa_seqs) + + if procedure == 
'codon_optimization': + return codon_optimization(aa_seqs) def validate(aa_seq: str) -> None: """Validates if aa sequence consists of only amino acid characters""" @@ -110,5 +115,3 @@ def one_letter_to_three(aa_seqs: list) -> list: three_letters_seq.append(three_letter_codes[aa]) three_letters_seqs.append(''.join(three_letters_seq)) return three_letters_seqs - - From 5d3b9211514c927b958e1a716da9457dd5488a4f Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:05:58 +0300 Subject: [PATCH 15/88] Update protein_analysis_tool.py Debug amino_names_transform. Debug protein_analysis input --- protein_analysis_tool.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 70d2b2e..40125a3 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -21,7 +21,7 @@ 'Tyr': 'Y', 'Thr': 'T', 'Trp': 'W', 'Phe': 'F', 'Cys': 'C'} -def protein(*args: list, operator: str, cell_type=None) -> list: +def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:int) -> list: """ Function protein does: -calculate predicted molecular weight of amino acid (aa) sequences in kDa (procedure name: molecular_weight) @@ -37,12 +37,9 @@ def protein(*args: list, operator: str, cell_type=None) -> list: Return: - list, the result of the operation """ - aa_seqs = [] - procedure = operator + aa_seqs = args procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization') - - for index in range(len(args)): - aa_seqs.append(args[index]) + aa_seqs = name_transform(aa_seqs, format): for aa_seq in aa_seqs: validate(aa_seq) @@ -200,7 +197,7 @@ def beautiful_print(codon_optimization_list: list) -> None: print(codon_optimization_list[nucleotide_sequence]) -def codon_optimization(protein_sequences, cell_type=None): +def codon_optimization(protein_sequences, cell_type:str = None): """ Makes 
codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse @@ -259,15 +256,16 @@ def lenght(seqs): def reverse(seqs): result = [seq[::-1] for seq in seqs] return result -def name_transform(seqs, format): + +def name_transform(seqs:list, format:int): result = [] - print(seqs) if format == 1: for seq in seqs: seq = seq.upper() for letter in seq: if check_amino_acid(letter): pass + else: return False result.append(seq) return result elif format == 3: @@ -287,21 +285,21 @@ def name_transform(seqs, format): return False -def check_amino_acid(input): - if len(input) == 1: - letter = input +def check_amino_acid(input_amino): + if len(input_amino) == 1: + letter = input_amino if letter not in amino_short_names_dic.keys(): print(f'Error {letter} is not an amino acid. Correct your input') return False else: return True - elif len(input) == 3: - triplet = input + elif len(input_amino) == 3: + triplet = input_amino if triplet not in amino_names_dic.keys(): print(f'Error {triplet} is not an amino acid. Correct your input') return False else: return True else: - print(f'Error {input} is incorrect form of amino acid notation. Correct your input') + print(f'Error {input_amino} is incorrect form of amino acid notation. Correct your input') return False From f3ad59f5ba6a895ea9bb7dac5ef4a516ce31e31a Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:18:31 +0300 Subject: [PATCH 16/88] Update protein_analysis_tool.py Redisign functions molecular_weight and one_letter_to_three. 
Dictionaries removed from functions --- protein_analysis_tool.py | 76 +++++++++++----------------------------- 1 file changed, 21 insertions(+), 55 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 40125a3..6cef8a1 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -20,6 +20,14 @@ 'Met': 'M', 'Pro': 'P', 'Ser': 'S', 'Tyr': 'Y', 'Thr': 'T', 'Trp': 'W', 'Phe': 'F', 'Cys': 'C'} + aa_weights = {'A': 89.09, 'R': 174.20, 'N': 132.12, + 'D': 133.10, 'C': 121.16, 'E': 147.13, + 'Q': 146.15, 'G': 75.07, 'H': 155.16, + 'I': 131.18, 'L': 131.18, 'K': 146.19, + 'M': 149.21, 'F': 165.19, 'P': 115.13, + 'S': 105.09, 'T': 119.12, 'W': 204.23, + 'Y': 181.19, 'V': 117.15} + def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:int) -> list: """ @@ -38,11 +46,11 @@ def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:i - list, the result of the operation """ aa_seqs = args - procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization') + procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization', 'lenght') aa_seqs = name_transform(aa_seqs, format): - for aa_seq in aa_seqs: - validate(aa_seq) + # for aa_seq in aa_seqs: + # validate(aa_seq) if procedure not in procedures: raise ValueError('Requested procedure is not defined') @@ -59,39 +67,18 @@ def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:i if procedure == 'codon_optimization': return codon_optimization(aa_seqs) -def validate(aa_seq: str) -> None: - """Validates if aa sequence consists of only amino acid characters""" - aa_seq_set = set(aa_seq.upper()) - all_aa = {'A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'} - difference = aa_seq_set.difference(all_aa) - if len(difference) > 0: - raise ValueError('Invalid alphabet, please use only single letter amino acid code') 
+# def validate(aa_seq: str) -> None: +# """Validates if aa sequence consists of only amino acid characters""" +# aa_seq_set = set(aa_seq.upper()) +# all_aa = {'A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'} +# difference = aa_seq_set.difference(all_aa) +# if len(difference) > 0: +# raise ValueError('Invalid alphabet, please use only single letter amino acid code') def molecular_weight(aa_seqs: list) -> list: """Calculates predicated molecular weight of aa sequences. Returns list of floats""" - aa_weights = { - 'A': 89.09, - 'R': 174.20, - 'N': 132.12, - 'D': 133.10, - 'C': 121.16, - 'E': 147.13, - 'Q': 146.15, - 'G': 75.07, - 'H': 155.16, - 'I': 131.18, - 'L': 131.18, - 'K': 146.19, - 'M': 149.21, - 'F': 165.19, - 'P': 115.13, - 'S': 105.09, - 'T': 119.12, - 'W': 204.23, - 'Y': 181.19, - 'V': 117.15 - } + molecular_weights = [] for seq in aa_seqs: total_weight = 0 @@ -104,34 +91,13 @@ def molecular_weight(aa_seqs: list) -> list: def one_letter_to_three(aa_seqs: list) -> list: """Translates one letter coded aa sequences to three letter coded""" - three_letter_codes = { - 'A': 'Ala', - 'R': 'Arg', - 'N': 'Asn', - 'D': 'Asp', - 'C': 'Cys', - 'E': 'Glu', - 'Q': 'Gln', - 'G': 'Gly', - 'H': 'His', - 'I': 'Ile', - 'L': 'Leu', - 'K': 'Lys', - 'M': 'Met', - 'F': 'Phe', - 'P': 'Pro', - 'S': 'Ser', - 'T': 'Thr', - 'W': 'Trp', - 'Y': 'Tyr', - 'V': 'Val' - } + three_letters_seqs = [] for seq in aa_seqs: three_letters_seq = [] for aa in seq: aa = aa.upper() - three_letters_seq.append(three_letter_codes[aa]) + three_letters_seq.append(amino_short_names_dic[aa]) three_letters_seqs.append(''.join(three_letters_seq)) return three_letters_seqs ======= From a14fd22b38ca08bb141b8e321c72182db9b8a5b5 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:29:50 +0300 Subject: [PATCH 17/88] Update protein_analysis_tool.py Add return to get_amino_acid_sum and correct output format 
--- protein_analysis_tool.py | 56 +++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 6cef8a1..8d16c46 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -5,6 +5,7 @@ 'M':'Met', 'P':'Pro', 'S':'Ser', 'Y':'Tyr', 'T':'Thr', 'W':'Trp', 'F':'Phe', 'C':'Cys'} + amino_names_dic = {'ala': 'A', 'arg': 'R', 'asn': 'N', 'asp': 'D', 'val': 'V', 'his': 'H', 'gly': 'G', 'gln': 'Q', 'glu': 'E', @@ -20,7 +21,8 @@ 'Met': 'M', 'Pro': 'P', 'Ser': 'S', 'Tyr': 'Y', 'Thr': 'T', 'Trp': 'W', 'Phe': 'F', 'Cys': 'C'} - aa_weights = {'A': 89.09, 'R': 174.20, 'N': 132.12, + +aa_weights = {'A': 89.09, 'R': 174.20, 'N': 132.12, 'D': 133.10, 'C': 121.16, 'E': 147.13, 'Q': 146.15, 'G': 75.07, 'H': 155.16, 'I': 131.18, 'L': 131.18, 'K': 146.19, @@ -100,8 +102,7 @@ def one_letter_to_three(aa_seqs: list) -> list: three_letters_seq.append(amino_short_names_dic[aa]) three_letters_seqs.append(''.join(three_letters_seq)) return three_letters_seqs -======= - + def get_amino_acid_sum(protein_sequences: list) -> None: """ @@ -114,37 +115,40 @@ def get_amino_acid_sum(protein_sequences: list) -> None: - None - Only print the amount of each amino acid in the injected protein sequences """ + result = [] for protein_sequence in range(len(protein_sequences)): - dictionary = {'A': 0, 'C': 0, 'D': 0, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, + amino_acid_count = {'A': 0, 'C': 0, 'D': 0, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0} for amino_acid in protein_sequences[protein_sequence]: - dictionary[amino_acid] += 1 + amino_acid_count[amino_acid] += 1 clone = { - 'Alanin': dictionary['A'], - 'Cysteine': dictionary['C'], - 'Aspartic acid': dictionary['D'], - 'Glutamic acid': dictionary['E'], - 'Phenylalanine': dictionary['F'], - 'Glycine': dictionary['G'], - 'Histidine': dictionary['H'], - 
'Isoleucine': dictionary['I'], - 'Lysine': dictionary['K'], - 'Leucine': dictionary['L'], - 'Methionine': dictionary['M'], - 'Aspargin': dictionary['N'], - 'Proline': dictionary['P'], - 'Glutamine': dictionary['Q'], - 'Arginine': dictionary['R'], - 'Serin': dictionary['S'], - 'Threonine': dictionary['T'], - 'Valin': dictionary['V'], - 'Tryptophan': dictionary['W'], - 'Tyrosine': dictionary['Y'] + 'Alanin': amino_acid_count['A'], + 'Cysteine': amino_acid_count['C'], + 'Aspartic acid': amino_acid_count['D'], + 'Glutamic acid': amino_acid_count['E'], + 'Phenylalanine': amino_acid_count['F'], + 'Glycine': amino_acid_count['G'], + 'Histidine': amino_acid_count['H'], + 'Isoleucine': amino_acid_count['I'], + 'Lysine': amino_acid_count['K'], + 'Leucine': amino_acid_count['L'], + 'Methionine': amino_acid_count['M'], + 'Aspargin': amino_acid_count['N'], + 'Proline': amino_acid_count['P'], + 'Glutamine': amino_acid_count['Q'], + 'Arginine': amino_acid_count['R'], + 'Serin': amino_acid_count['S'], + 'Threonine': amino_acid_count['T'], + 'Valin': amino_acid_count['V'], + 'Tryptophan': amino_acid_count['W'], + 'Tyrosine': amino_acid_count['Y'] } print('The number of amino acids in the sequence ', protein_sequence + 1, ':') for key, value in clone.items(): print(key, value) + result.append(amino_acid_count) +return result def beautiful_print(codon_optimization_list: list) -> None: @@ -157,7 +161,7 @@ def beautiful_print(codon_optimization_list: list) -> None: Return: - None - Only print the number of the introduced protein sequence and the codon-optimized DNA sequence - """ + """ for nucleotide_sequence in range(len(codon_optimization_list)): print('Sequence ', nucleotide_sequence + 1) print(codon_optimization_list[nucleotide_sequence]) From 76172bc93690e816e9f8aad80355832d2eb665b0 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:34:01 +0300 Subject: [PATCH 18/88] Update protein_analysis_tool.py Delete functions 
beautiful_print and reverse. Redesign codon_optimization output --- protein_analysis_tool.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 8d16c46..eef3dc3 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -151,20 +151,20 @@ def get_amino_acid_sum(protein_sequences: list) -> None: return result -def beautiful_print(codon_optimization_list: list) -> None: - """ - Makes a user-friendly output of a codon-optimized DNA sequence +# def codon_optimization_list: list) -> None: +# """ +# Makes a user-friendly output of a codon-optimized DNA sequence - Arguments: - - codon_optimization_list (list): list of codon-optimized protein sequence +# Arguments: +# - codon_optimization_list (list): list of codon-optimized protein sequence - Return: - - None - - Only print the number of the introduced protein sequence and the codon-optimized DNA sequence - """ - for nucleotide_sequence in range(len(codon_optimization_list)): - print('Sequence ', nucleotide_sequence + 1) - print(codon_optimization_list[nucleotide_sequence]) +# Return: +# - None +# - Only print the number of the introduced protein sequence and the codon-optimized DNA sequence +# """ +# for nucleotide_sequence in range(len(codon_optimization_list)): +# print('Sequence ', nucleotide_sequence + 1) +# print(codon_optimization_list[nucleotide_sequence]) def codon_optimization(protein_sequences, cell_type:str = None): @@ -182,15 +182,15 @@ def codon_optimization(protein_sequences, cell_type:str = None): """ if cell_type == 'Esherichia coli' or cell_type == 'E.coli': - codon_optimization_Ecoli = [] + codon_optimization_ecoli = [] ecoli_triplets = {'A': 'GCG', 'C': 'TGC', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGC', 'H': 'CAT', 'I': 'ATT', 'K': 'AAA', 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', 'P': 'CCG', 'Q': 'CAG', 'R': 'CGT', 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', 'W': 'TGG', 'Y': 'TAT'} 
replacer_ecoli = ecoli_triplets.get for amino_acid in range(len(protein_sequences)): - codon_optimization_Ecoli += [''.join([replacer_ecoli(n, n) for n in protein_sequences[amino_acid]])] - return beautiful_print(codon_optimization_Ecoli) + codon_optimization_ecoli += [''.join([replacer_ecoli(n, n) for n in protein_sequences[amino_acid]])] + return codon_optimization_ecoli if cell_type == 'Pichia pastoris' or cell_type == 'P.pastoris': codon_optimization_ppastoris = [] @@ -201,7 +201,7 @@ def codon_optimization(protein_sequences, cell_type:str = None): replacer_ppastoris = ppastoris_triplets.get for amino_acid in range(len(protein_sequences)): codon_optimization_ppastoris += [''.join([replacer_ppastoris(n, n) for n in protein_sequences[amino_acid]])] - return beautiful_print(codon_optimization_ppastoris) + return codon_optimization_ppastoris if cell_type == 'Mouse' or cell_type == 'mouse': codon_optimization_mouse = [] @@ -212,7 +212,7 @@ def codon_optimization(protein_sequences, cell_type:str = None): replacer_Mouse = mouse_triplets.get for amino_acid in range(len(protein_sequences)): codon_optimization_mouse += [''.join([replacer_Mouse(n, n) for n in protein_sequences[amino_acid]])] - return beautiful_print(codon_optimization_mouse) + return codon_optimization_mouse else: print('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' 'Mouse') @@ -223,9 +223,9 @@ def lenght(seqs): return result -def reverse(seqs): - result = [seq[::-1] for seq in seqs] - return result +# def reverse(seqs): +# result = [seq[::-1] for seq in seqs] +# return result def name_transform(seqs:list, format:int): result = [] From 40b85d46c035795f500d92c5fa70ecdc38618271 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:42:44 +0300 Subject: [PATCH 19/88] Update protein_analysis_tool.py Add support lenght function --- protein_analysis_tool.py | 7 +++++-- 1 file changed, 5 
insertions(+), 2 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index eef3dc3..1b83ff0 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -49,7 +49,7 @@ def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:i """ aa_seqs = args procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization', 'lenght') - aa_seqs = name_transform(aa_seqs, format): + aa_seqs = name_transform(aa_seqs, format) # for aa_seq in aa_seqs: # validate(aa_seq) @@ -68,6 +68,9 @@ def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:i if procedure == 'codon_optimization': return codon_optimization(aa_seqs) + + if procedure == 'lenght': + return lenght(aa_seqs) # def validate(aa_seq: str) -> None: # """Validates if aa sequence consists of only amino acid characters""" @@ -148,7 +151,7 @@ def get_amino_acid_sum(protein_sequences: list) -> None: for key, value in clone.items(): print(key, value) result.append(amino_acid_count) -return result + return result # def codon_optimization_list: list) -> None: From 44de06a99d4a198b454d136a713d654ee6e5bbbf Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:45:11 +0300 Subject: [PATCH 20/88] Update protein_analysis_tool.py Delete print from get_amino_acid_sum function --- protein_analysis_tool.py | 50 ++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 1b83ff0..dfb265a 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -125,31 +125,31 @@ def get_amino_acid_sum(protein_sequences: list) -> None: 'T': 0, 'V': 0, 'W': 0, 'Y': 0} for amino_acid in protein_sequences[protein_sequence]: amino_acid_count[amino_acid] += 1 - clone = { - 'Alanin': amino_acid_count['A'], - 'Cysteine': amino_acid_count['C'], - 'Aspartic acid': 
amino_acid_count['D'], - 'Glutamic acid': amino_acid_count['E'], - 'Phenylalanine': amino_acid_count['F'], - 'Glycine': amino_acid_count['G'], - 'Histidine': amino_acid_count['H'], - 'Isoleucine': amino_acid_count['I'], - 'Lysine': amino_acid_count['K'], - 'Leucine': amino_acid_count['L'], - 'Methionine': amino_acid_count['M'], - 'Aspargin': amino_acid_count['N'], - 'Proline': amino_acid_count['P'], - 'Glutamine': amino_acid_count['Q'], - 'Arginine': amino_acid_count['R'], - 'Serin': amino_acid_count['S'], - 'Threonine': amino_acid_count['T'], - 'Valin': amino_acid_count['V'], - 'Tryptophan': amino_acid_count['W'], - 'Tyrosine': amino_acid_count['Y'] - } - print('The number of amino acids in the sequence ', protein_sequence + 1, ':') - for key, value in clone.items(): - print(key, value) + # clone = { + # 'Alanin': amino_acid_count['A'], + # 'Cysteine': amino_acid_count['C'], + # 'Aspartic acid': amino_acid_count['D'], + # 'Glutamic acid': amino_acid_count['E'], + # 'Phenylalanine': amino_acid_count['F'], + # 'Glycine': amino_acid_count['G'], + # 'Histidine': amino_acid_count['H'], + # 'Isoleucine': amino_acid_count['I'], + # 'Lysine': amino_acid_count['K'], + # 'Leucine': amino_acid_count['L'], + # 'Methionine': amino_acid_count['M'], + # 'Aspargin': amino_acid_count['N'], + # 'Proline': amino_acid_count['P'], + # 'Glutamine': amino_acid_count['Q'], + # 'Arginine': amino_acid_count['R'], + # 'Serin': amino_acid_count['S'], + # 'Threonine': amino_acid_count['T'], + # 'Valin': amino_acid_count['V'], + # 'Tryptophan': amino_acid_count['W'], + # 'Tyrosine': amino_acid_count['Y'] + # } + # print('The number of amino acids in the sequence ', protein_sequence + 1, ':') + # for key, value in clone.items(): + # print(key, value) result.append(amino_acid_count) return result From e79e7778f86f27e6a8f7585a5f64952e7ff359d5 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:49:30 +0300 Subject: [PATCH 21/88] Update 
protein_analysis_tool.py Rename 'lenght' to 'length' --- protein_analysis_tool.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index dfb265a..1038cc8 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -48,7 +48,7 @@ def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:i - list, the result of the operation """ aa_seqs = args - procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization', 'lenght') + procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization', 'length') aa_seqs = name_transform(aa_seqs, format) # for aa_seq in aa_seqs: @@ -69,8 +69,8 @@ def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:i if procedure == 'codon_optimization': return codon_optimization(aa_seqs) - if procedure == 'lenght': - return lenght(aa_seqs) + if procedure == 'length': + return length(aa_seqs) # def validate(aa_seq: str) -> None: # """Validates if aa sequence consists of only amino acid characters""" @@ -151,7 +151,7 @@ def get_amino_acid_sum(protein_sequences: list) -> None: # for key, value in clone.items(): # print(key, value) result.append(amino_acid_count) - return result + return result # def codon_optimization_list: list) -> None: @@ -220,7 +220,7 @@ def codon_optimization(protein_sequences, cell_type:str = None): print('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' 'Mouse') -def lenght(seqs): +def length(seqs): result = [len(seq) for seq in seqs] print(result) return result From 4a34165399dcb1d10fcb18192e3a2b71677b1315 Mon Sep 17 00:00:00 2001 From: Sokolova Dasha <144246645+stegodasha@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:52:01 +0300 Subject: [PATCH 22/88] Update README.md Add to list of procedures get_amino_acid_sum and codon_optimization --- README.md | 4 +++- 1 
file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 395812d..a93d5a5 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ This function accepts arguments as a list of strings. Last argument in the list - `molecular_weight` — returns list of float values, that indicate predicted molecular weights of given aa sequences (in kDa) - `one_letter_to_three` — will return list of strings, containing the same sequences written in three-letter code -- +- `get_amino_acid_sum` — сounts the amount of each amino acid in the injected protein sequences +- `codon_optimization` — makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse - - - @@ -24,6 +25,7 @@ This function accepts arguments as a list of strings. Last argument in the list > protein("ACD", "AD", "one_letter_to_three") # ['AlaCysAsp', 'AlaAsp'] \ > protein("ACD", "AD", "molecular_weight") # [0.34, 0.22] \ + ## Possible erros: > `ValueError`('Invalid alphabet, please use only single letter amino acid code') # Will occure if character other than A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V are used. \ > `ValueError`('Requested procedure is not defined') # Will occure if last argument does not correspond to any listed procedure (see List of procedures). 
\ From 06d15058a29c0d1bc0807d38af1974c2258c4b31 Mon Sep 17 00:00:00 2001 From: Sokolova Dasha <144246645+stegodasha@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:03:12 +0300 Subject: [PATCH 23/88] Fixed docstrings for functions codon_optimization and get_amino_acid_sum in protein_analysis_tool.py --- protein_analysis_tool.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 1038cc8..96084b8 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -107,7 +107,7 @@ def one_letter_to_three(aa_seqs: list) -> list: return three_letters_seqs -def get_amino_acid_sum(protein_sequences: list) -> None: +def get_amino_acid_sum(protein_sequences: list) -> list: """ Counts the amount of each amino acid in the injected protein sequences @@ -115,7 +115,7 @@ def get_amino_acid_sum(protein_sequences: list) -> None: - protein_sequences (list): list of injected protein sequence Return: - - None + - List of dictionary with amino acid amount - Only print the amount of each amino acid in the injected protein sequences """ result = [] @@ -170,7 +170,7 @@ def get_amino_acid_sum(protein_sequences: list) -> None: # print(codon_optimization_list[nucleotide_sequence]) -def codon_optimization(protein_sequences, cell_type:str = None): +def codon_optimization(protein_sequences, cell_type:str) -> list: """ Makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse @@ -180,7 +180,7 @@ def codon_optimization(protein_sequences, cell_type:str = None): - cell_type (str): user-entered cell type for codon optimization Return: - - None + - List of codon-optimized DNA - Only print the number of the introduced protein sequence and the codon-optimized DNA sequence """ @@ -217,7 +217,7 @@ def codon_optimization(protein_sequences, cell_type:str = None): codon_optimization_mouse += [''.join([replacer_Mouse(n, n) for n in 
protein_sequences[amino_acid]])] return codon_optimization_mouse else: - print('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' + return print('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' 'Mouse') def length(seqs): From f835deaf3848826f47ffe1adf534ff0c6f8f0a1c Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:12:14 +0300 Subject: [PATCH 24/88] Update README.md Add contact information --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index a93d5a5..035925d 100644 --- a/README.md +++ b/README.md @@ -31,5 +31,17 @@ This function accepts arguments as a list of strings. Last argument in the list > `ValueError`('Requested procedure is not defined') # Will occure if last argument does not correspond to any listed procedure (see List of procedures). \ +## Private policy and contacts +This tool can be freely distributed and used. 
+If you have any suggestions for improving the tool or if you find a bug, please contact us by email +This tool was developed by the "workaholics" team: +Yulia Volkova volkova.yulia.leonidovna@gmail.com +Dasha Sokolova kalabanova_dasha@mail.ru +Ivan Kozin ivan.d.kozin@gmail.com +Team photo: +![Снимок экрана 2023-09-29 210559](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/35c0e02c-a905-405f-9f25-17e74e14901b) + + + From 42b8d30e1fe6a6e45983829388caee1c0f60aab9 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:13:41 +0300 Subject: [PATCH 25/88] Update README.md Add team photo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 035925d..ce0e614 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Yulia Volkova volkova.yulia.leonidovna@gmail.com Dasha Sokolova kalabanova_dasha@mail.ru Ivan Kozin ivan.d.kozin@gmail.com Team photo: -![Снимок экрана 2023-09-29 210559](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/35c0e02c-a905-405f-9f25-17e74e14901b) +![Снимок экрана 2023-09-29 210559_2](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/ad1302a1-d139-4c82-b7eb-d5b9ac1897e8) From 2b4ec1509325e543771fe8ce15b3db0b41f8e40a Mon Sep 17 00:00:00 2001 From: Sokolova Dasha <144246645+stegodasha@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:15:53 +0300 Subject: [PATCH 26/88] Fixed docstrings for functions codon_optimization and get _amino_acid_sum and delete the excess in protein_analysis_tool.py --- protein_analysis_tool.py | 45 +--------------------------------------- 1 file changed, 1 insertion(+), 44 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 96084b8..e4b71af 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -116,7 +116,6 @@ def get_amino_acid_sum(protein_sequences: list) -> list: Return: - List of dictionary with amino acid amount - - 
Only print the amount of each amino acid in the injected protein sequences """ result = [] for protein_sequence in range(len(protein_sequences)): @@ -125,52 +124,11 @@ def get_amino_acid_sum(protein_sequences: list) -> list: 'T': 0, 'V': 0, 'W': 0, 'Y': 0} for amino_acid in protein_sequences[protein_sequence]: amino_acid_count[amino_acid] += 1 - # clone = { - # 'Alanin': amino_acid_count['A'], - # 'Cysteine': amino_acid_count['C'], - # 'Aspartic acid': amino_acid_count['D'], - # 'Glutamic acid': amino_acid_count['E'], - # 'Phenylalanine': amino_acid_count['F'], - # 'Glycine': amino_acid_count['G'], - # 'Histidine': amino_acid_count['H'], - # 'Isoleucine': amino_acid_count['I'], - # 'Lysine': amino_acid_count['K'], - # 'Leucine': amino_acid_count['L'], - # 'Methionine': amino_acid_count['M'], - # 'Aspargin': amino_acid_count['N'], - # 'Proline': amino_acid_count['P'], - # 'Glutamine': amino_acid_count['Q'], - # 'Arginine': amino_acid_count['R'], - # 'Serin': amino_acid_count['S'], - # 'Threonine': amino_acid_count['T'], - # 'Valin': amino_acid_count['V'], - # 'Tryptophan': amino_acid_count['W'], - # 'Tyrosine': amino_acid_count['Y'] - # } - # print('The number of amino acids in the sequence ', protein_sequence + 1, ':') - # for key, value in clone.items(): - # print(key, value) result.append(amino_acid_count) return result -# def codon_optimization_list: list) -> None: -# """ -# Makes a user-friendly output of a codon-optimized DNA sequence - -# Arguments: -# - codon_optimization_list (list): list of codon-optimized protein sequence - -# Return: -# - None -# - Only print the number of the introduced protein sequence and the codon-optimized DNA sequence -# """ -# for nucleotide_sequence in range(len(codon_optimization_list)): -# print('Sequence ', nucleotide_sequence + 1) -# print(codon_optimization_list[nucleotide_sequence]) - - -def codon_optimization(protein_sequences, cell_type:str) -> list: +def codon_optimization(protein_sequences: list, cell_type:str) -> list: 
""" Makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse @@ -181,7 +139,6 @@ def codon_optimization(protein_sequences, cell_type:str) -> list: Return: - List of codon-optimized DNA - - Only print the number of the introduced protein sequence and the codon-optimized DNA sequence """ if cell_type == 'Esherichia coli' or cell_type == 'E.coli': From 7b877eb588f3dd12d51fcf2323703bbcbf403f2b Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:18:11 +0300 Subject: [PATCH 27/88] Update README.md Debug new lines problems in contacts --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ce0e614..266d40a 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,12 @@ This function accepts arguments as a list of strings. Last argument in the list ## Private policy and contacts This tool can be freely distributed and used. -If you have any suggestions for improving the tool or if you find a bug, please contact us by email -This tool was developed by the "workaholics" team: -Yulia Volkova volkova.yulia.leonidovna@gmail.com -Dasha Sokolova kalabanova_dasha@mail.ru -Ivan Kozin ivan.d.kozin@gmail.com -Team photo: +\If you have any suggestions for improving the tool or if you find a bug, please contact us by email. 
+\This tool was developed by the "workaholics" team: +\Yulia Volkova volkova.yulia.leonidovna@gmail.com +\Dasha Sokolova kalabanova_dasha@mail.ru +\Team leader: Ivan Kozin ivan.d.kozin@gmail.com +\Team photo: ![Снимок экрана 2023-09-29 210559_2](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/ad1302a1-d139-4c82-b7eb-d5b9ac1897e8) From 088d895f5d04bb2858ebdf90c0d7eaff65adc5b5 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:21:45 +0300 Subject: [PATCH 28/88] Update README.md Debug newlines --- README.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 266d40a..fbe2f9e 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,18 @@ This function accepts arguments as a list of strings. Last argument in the list ## Private policy and contacts This tool can be freely distributed and used. -\If you have any suggestions for improving the tool or if you find a bug, please contact us by email. -\This tool was developed by the "workaholics" team: -\Yulia Volkova volkova.yulia.leonidovna@gmail.com -\Dasha Sokolova kalabanova_dasha@mail.ru -\Team leader: Ivan Kozin ivan.d.kozin@gmail.com -\Team photo: +\ +If you have any suggestions for improving the tool or if you find a bug, please contact us by email. 
+\ +This tool was developed by the "workaholics" team: +\ +Yulia Volkova volkova.yulia.leonidovna@gmail.com +\ +Dasha Sokolova kalabanova_dasha@mail.ru +\ +Team leader: Ivan Kozin ivan.d.kozin@gmail.com +\ +Team photo: ![Снимок экрана 2023-09-29 210559_2](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/ad1302a1-d139-4c82-b7eb-d5b9ac1897e8) From 5f30ff73f3f1e484457e779053372ccf179be28b Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:25:09 +0300 Subject: [PATCH 29/88] Update README.md Newlines debug --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index fbe2f9e..e99f967 100644 --- a/README.md +++ b/README.md @@ -33,17 +33,17 @@ This function accepts arguments as a list of strings. Last argument in the list ## Private policy and contacts This tool can be freely distributed and used. -\ +
If you have any suggestions for improving the tool or if you find a bug, please contact us by email. -\ +
This tool was developed by the "workaholics" team: -\ +
Yulia Volkova volkova.yulia.leonidovna@gmail.com -\ +
Dasha Sokolova kalabanova_dasha@mail.ru -\ +
Team leader: Ivan Kozin ivan.d.kozin@gmail.com -\ +
Team photo: ![Снимок экрана 2023-09-29 210559_2](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/ad1302a1-d139-4c82-b7eb-d5b9ac1897e8) From e4b2de0d61d0f077d90c1598954fd21d308af227 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:37:54 +0200 Subject: [PATCH 30/88] Update protein_analysis_tool.py Add arguments and returns to docstring of main, correct arguments types --- protein_analysis_tool.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index e4b71af..3f0c7e5 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -31,21 +31,27 @@ 'Y': 181.19, 'V': 117.15} -def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:int) -> list: +def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format:int) -> list: """ - Function protein does: - -calculate predicted molecular weight of amino acid (aa) sequences in kDa (procedure name: molecular_weight) - -translate aa sequences from one-letter to three-letter code - - - - - - - - + Function protein_analysis: + - calculates predicted molecular weight of amino acid sequences in kDa (procedure name: molecular_weight) + - translate aa sequences from one-letter to three-letter code (procedure name: one_letter_to_three) + - calculates total amount of each amino acid in the sequences (procedure name: get_amino_acid_sum) + - makes DNA based codon optimization for the introduced amino acid sequences, support 3 types of cells: Esherichia coli, Pichia pastoris, Mouse (procedure name: codon_optimization) + - calculates length of amino acid sequences (procedure name: length) + Arguments: - - - - + - tuple of protein sequences written one letter or three letter code (not mixed) + - name of procedure as string + - cell type (required only for codon_optimization procedure) + - format of code for the 
protein sequences as int: 1 for one letter, 3 for three letter code Return: - - list, the result of the operation + - molecular_weight procedure returns list of floats + - one_letter_to_three procedure returns list of strings + - get_amino_acid_sum procedure returns list of dictionaries + - codon_optimization procedure returns list of strings + - length procedure returns list of int values """ aa_seqs = args procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization', 'length') @@ -81,7 +87,7 @@ def protein_analysis(*args: list, procedure: str, cell_type:str = None, format:i # raise ValueError('Invalid alphabet, please use only single letter amino acid code') -def molecular_weight(aa_seqs: list) -> list: +def molecular_weight(aa_seqs: tuple) -> list: """Calculates predicated molecular weight of aa sequences. Returns list of floats""" molecular_weights = [] @@ -94,7 +100,7 @@ def molecular_weight(aa_seqs: list) -> list: return molecular_weights -def one_letter_to_three(aa_seqs: list) -> list: +def one_letter_to_three(aa_seqs: tuple) -> list: """Translates one letter coded aa sequences to three letter coded""" three_letters_seqs = [] @@ -107,7 +113,7 @@ def one_letter_to_three(aa_seqs: list) -> list: return three_letters_seqs -def get_amino_acid_sum(protein_sequences: list) -> list: +def get_amino_acid_sum(protein_sequences: tuple) -> list: """ Counts the amount of each amino acid in the injected protein sequences @@ -128,7 +134,7 @@ def get_amino_acid_sum(protein_sequences: list) -> list: return result -def codon_optimization(protein_sequences: list, cell_type:str) -> list: +def codon_optimization(protein_sequences: tuple, cell_type:str) -> list: """ Makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse @@ -174,7 +180,7 @@ def codon_optimization(protein_sequences: list, cell_type:str) -> list: codon_optimization_mouse += [''.join([replacer_Mouse(n, 
n) for n in protein_sequences[amino_acid]])] return codon_optimization_mouse else: - return print('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' + print('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' 'Mouse') def length(seqs): From b82302145a790b101df6e3013c4e93f752650865 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:43:20 +0300 Subject: [PATCH 31/88] Update protein_analysis_tool.py Add input types to length and check_amino_acid functions --- protein_analysis_tool.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 3f0c7e5..850147d 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -183,9 +183,8 @@ def codon_optimization(protein_sequences: tuple, cell_type:str) -> list: print('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' 'Mouse') -def length(seqs): +def length(seqs:touple): result = [len(seq) for seq in seqs] - print(result) return result @@ -193,7 +192,7 @@ def length(seqs): # result = [seq[::-1] for seq in seqs] # return result -def name_transform(seqs:list, format:int): +def name_transform(seqs:touple, format:int): result = [] if format == 1: for seq in seqs: @@ -221,7 +220,7 @@ def name_transform(seqs:list, format:int): return False -def check_amino_acid(input_amino): +def check_amino_acid(input_amino:str): if len(input_amino) == 1: letter = input_amino if letter not in amino_short_names_dic.keys(): From 4b0962ce6d57204fddc7af8dc191f1826409adc2 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:44:42 +0300 Subject: [PATCH 32/88] Update protein_analysis_tool.py Correct input codon_optimization in main function --- protein_analysis_tool.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 850147d..3cc0053 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -73,7 +73,7 @@ def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format: return get_amino_acid_sum(aa_seqs) if procedure == 'codon_optimization': - return codon_optimization(aa_seqs) + return codon_optimization(aa_seqs, cell_type) if procedure == 'length': return length(aa_seqs) From a9ad144cee17d0c2c8f3ac8222056e3fdbe0df55 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:49:07 +0300 Subject: [PATCH 33/88] Update protein_analysis_tool.py Redesign codon optimization error message --- protein_analysis_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 3cc0053..c4e2fb5 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -180,7 +180,7 @@ def codon_optimization(protein_sequences: tuple, cell_type:str) -> list: codon_optimization_mouse += [''.join([replacer_Mouse(n, n) for n in protein_sequences[amino_acid]])] return codon_optimization_mouse else: - print('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' + raise ValueError('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' 'Mouse') def length(seqs:touple): From bd060b2ac1ea100228c2d07734b5b66c82fc9e58 Mon Sep 17 00:00:00 2001 From: Sokolova Dasha <144246645+stegodasha@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:53:21 +0300 Subject: [PATCH 34/88] Add example of use function get_amino_acid_sum and codon_optimization --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index e99f967..dd2ef3b 100644 --- a/README.md +++ b/README.md @@ -24,11 +24,16 @@ This function accepts arguments as a 
list of strings. Last argument in the list > protein("ACD", "AD", "one_letter_to_three") # ['AlaCysAsp', 'AlaAsp'] \ > protein("ACD", "AD", "molecular_weight") # [0.34, 0.22] \ +> protein("ACD", "AD", "get_amino_acid_sum") # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ +> protein("ACD", "AD", "codon_optimization", cell_type = 'E.coli') # ['GCGTGCGAT', 'GCGGAT'] + + ## Possible erros: > `ValueError`('Invalid alphabet, please use only single letter amino acid code') # Will occure if character other than A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V are used. \ > `ValueError`('Requested procedure is not defined') # Will occure if last argument does not correspond to any listed procedure (see List of procedures). \ +> `ValueError`('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse) # Will occure if the cell type is incorrectly entered to optimize codons. 
## Private policy and contacts From 5687bcfc19bb6fe2203cd20e9070289f127dd381 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:24:10 +0300 Subject: [PATCH 35/88] Update protein_analysis_tool.py Add amino_brutto dictionary and brutto_count function --- protein_analysis_tool.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index c4e2fb5..989e251 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -30,6 +30,17 @@ 'S': 105.09, 'T': 119.12, 'W': 204.23, 'Y': 181.19, 'V': 117.15} +amino_brutto = {'A':(3,7,1,2,0), 'R':(6,14,4,2,0), + 'N':(4,8,2,3,0), 'D':(4,7,1,4,0), + 'V':(5,11,1,2,0), 'H':(6,9,3,2,0), + 'G':(2,5,1,2,0), 'Q':(5,10,2,3,0), + 'E':(5,9,1,4,0), 'I':(6,13,1,2,0), + 'L':(6,13,1,2,0), 'K':(6,14,2,2,0), + 'M':(5,11,1,2,1), 'P':(5,9,1,2,0), + 'S':(3,7,1,3,0), 'Y':(9,11,1,3,0), + 'T':(4,9,11,1,3,0), 'W':(11,12,2,2,0), + 'F':(9,11,1,2,0), 'C':(3,7,1,2,1)} + def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format:int) -> list: """ @@ -54,7 +65,7 @@ def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format: - length procedure returns list of int values """ aa_seqs = args - procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization', 'length') + procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization', 'length', 'brutto_count') aa_seqs = name_transform(aa_seqs, format) # for aa_seq in aa_seqs: @@ -77,6 +88,9 @@ def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format: if procedure == 'length': return length(aa_seqs) + + if procedure == 'brutto_count': + return brutto_count(aa_seqs) # def validate(aa_seq: str) -> None: # """Validates if aa sequence consists of only amino acid characters""" @@ -236,5 +250,16 @@ def 
check_amino_acid(input_amino:str): else: print(f'Error {input_amino} is incorrect form of amino acid notation. Correct your input') return False - + + +def brutto_count(seqs:touple): + elements = ['C', 'H', 'N', 'O', 'S'] + result = [] + for seq in seqs: + brutto_list = [amino_brutto.get(letter) for letter in seq] + brutto_pair = list(zip(*brutto_list)) + brutto = [sum(i) for i in brutto_pair] + brutto_dict = dict(zip(elements, brutto)) + result.append(brutto_dict) + return result From 8edf3e7603137f463be49776c5c9815dfa8759cb Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:27:26 +0300 Subject: [PATCH 36/88] Update protein_analysis_tool.py Debug input types --- protein_analysis_tool.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 989e251..6ab29d2 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -197,7 +197,7 @@ def codon_optimization(protein_sequences: tuple, cell_type:str) -> list: raise ValueError('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' 'Mouse') -def length(seqs:touple): +def length(seqs:tuple): result = [len(seq) for seq in seqs] return result @@ -206,7 +206,7 @@ def length(seqs:touple): # result = [seq[::-1] for seq in seqs] # return result -def name_transform(seqs:touple, format:int): +def name_transform(seqs:tuple, format:int): result = [] if format == 1: for seq in seqs: @@ -252,7 +252,7 @@ def check_amino_acid(input_amino:str): return False -def brutto_count(seqs:touple): +def brutto_count(seqs:tuple): elements = ['C', 'H', 'N', 'O', 'S'] result = [] for seq in seqs: From 312d090364f15d4a48ff6404296ba397c9495d13 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:32:55 +0200 Subject: [PATCH 37/88] Update README.md Make corrections to adjust readme to actual code --- 
README.md | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index dd2ef3b..11a72b9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,11 @@ # Protein Info -This project consists of one function "protein" that helps user to predict molecular weight of amino acid (aa) sequences, translate aa sequences from one-letter to three-letter code etc. Sequences are accepted as single-letter code: 20 aa without stop codon (A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V). +This project consists of one function "protein_analysis" that helps user to: +- predict molecular weight of amino acid (aa) sequences +- translate aa sequences from one-letter to three-letter code +- calculate total amount of each amino acid in the sequences +- make DNA based codon optimization for the introduced amino acid sequences with the support for 3 cell types: Esherichia coli, Pichia pastoris, Mouse +- calculate length of amino acid sequences ## Technology: @@ -8,7 +13,13 @@ python ## How to use: -This function accepts arguments as a list of strings. Last argument in the list should be a procedure that should be applied to the sequences. +Call the "protein_analysis" funcion with following arguments. +Requred arguments: + - tuple of protein sequences written one letter or three letter code without stop codos. Please do not use sequences in different formats in the same function call! + - name of procedure as string (see list of precedures) + - format of code for the protein sequences as int: 1 for one letter, 3 for three letter code +Optional argument: + - cell type (required only for codon_optimization procedure). Accepted cell types Esherichia coli, Pichia pastoris, Mouse ## List of procedures: @@ -16,23 +27,21 @@ This function accepts arguments as a list of strings. 
Last argument in the list - `one_letter_to_three` — will return list of strings, containing the same sequences written in three-letter code - `get_amino_acid_sum` — сounts the amount of each amino acid in the injected protein sequences - `codon_optimization` — makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse -- -- -- +- `length` - calculates length of amino acid sequences +- `brutto_count` ## Example of use: -> protein("ACD", "AD", "one_letter_to_three") # ['AlaCysAsp', 'AlaAsp'] \ -> protein("ACD", "AD", "molecular_weight") # [0.34, 0.22] \ -> protein("ACD", "AD", "get_amino_acid_sum") # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ -> protein("ACD", "AD", "codon_optimization", cell_type = 'E.coli') # ['GCGTGCGAT', 'GCGGAT'] +> protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] \ +> protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] \ +> protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ +> protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1) # ['GCGTGCGAT', 'GCGGAT'] -## Possible erros: -> `ValueError`('Invalid alphabet, please use only single letter amino acid code') # Will occure if character other than 
A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V are used. \ -> `ValueError`('Requested procedure is not defined') # Will occure if last argument does not correspond to any listed procedure (see List of procedures). \ +## Possible errors: +> `ValueError`('Requested procedure is not defined') # Will occure if proc argument does not correspond to any listed procedure (see List of procedures). \ > `ValueError`('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse) # Will occure if the cell type is incorrectly entered to optimize codons. From db553cdf3291c6a69139b6f3a5a8ccc273abd195 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:37:10 +0200 Subject: [PATCH 38/88] Update README.md Make examples of use prettier --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 11a72b9..379063e 100644 --- a/README.md +++ b/README.md @@ -32,10 +32,10 @@ Optional argument: ## Example of use: -> protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] \ -> protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] \ -> protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ -> protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1) # ['GCGTGCGAT', 'GCGGAT'] +> ```protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1)``` # ['AlaCysAsp', 'AlaAsp'] \ +> ```protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3)``` # [0.37, 0.22] 
\ +> ```protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1)``` # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ +> ```protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1)``` # ['GCGTGCGAT', 'GCGGAT'] From f03c1aba897e20a3a1c4d220acf5292359d6a165 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:41:41 +0300 Subject: [PATCH 39/88] Update protein_analysis_tool.py Restructured main function (protein_analysis). Add new way to call other functions --- protein_analysis_tool.py | 45 ++++++++++++---------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 6ab29d2..eabb030 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -64,41 +64,22 @@ def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format: - codon_optimization procedure returns list of strings - length procedure returns list of int values """ + aa_seqs = args - procedures = ('molecular_weight', 'one_letter_to_three', 'get_amino_acid_sum', 'codon_optimization', 'length', 'brutto_count') aa_seqs = name_transform(aa_seqs, format) - - # for aa_seq in aa_seqs: - # validate(aa_seq) - - if procedure not in procedures: + procedures = {'molecular_weight': molecular_weight, + 'one_letter_to_three': one_letter_to_three, + 'get_amino_acid_sum': get_amino_acid_sum, + 'codon_optimization': codon_optimization, + 'length': lenght, + 'brutto_count': brutto_count} + + if procedure not in procedures.keys(): raise ValueError('Requested procedure is not defined') - - if procedure == 'molecular_weight': - return 
molecular_weight(aa_seqs) - - if procedure == 'one_letter_to_three': - return one_letter_to_three(aa_seqs) - - if procedure == 'get_amino_acid_sum': - return get_amino_acid_sum(aa_seqs) - - if procedure == 'codon_optimization': - return codon_optimization(aa_seqs, cell_type) - - if procedure == 'length': - return length(aa_seqs) - - if procedure == 'brutto_count': - return brutto_count(aa_seqs) - -# def validate(aa_seq: str) -> None: -# """Validates if aa sequence consists of only amino acid characters""" -# aa_seq_set = set(aa_seq.upper()) -# all_aa = {'A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'} -# difference = aa_seq_set.difference(all_aa) -# if len(difference) > 0: -# raise ValueError('Invalid alphabet, please use only single letter amino acid code') + elif procedure == 'codon_optimization': + return procedures.get(procedure)(aa_seqs, cell_type) + else: + return procedures.get(procedure)(aa_seqs) def molecular_weight(aa_seqs: tuple) -> list: From 5f60697b8b6bb38694e292cc7a2cb27baa5ba770 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:46:09 +0300 Subject: [PATCH 40/88] Update protein_analysis_tool.py Change errors message in check_amino_acid and name_transform functions --- protein_analysis_tool.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index eabb030..eb81e1b 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -182,10 +182,6 @@ def length(seqs:tuple): result = [len(seq) for seq in seqs] return result - -# def reverse(seqs): -# result = [seq[::-1] for seq in seqs] -# return result def name_transform(seqs:tuple, format:int): result = [] @@ -211,26 +207,22 @@ def name_transform(seqs:tuple, format:int): return result else: - print('Error unsupported format. 
Only formats 1 and 3 are supported') - return False - + raise ValueError('Error unsupported format. Only formats 1 and 3 are supported') + def check_amino_acid(input_amino:str): if len(input_amino) == 1: letter = input_amino if letter not in amino_short_names_dic.keys(): - print(f'Error {letter} is not an amino acid. Correct your input') - return False + raise ValueError(f'Error {letter} is not an amino acid. Correct your input') else: return True elif len(input_amino) == 3: triplet = input_amino if triplet not in amino_names_dic.keys(): - print(f'Error {triplet} is not an amino acid. Correct your input') - return False + raise ValueError(f'Error {triplet} is not an amino acid. Correct your input') else: return True else: - print(f'Error {input_amino} is incorrect form of amino acid notation. Correct your input') - return False + raise ValueError(f'Error {input_amino} is incorrect form of amino acid notation. Correct your input') def brutto_count(seqs:tuple): From f3360e0faec42c331a2b932f23cbbec81356db93 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:52:53 +0300 Subject: [PATCH 41/88] Update protein_analysis_tool.py moved dictionaries from functions to the beginning of the code --- protein_analysis_tool.py | 46 +++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index eb81e1b..0a19666 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -41,6 +41,29 @@ 'T':(4,9,11,1,3,0), 'W':(11,12,2,2,0), 'F':(9,11,1,2,0), 'C':(3,7,1,2,1)} +ecoli_triplets = {'A': 'GCG', 'C': 'TGC', 'D': 'GAT', + 'E': 'GAA', 'F': 'TTT', 'G': 'GGC', + 'H': 'CAT', 'I': 'ATT', 'K': 'AAA', + 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', + 'P': 'CCG', 'Q': 'CAG', 'R': 'CGT', + 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', + 'W': 'TGG', 'Y': 'TAT'} + +ppastoris_triplets = {'A': 'GCT', 'C': 'TGT', 'D': 'GAT', + 'E': 'GAA', 'F': 'TTT', 
'G': 'GGT', + 'H': 'CAT', 'I': 'ATT', 'K': 'AAG', + 'L': 'TTG', 'M': 'ATG', 'N': 'AAC', + 'P': 'CCA', 'Q': 'CAA', 'R': 'AGA', + 'S': 'TCT', 'T': 'ACT', 'V': 'GTT', + 'W': 'TGG', 'Y': 'TAC'} + +mouse_triplets = {'A': 'GCC', 'C': 'TGC', 'D': 'GAC', + 'E': 'GAG', 'F': 'TTC', 'G': 'GGC', + 'H': 'CAC', 'I': 'ATC', 'K': 'AAG', + 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', + 'P': 'CCC', 'Q': 'CAG', 'R': 'CGG', + 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', + 'W': 'TGG', 'Y': 'TAC'} def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format:int) -> list: """ @@ -50,6 +73,7 @@ def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format: - calculates total amount of each amino acid in the sequences (procedure name: get_amino_acid_sum) - makes DNA based codon optimization for the introduced amino acid sequences, support 3 types of cells: Esherichia coli, Pichia pastoris, Mouse (procedure name: codon_optimization) - calculates length of amino acid sequences (procedure name: length) + - counts the number of atoms of each type in a sequence (procedure name: brutto_count) Arguments: - tuple of protein sequences written one letter or three letter code (not mixed) @@ -120,9 +144,13 @@ def get_amino_acid_sum(protein_sequences: tuple) -> list: """ result = [] for protein_sequence in range(len(protein_sequences)): - amino_acid_count = {'A': 0, 'C': 0, 'D': 0, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, - 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, - 'T': 0, 'V': 0, 'W': 0, 'Y': 0} + amino_acid_count = {'A': 0, 'C': 0, 'D': 0, + 'E': 0, 'F': 0, 'G': 0, + 'H': 0, 'I': 0, 'K': 0, + 'L': 0, 'M': 0, 'N': 0, + 'P': 0, 'Q': 0, 'R': 0, + 'S': 0, 'T': 0, 'V': 0, + 'W': 0, 'Y': 0} for amino_acid in protein_sequences[protein_sequence]: amino_acid_count[amino_acid] += 1 result.append(amino_acid_count) @@ -144,10 +172,6 @@ def codon_optimization(protein_sequences: tuple, cell_type:str) -> list: if cell_type == 'Esherichia coli' or cell_type == 'E.coli': 
codon_optimization_ecoli = [] - ecoli_triplets = {'A': 'GCG', 'C': 'TGC', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGC', - 'H': 'CAT', 'I': 'ATT', 'K': 'AAA', 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', - 'P': 'CCG', 'Q': 'CAG', 'R': 'CGT', 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', - 'W': 'TGG', 'Y': 'TAT'} replacer_ecoli = ecoli_triplets.get for amino_acid in range(len(protein_sequences)): codon_optimization_ecoli += [''.join([replacer_ecoli(n, n) for n in protein_sequences[amino_acid]])] @@ -155,10 +179,6 @@ def codon_optimization(protein_sequences: tuple, cell_type:str) -> list: if cell_type == 'Pichia pastoris' or cell_type == 'P.pastoris': codon_optimization_ppastoris = [] - ppastoris_triplets = {'A': 'GCT', 'C': 'TGT', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT', 'G': 'GGT', - 'H': 'CAT', 'I': 'ATT', 'K': 'AAG', 'L': 'TTG', 'M': 'ATG', 'N': 'AAC', - 'P': 'CCA', 'Q': 'CAA', 'R': 'AGA', 'S': 'TCT', 'T': 'ACT', 'V': 'GTT', - 'W': 'TGG', 'Y': 'TAC'} replacer_ppastoris = ppastoris_triplets.get for amino_acid in range(len(protein_sequences)): codon_optimization_ppastoris += [''.join([replacer_ppastoris(n, n) for n in protein_sequences[amino_acid]])] @@ -166,10 +186,6 @@ def codon_optimization(protein_sequences: tuple, cell_type:str) -> list: if cell_type == 'Mouse' or cell_type == 'mouse': codon_optimization_mouse = [] - mouse_triplets = {'A': 'GCC', 'C': 'TGC', 'D': 'GAC', 'E': 'GAG', 'F': 'TTC', 'G': 'GGC', - 'H': 'CAC', 'I': 'ATC', 'K': 'AAG', 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', - 'P': 'CCC', 'Q': 'CAG', 'R': 'CGG', 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', - 'W': 'TGG', 'Y': 'TAC'} replacer_Mouse = mouse_triplets.get for amino_acid in range(len(protein_sequences)): codon_optimization_mouse += [''.join([replacer_Mouse(n, n) for n in protein_sequences[amino_acid]])] From 416a09cc8d54e971e7fc4ec8823a203a60e8c2e9 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:59:14 +0300 Subject: [PATCH 42/88] Update README.md Change design to 
python code in "examples of use" and "Possible errors" --- README.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 379063e..9da11dc 100644 --- a/README.md +++ b/README.md @@ -32,18 +32,20 @@ Optional argument: ## Example of use: -> ```protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1)``` # ['AlaCysAsp', 'AlaAsp'] \ -> ```protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3)``` # [0.37, 0.22] \ -> ```protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1)``` # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ -> ```protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1)``` # ['GCGTGCGAT', 'GCGGAT'] - +```python +protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] \ +protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] \ +protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1 # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ +protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1)``` # ['GCGTGCGAT', 'GCGGAT'] +``` ## Possible errors: +```python > `ValueError`('Requested procedure is not defined') # Will occure if proc argument does not correspond to any listed procedure (see List of procedures). 
\ > `ValueError`('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse) # Will occure if the cell type is incorrectly entered to optimize codons. - +``` ## Private policy and contacts This tool can be freely distributed and used. From f5ded0f0fb1e174c2e95add9d7e425012a2a8210 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 22:00:35 +0300 Subject: [PATCH 43/88] Update README.md add ")" in python code --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9da11dc..11d18ce 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Optional argument: ```python protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] \ protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] \ -protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1 # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ +protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1)``` # ['GCGTGCGAT', 'GCGGAT'] ``` From a2e31c2476456229d5c36cbe438b7981cdbe6a76 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: 
Sat, 30 Sep 2023 22:01:57 +0300 Subject: [PATCH 44/88] Update protein_analysis_tool.py Rename "lenght" to "length" --- protein_analysis_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 0a19666..3873f31 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -95,7 +95,7 @@ def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format: 'one_letter_to_three': one_letter_to_three, 'get_amino_acid_sum': get_amino_acid_sum, 'codon_optimization': codon_optimization, - 'length': lenght, + 'length': lengtр, 'brutto_count': brutto_count} if procedure not in procedures.keys(): From 6441637df6ac5b6be76decc86c2247a00b62249f Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 22:03:37 +0300 Subject: [PATCH 45/88] Update README.md Newlines debug --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 11d18ce..c1edb99 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,10 @@ Optional argument: ## Example of use: ```python -protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] \ -protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] \ -protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] \ +protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] +protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] +protein_analysis("ACD", "AD", 
procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, +{'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1)``` # ['GCGTGCGAT', 'GCGGAT'] ``` From c1068c50262bbaecf2355aed09a53264a635711a Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 22:05:54 +0300 Subject: [PATCH 46/88] Update README.md Newlines debug --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c1edb99..33b4405 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,9 @@ Optional argument: ```python protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] -protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, -{'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] +protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) + # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, +# {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] protein_analysis("ACD", "AD", 
procedure="codon_optimization", cell_type = 'E.coli', format=1)``` # ['GCGTGCGAT', 'GCGGAT'] ``` From 8fa218416a29165e0338dc2d0aa88271c9eece95 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:06:16 +0200 Subject: [PATCH 47/88] Update protein_analysis_tool.py Fix length name in procedure dictionary --- protein_analysis_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 3873f31..af7cf0c 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -95,7 +95,7 @@ def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format: 'one_letter_to_three': one_letter_to_three, 'get_amino_acid_sum': get_amino_acid_sum, 'codon_optimization': codon_optimization, - 'length': lengtр, + 'length': length, 'brutto_count': brutto_count} if procedure not in procedures.keys(): From 22fcc698eaece305ec6957620bc263fdbd6db223 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 22:08:31 +0300 Subject: [PATCH 48/88] Update README.md Debug comments in python code --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 33b4405..7895c78 100644 --- a/README.md +++ b/README.md @@ -35,9 +35,7 @@ Optional argument: ```python protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] -protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) - # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, -# {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] 
+protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1)``` # ['GCGTGCGAT', 'GCGGAT'] ``` From c6fc03d87f9df1ef83960364f845505aad52118b Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 22:09:59 +0300 Subject: [PATCH 49/88] Update README.md Comments debug --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7895c78..88ad36f 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,8 @@ Optional argument: ```python protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] -protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] +protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, + # {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] protein_analysis("ACD", "AD", 
procedure="codon_optimization", cell_type = 'E.coli', format=1)``` # ['GCGTGCGAT', 'GCGGAT'] ``` From c3cf0c3c643480805795a28960653512323b99e9 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:25:12 +0200 Subject: [PATCH 50/88] Update README.md Add personal contribution --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 88ad36f..63466cb 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,11 @@ Team leader: Ivan Kozin ivan.d.kozin@gmail.com Team photo: ![Снимок экрана 2023-09-29 210559_2](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/ad1302a1-d139-4c82-b7eb-d5b9ac1897e8) - +## Personal contribution +Ivan Kozin as a team leader coordinated work with github repository and wrote functions: length, brutto_count, check_amino_acid, name_transform. +Dasha Sokolova wrote functions: get_amino_acid_sum and codon_optimization functions. +Yulia Volkova wrote main function (protein_analysis), molecular_weight and one_letter_to_three functions. +Writting README, debugging code and testing it has been done by the efforts of all team. From bab0d9581c1b38f1921a4d92008c1b33945d9b19 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:25:49 +0200 Subject: [PATCH 51/88] Update README.md Add new lines to presonal contribution --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 63466cb..826ab34 100644 --- a/README.md +++ b/README.md @@ -65,9 +65,9 @@ Team photo: ![Снимок экрана 2023-09-29 210559_2](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/ad1302a1-d139-4c82-b7eb-d5b9ac1897e8) ## Personal contribution -Ivan Kozin as a team leader coordinated work with github repository and wrote functions: length, brutto_count, check_amino_acid, name_transform. 
-Dasha Sokolova wrote functions: get_amino_acid_sum and codon_optimization functions. -Yulia Volkova wrote main function (protein_analysis), molecular_weight and one_letter_to_three functions. +Ivan Kozin as a team leader coordinated work with github repository and wrote functions: length, brutto_count, check_amino_acid, name_transform.
+Dasha Sokolova wrote functions: get_amino_acid_sum and codon_optimization functions.
+Yulia Volkova wrote main function (protein_analysis), molecular_weight and one_letter_to_three functions.
Writting README, debugging code and testing it has been done by the efforts of all team. From 5a7971408045e8a1d47e44a201748e073b6cfc7f Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:48:31 +0200 Subject: [PATCH 52/88] Update README.md make contribution part pretty --- README.md | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 826ab34..d262558 100644 --- a/README.md +++ b/README.md @@ -65,9 +65,22 @@ Team photo: ![Снимок экрана 2023-09-29 210559_2](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/ad1302a1-d139-4c82-b7eb-d5b9ac1897e8) ## Personal contribution -Ivan Kozin as a team leader coordinated work with github repository and wrote functions: length, brutto_count, check_amino_acid, name_transform.
-Dasha Sokolova wrote functions: get_amino_acid_sum and codon_optimization functions.
-Yulia Volkova wrote main function (protein_analysis), molecular_weight and one_letter_to_three functions.
+`Ivan Kozin` (team leader) worte functions: +- length +- brutto_count +- check_amino_acid +- name_transform +- mamaged work with guthub repository + +`Dasha Sokolova` wrote functions: +- get_amino_acid_sum +- codon_optimization functions + +`Yulia Volkova` wrote functions: +- main (protein_analysis) +- molecular_weight +- one_letter_to_three functions + Writting README, debugging code and testing it has been done by the efforts of all team. From 25f5dd79728fc53319bde738153937545e4149f0 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:50:16 +0200 Subject: [PATCH 53/88] Update README.md correct misspelling --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d262558..3908e87 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Team photo: - brutto_count - check_amino_acid - name_transform -- mamaged work with guthub repository +- managed work with guthub repository `Dasha Sokolova` wrote functions: - get_amino_acid_sum From ba34ff71f44ab67cce5655fcb54875bf17706ccf Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:08:32 +0300 Subject: [PATCH 54/88] Update protein_analysis_tool.py Made default format 1. Add check for possible incorrect format specification. 
Add functions check_amino_acid_three_letter, check_length --- protein_analysis_tool.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index af7cf0c..59ef37a 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -65,7 +65,7 @@ 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', 'W': 'TGG', 'Y': 'TAC'} -def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format:int) -> list: +def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format:int = 1) -> list: """ Function protein_analysis: - calculates predicted molecular weight of amino acid sequences in kDa (procedure name: molecular_weight) @@ -201,14 +201,20 @@ def length(seqs:tuple): def name_transform(seqs:tuple, format:int): result = [] + multiple_of_three = [] + test_three_letters = [] if format == 1: for seq in seqs: + multiple_of_three.append(check_length(seq)) + test_three_letters.append(check_amino_acid_three_letter(seq)) seq = seq.upper() for letter in seq: if check_amino_acid(letter): pass else: return False result.append(seq) + if all(multiple_of_three) and all(test_three_letters): + print('Note: all your sequences are similar to three-letter ones. 
Check the format value') return result elif format == 3: for seq in seqs: @@ -251,4 +257,19 @@ def brutto_count(seqs:tuple): brutto_dict = dict(zip(elements, brutto)) result.append(brutto_dict) return result - + + +def check_length(seq): + seq_len = len(seq) + if seq_len % 3 == 0: + return True + else: return False + + +def check_amino_acid_three_letter(seq): + seq = seq.lower() + seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] + for triplet in seq3: + if triplet not in amino_names_dic.keys(): + return False + else: return True From eefb16ddc9bccaceaf28b51b593cfaf0ecd3bb1f Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:13:29 +0300 Subject: [PATCH 55/88] Update README.md Add two func into Ivan func list --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3908e87..4be8d4a 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,8 @@ Optional argument: - `one_letter_to_three` — will return list of strings, containing the same sequences written in three-letter code - `get_amino_acid_sum` — сounts the amount of each amino acid in the injected protein sequences - `codon_optimization` — makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse -- `length` - calculates length of amino acid sequences -- `brutto_count` +- `length` — calculates length of amino acid sequences +- `brutto_count` — counts the number of atoms of each type in a sequence ## Example of use: @@ -44,7 +44,7 @@ protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.col ## Possible errors: ```python -> `ValueError`('Requested procedure is not defined') # Will occure if proc argument does not correspond to any listed procedure (see List of procedures). 
\ +> `ValueError`('Requested procedure is not defined') # Will occure if proc argument does not correspond to any listed procedure (see List of procedures). > `ValueError`('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse) # Will occure if the cell type is incorrectly entered to optimize codons. ``` @@ -70,6 +70,8 @@ Team photo: - brutto_count - check_amino_acid - name_transform +- check_amino_acid_three_letter +- check_length - managed work with guthub repository `Dasha Sokolova` wrote functions: From 8725c9828bf3b8d058aafbad481f03f8f3519da9 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:30:10 +0300 Subject: [PATCH 56/88] Update README.md Delete unnecessary "```" --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4be8d4a..f6070f5 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['Ala protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, # {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] -protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1)``` # ['GCGTGCGAT', 'GCGGAT'] +protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1) # ['GCGTGCGAT', 'GCGGAT'] ``` From bf2b34ac845523905df5a2232d237b79642f4dab Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 09:46:08 +0200 Subject: [PATCH 57/88] 
Update protein_analysis_tool.py Change arguments and outputs types --- protein_analysis_tool.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 59ef37a..6600489 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -65,7 +65,7 @@ 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', 'W': 'TGG', 'Y': 'TAC'} -def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format:int = 1) -> list: +def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:int = 1) -> list: """ Function protein_analysis: - calculates predicted molecular weight of amino acid sequences in kDa (procedure name: molecular_weight) @@ -76,7 +76,7 @@ def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format: - counts the number of atoms of each type in a sequence (procedure name: brutto_count) Arguments: - - tuple of protein sequences written one letter or three letter code (not mixed) + - one or multiple string of protein sequences written one letter or three letter code (not mixed) (see examples of use) - name of procedure as string - cell type (required only for codon_optimization procedure) - format of code for the protein sequences as int: 1 for one letter, 3 for three letter code @@ -106,7 +106,7 @@ def protein_analysis(*args: tuple, procedure: str, cell_type:str = None, format: return procedures.get(procedure)(aa_seqs) -def molecular_weight(aa_seqs: tuple) -> list: +def molecular_weight(aa_seqs: list) -> list: """Calculates predicated molecular weight of aa sequences. 
Returns list of floats""" molecular_weights = [] @@ -119,7 +119,7 @@ def molecular_weight(aa_seqs: tuple) -> list: return molecular_weights -def one_letter_to_three(aa_seqs: tuple) -> list: +def one_letter_to_three(aa_seqs: list) -> list: """Translates one letter coded aa sequences to three letter coded""" three_letters_seqs = [] @@ -132,7 +132,7 @@ def one_letter_to_three(aa_seqs: tuple) -> list: return three_letters_seqs -def get_amino_acid_sum(protein_sequences: tuple) -> list: +def get_amino_acid_sum(protein_sequences: list) -> list: """ Counts the amount of each amino acid in the injected protein sequences @@ -157,7 +157,7 @@ def get_amino_acid_sum(protein_sequences: tuple) -> list: return result -def codon_optimization(protein_sequences: tuple, cell_type:str) -> list: +def codon_optimization(protein_sequences: list, cell_type:str) -> list: """ Makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse @@ -194,12 +194,12 @@ def codon_optimization(protein_sequences: tuple, cell_type:str) -> list: raise ValueError('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' 'Mouse') -def length(seqs:tuple): +def length(seqs:list): result = [len(seq) for seq in seqs] return result -def name_transform(seqs:tuple, format:int): +def name_transform(seqs:list, format:int) -> list: result = [] multiple_of_three = [] test_three_letters = [] @@ -232,7 +232,7 @@ def name_transform(seqs:tuple, format:int): raise ValueError('Error unsupported format. Only formats 1 and 3 are supported') -def check_amino_acid(input_amino:str): +def check_amino_acid(input_amino:str) -> bool: if len(input_amino) == 1: letter = input_amino if letter not in amino_short_names_dic.keys(): @@ -247,7 +247,7 @@ def check_amino_acid(input_amino:str): raise ValueError(f'Error {input_amino} is incorrect form of amino acid notation. 
Correct your input') -def brutto_count(seqs:tuple): +def brutto_count(seqs: list) -> list: elements = ['C', 'H', 'N', 'O', 'S'] result = [] for seq in seqs: @@ -259,14 +259,14 @@ def brutto_count(seqs:tuple): return result -def check_length(seq): +def check_length(seq: str) -> bool: seq_len = len(seq) if seq_len % 3 == 0: return True else: return False -def check_amino_acid_three_letter(seq): +def check_amino_acid_three_letter(seq: str) -> bool: seq = seq.lower() seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] for triplet in seq3: From c461f2f40ad2ddd7d33de96b45412d6857f16773 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:49:25 +0300 Subject: [PATCH 58/88] Update README.md Update "how to use" --- README.md | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f6070f5..1dcdd4b 100644 --- a/README.md +++ b/README.md @@ -12,14 +12,25 @@ This project consists of one function "protein_analysis" that helps user to: python ## How to use: - +**protein_analysis** +protein_analysis(**args, procedure, cell_type=None, format=1*) +**Parametrs:** + ***args** : +Any number of lines with amino acid sequences + **procedure** : ***str*** + описание процедур + **cell_type** : ***str, defalut None*** + описание клеток + **format** : ***int, defalut 1*** + описание форматов + Call the "protein_analysis" funcion with following arguments. Requred arguments: - - tuple of protein sequences written one letter or three letter code without stop codos. Please do not use sequences in different formats in the same function call! - - name of procedure as string (see list of precedures) - - format of code for the protein sequences as int: 1 for one letter, 3 for three letter code +- tuple of protein sequences written one letter or three letter code without stop codos. Please do not use sequences in different formats in the same function call! 
+- name of procedure as string (see list of precedures) +- format of code for the protein sequences as int: 1 for one letter, 3 for three letter code Optional argument: - - cell type (required only for codon_optimization procedure). Accepted cell types Esherichia coli, Pichia pastoris, Mouse +- cell type (required only for codon_optimization procedure). Accepted cell types Esherichia coli, Pichia pastoris, Mouse ## List of procedures: From 9475b6eba6647986d1fc5b668f999aa01e5078d3 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:50:27 +0300 Subject: [PATCH 59/88] Update README.md Newlines debug --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1dcdd4b..2af6c8b 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,13 @@ python **protein_analysis** protein_analysis(**args, procedure, cell_type=None, format=1*) **Parametrs:** - ***args** : + ***args** :
Any number of lines with amino acid sequences - **procedure** : ***str*** + **procedure** : ***str***
описание процедур - **cell_type** : ***str, defalut None*** + **cell_type** : ***str, defalut None***
описание клеток - **format** : ***int, defalut 1*** + **format** : ***int, defalut 1***
описание форматов Call the "protein_analysis" funcion with following arguments. From 69bb4cde2d6a59d2aa55049cd30f36d610642884 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 09:52:21 +0200 Subject: [PATCH 60/88] Update protein_analysis_tool.py Change argument in name_transform --- protein_analysis_tool.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 6600489..544c92c 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -71,7 +71,8 @@ def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:in - calculates predicted molecular weight of amino acid sequences in kDa (procedure name: molecular_weight) - translate aa sequences from one-letter to three-letter code (procedure name: one_letter_to_three) - calculates total amount of each amino acid in the sequences (procedure name: get_amino_acid_sum) - - makes DNA based codon optimization for the introduced amino acid sequences, support 3 types of cells: Esherichia coli, Pichia pastoris, Mouse (procedure name: codon_optimization) + - makes DNA based codon optimization for the introduced amino acid sequences, support 3 types of cells: + Esherichia coli, Pichia pastoris, Mouse (procedure name: codon_optimization) - calculates length of amino acid sequences (procedure name: length) - counts the number of atoms of each type in a sequence (procedure name: brutto_count) @@ -199,7 +200,7 @@ def length(seqs:list): return result -def name_transform(seqs:list, format:int) -> list: +def name_transform(seqs:tuple, format:int) -> list: result = [] multiple_of_three = [] test_three_letters = [] From 725a9b1a9c0b987e37fd6ef7ef7cb3be7ee87a23 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 09:54:05 +0200 Subject: [PATCH 61/88] Update protein_analysis_tool.py Make strings in docstring shorter --- 
protein_analysis_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 544c92c..5f59519 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -77,7 +77,7 @@ def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:in - counts the number of atoms of each type in a sequence (procedure name: brutto_count) Arguments: - - one or multiple string of protein sequences written one letter or three letter code (not mixed) (see examples of use) + - one or multiple string of protein sequences written one letter or three letter code (not mixed) - name of procedure as string - cell type (required only for codon_optimization procedure) - format of code for the protein sequences as int: 1 for one letter, 3 for three letter code From 4dc0f07b76dc904b107355aeef208414ee38b199 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:01:28 +0200 Subject: [PATCH 62/88] Update README.md Add info about brutto count to general information --- README.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 2af6c8b..e5627fd 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,8 @@ This project consists of one function "protein_analysis" that helps user to: - translate aa sequences from one-letter to three-letter code - calculate total amount of each amino acid in the sequences - make DNA based codon optimization for the introduced amino acid sequences with the support for 3 cell types: Esherichia coli, Pichia pastoris, Mouse -- calculate length of amino acid sequences +- calculate length of amino acid sequences +- count the number of atoms of each type in a sequence (brutto formula) ## Technology: @@ -13,16 +14,16 @@ python ## How to use: **protein_analysis** -protein_analysis(**args, procedure, cell_type=None, format=1*) -**Parametrs:** - ***args** :
-Any number of lines with amino acid sequences - **procedure** : ***str***
- описание процедур - **cell_type** : ***str, defalut None***
- описание клеток - **format** : ***int, defalut 1***
- описание форматов +protein_analysis(**args, procedure, cell_type=None, format=1*)
+**Parametrs:**
+ ***args** : +Any number of lines with amino acid sequences
+ **procedure** : ***str*** + описание процедур
+ **cell_type** : ***str, defalut None*** + описание клеток
+ **format** : ***int, defalut 1*** + описание форматов
Call the "protein_analysis" funcion with following arguments. Requred arguments: From bd1141bc93b19a4d2b79061178a8e0228a37bc90 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:01:29 +0300 Subject: [PATCH 63/88] Update protein_analysis_tool.py Delete unnecessary returns from name_transform function --- protein_analysis_tool.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 5f59519..abc678d 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -195,6 +195,7 @@ def codon_optimization(protein_sequences: list, cell_type:str) -> list: raise ValueError('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' 'Mouse') + def length(seqs:list): result = [len(seq) for seq in seqs] return result @@ -212,7 +213,6 @@ def name_transform(seqs:tuple, format:int) -> list: for letter in seq: if check_amino_acid(letter): pass - else: return False result.append(seq) if all(multiple_of_three) and all(test_three_letters): print('Note: all your sequences are similar to three-letter ones. Check the format value') @@ -224,11 +224,9 @@ def name_transform(seqs:tuple, format:int) -> list: for triplet in seq3: if check_amino_acid(triplet): pass - else: return False seq_transformed = "".join([amino_names_dic.get(seq) for seq in seq3]) result.append(seq_transformed) return result - else: raise ValueError('Error unsupported format. 
Only formats 1 and 3 are supported') From a15b33167d0e72c3a2dff08e2e286e544961d9d8 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:11:30 +0300 Subject: [PATCH 64/88] Update README.md Newlines debug --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e5627fd..c0e98ef 100644 --- a/README.md +++ b/README.md @@ -16,13 +16,13 @@ python **protein_analysis** protein_analysis(**args, procedure, cell_type=None, format=1*)
**Parametrs:**
- ***args** : + ***args** :
Any number of lines with amino acid sequences
- **procedure** : ***str*** + **procedure** : ***str***
описание процедур
- **cell_type** : ***str, defalut None*** + **cell_type** : ***str, defalut None***
описание клеток
- **format** : ***int, defalut 1*** + **format** : ***int, defalut 1***
описание форматов
Call the "protein_analysis" funcion with following arguments. From 946897927c0154915a51850806f9f4eb7e8d437c Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:35:03 +0200 Subject: [PATCH 65/88] Update protein_analysis_tool.py Add docstring to some functions --- protein_analysis_tool.py | 74 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index abc678d..734e5d7 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -88,6 +88,7 @@ def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:in - get_amino_acid_sum procedure returns list of dictionaries - codon_optimization procedure returns list of strings - length procedure returns list of int values + - brutto_count procedure returns list of dictionaries with counts of atoms in the sequence """ aa_seqs = args @@ -108,7 +109,13 @@ def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:in def molecular_weight(aa_seqs: list) -> list: - """Calculates predicated molecular weight of aa sequences. Returns list of floats""" + """Calculates predicated molecular weight of aa sequences. 
+ + Arguments: + - aa_seqs (list): list of string with the protein sequences + + Return: + - List of of floats corresponding to the molecular weight in kDa""" molecular_weights = [] for seq in aa_seqs: @@ -121,7 +128,13 @@ def molecular_weight(aa_seqs: list) -> list: def one_letter_to_three(aa_seqs: list) -> list: - """Translates one letter coded aa sequences to three letter coded""" + """Translates one-letter coded amino acid sequences to three-letter coded + Arguments: + - aa_seqs (list): list of string with the protein sequences + + Return: + - List of of strings with three-letter coded sequences + """ three_letters_seqs = [] for seq in aa_seqs: @@ -196,12 +209,31 @@ def codon_optimization(protein_sequences: list, cell_type:str) -> list: 'Mouse') -def length(seqs:list): +def length(seqs:list) -> list: + """ + Counts total length of amino acid sequence. + + Arguments: + - seqs (list): list of string with the protein sequences + + Return: + - list of int values corresponding to the length of sequences + """ result = [len(seq) for seq in seqs] return result def name_transform(seqs:tuple, format:int) -> list: + """ + Transforms the amino acid sequences given to protein_analysis function from three-letter code to one-letter code, + makes sequences unified (for one-letter format all letters to upper and for three-letter format all letters to lower). + + Arguments: + - seqs (tuple): tuple of string with the protein sequences + + Return: + - list of strings with the transformed sequences + """ result = [] multiple_of_three = [] test_three_letters = [] @@ -232,6 +264,15 @@ def name_transform(seqs:tuple, format:int) -> list: def check_amino_acid(input_amino:str) -> bool: + """ + Checks whether the entered string is an amino acid (either three-letter encoding or one-letter encoded). 
+ + Arguments: + - input_amino (str): string corresponding to one amino acid (in three-letter code or one-letter code) + + Return: + - bool: True if amino acid is a valid amino acid, otherwise ValueError is amino acid is not correct + """ if len(input_amino) == 1: letter = input_amino if letter not in amino_short_names_dic.keys(): @@ -247,6 +288,15 @@ def check_amino_acid(input_amino:str) -> bool: def brutto_count(seqs: list) -> list: + """ + Calculates the brutto formula of the amino acid sequences. + + Arguments: + - seqs (list): list of string with the protein sequences + + Return: + - list of dictionaries with counts of each elemet included (elements C,H,N,O,S) + """ elements = ['C', 'H', 'N', 'O', 'S'] result = [] for seq in seqs: @@ -259,6 +309,15 @@ def brutto_count(seqs: list) -> list: def check_length(seq: str) -> bool: + """ + Checks if the sequence is divisible by three. + + Arguments: + - seq (str): string of protein sequence + + Return: + - bool: True if sequence is divisible by three, otherwise False + """ seq_len = len(seq) if seq_len % 3 == 0: return True @@ -266,6 +325,15 @@ def check_length(seq: str) -> bool: def check_amino_acid_three_letter(seq: str) -> bool: + """ + Checks whether all elements of a sequence are three-letter amino acid symbols. 
+ + Arguments: + - seq (str): string of protein sequence + + Return: + - bool: True if sequence is corresponding to the valid three-letter amino acid, otherwise False + """ seq = seq.lower() seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] for triplet in seq3: From 9e246b2f3d084d0d7c9efdc41980ab6a1c225640 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:40:05 +0200 Subject: [PATCH 66/88] Update protein_analysis_tool.py Fix indent issue --- protein_analysis_tool.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 734e5d7..4e14705 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -325,7 +325,7 @@ def check_length(seq: str) -> bool: def check_amino_acid_three_letter(seq: str) -> bool: - """ + """ Checks whether all elements of a sequence are three-letter amino acid symbols. Arguments: @@ -334,9 +334,9 @@ def check_amino_acid_three_letter(seq: str) -> bool: Return: - bool: True if sequence is corresponding to the valid three-letter amino acid, otherwise False """ - seq = seq.lower() - seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] - for triplet in seq3: - if triplet not in amino_names_dic.keys(): - return False - else: return True + seq = seq.lower() + seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] + for triplet in seq3: + if triplet not in amino_names_dic.keys(): + return False + else: return True From c206d9a7bcfc4db6fb6bb7aa0c3587d3ff4d20b7 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:45:17 +0300 Subject: [PATCH 67/88] Update protein_analysis_tool.py Tabs debug --- protein_analysis_tool.py | 103 ++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 4e14705..9b29cdc 100644 --- a/protein_analysis_tool.py +++ 
b/protein_analysis_tool.py @@ -66,7 +66,7 @@ 'W': 'TGG', 'Y': 'TAC'} def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:int = 1) -> list: - """ + """ Function protein_analysis: - calculates predicted molecular weight of amino acid sequences in kDa (procedure name: molecular_weight) - translate aa sequences from one-letter to three-letter code (procedure name: one_letter_to_three) @@ -89,7 +89,7 @@ def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:in - codon_optimization procedure returns list of strings - length procedure returns list of int values - brutto_count procedure returns list of dictionaries with counts of atoms in the sequence - """ + """ aa_seqs = args aa_seqs = name_transform(aa_seqs, format) @@ -109,13 +109,15 @@ def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:in def molecular_weight(aa_seqs: list) -> list: - """Calculates predicated molecular weight of aa sequences. + """ + Calculates predicated molecular weight of aa sequences. 
Arguments: - aa_seqs (list): list of string with the protein sequences Return: - - List of of floats corresponding to the molecular weight in kDa""" + - List of of floats corresponding to the molecular weight in kDa + """ molecular_weights = [] for seq in aa_seqs: @@ -128,13 +130,14 @@ def molecular_weight(aa_seqs: list) -> list: def one_letter_to_three(aa_seqs: list) -> list: - """Translates one-letter coded amino acid sequences to three-letter coded + """ + Translates one-letter coded amino acid sequences to three-letter coded Arguments: - aa_seqs (list): list of string with the protein sequences Return: - List of of strings with three-letter coded sequences - """ + """ three_letters_seqs = [] for seq in aa_seqs: @@ -147,7 +150,7 @@ def one_letter_to_three(aa_seqs: list) -> list: def get_amino_acid_sum(protein_sequences: list) -> list: - """ + """ Counts the amount of each amino acid in the injected protein sequences Arguments: @@ -155,7 +158,7 @@ def get_amino_acid_sum(protein_sequences: list) -> list: Return: - List of dictionary with amino acid amount - """ + """ result = [] for protein_sequence in range(len(protein_sequences)): amino_acid_count = {'A': 0, 'C': 0, 'D': 0, @@ -172,7 +175,7 @@ def get_amino_acid_sum(protein_sequences: list) -> list: def codon_optimization(protein_sequences: list, cell_type:str) -> list: - """ + """ Makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse @@ -182,7 +185,7 @@ def codon_optimization(protein_sequences: list, cell_type:str) -> list: Return: - List of codon-optimized DNA - """ + """ if cell_type == 'Esherichia coli' or cell_type == 'E.coli': codon_optimization_ecoli = [] @@ -210,7 +213,7 @@ def codon_optimization(protein_sequences: list, cell_type:str) -> list: def length(seqs:list) -> list: - """ + """ Counts total length of amino acid sequence. 
Arguments: @@ -218,21 +221,21 @@ def length(seqs:list) -> list: Return: - list of int values corresponding to the length of sequences - """ + """ result = [len(seq) for seq in seqs] return result def name_transform(seqs:tuple, format:int) -> list: """ - Transforms the amino acid sequences given to protein_analysis function from three-letter code to one-letter code, - makes sequences unified (for one-letter format all letters to upper and for three-letter format all letters to lower). - - Arguments: - - seqs (tuple): tuple of string with the protein sequences - - Return: - - list of strings with the transformed sequences + Transforms the amino acid sequences given to protein_analysis function from three-letter code to one-letter code, + makes sequences unified (for one-letter format all letters to upper and for three-letter format all letters to lower). + + Arguments: + - seqs (tuple): tuple of string with the protein sequences + + Return: + - list of strings with the transformed sequences """ result = [] multiple_of_three = [] @@ -265,13 +268,13 @@ def name_transform(seqs:tuple, format:int) -> list: def check_amino_acid(input_amino:str) -> bool: """ - Checks whether the entered string is an amino acid (either three-letter encoding or one-letter encoded). - - Arguments: - - input_amino (str): string corresponding to one amino acid (in three-letter code or one-letter code) - - Return: - - bool: True if amino acid is a valid amino acid, otherwise ValueError is amino acid is not correct + Checks whether the entered string is an amino acid (either three-letter encoding or one-letter encoded). 
+ + Arguments: + - input_amino (str): string corresponding to one amino acid (in three-letter code or one-letter code) + + Return: + - bool: True if amino acid is a valid amino acid, otherwise ValueError is amino acid is not correct """ if len(input_amino) == 1: letter = input_amino @@ -289,13 +292,13 @@ def check_amino_acid(input_amino:str) -> bool: def brutto_count(seqs: list) -> list: """ - Calculates the brutto formula of the amino acid sequences. - - Arguments: - - seqs (list): list of string with the protein sequences - - Return: - - list of dictionaries with counts of each elemet included (elements C,H,N,O,S) + Calculates the brutto formula of the amino acid sequences. + + Arguments: + - seqs (list): list of string with the protein sequences + + Return: + - list of dictionaries with counts of each elemet included (elements C,H,N,O,S) """ elements = ['C', 'H', 'N', 'O', 'S'] result = [] @@ -309,14 +312,14 @@ def brutto_count(seqs: list) -> list: def check_length(seq: str) -> bool: - """ - Checks if the sequence is divisible by three. - - Arguments: - - seq (str): string of protein sequence - - Return: - - bool: True if sequence is divisible by three, otherwise False + """ + Checks if the sequence is divisible by three. + + Arguments: + - seq (str): string of protein sequence + + Return: + - bool: True if sequence is divisible by three, otherwise False """ seq_len = len(seq) if seq_len % 3 == 0: @@ -325,14 +328,14 @@ def check_length(seq: str) -> bool: def check_amino_acid_three_letter(seq: str) -> bool: - """ - Checks whether all elements of a sequence are three-letter amino acid symbols. - - Arguments: - - seq (str): string of protein sequence - - Return: - - bool: True if sequence is corresponding to the valid three-letter amino acid, otherwise False + """ + Checks whether all elements of a sequence are three-letter amino acid symbols. 
+ + Arguments: + - seq (str): string of protein sequence + + Return: + - bool: True if sequence is corresponding to the valid three-letter amino acid, otherwise False """ seq = seq.lower() seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] From 4276a8b31a2817d731491f9121bb61f8645bb992 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:49:06 +0200 Subject: [PATCH 68/88] Update protein_analysis_tool.py Change name replacer_Mouse to replacer_mouse --- protein_analysis_tool.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 9b29cdc..5c4171b 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -203,9 +203,9 @@ def codon_optimization(protein_sequences: list, cell_type:str) -> list: if cell_type == 'Mouse' or cell_type == 'mouse': codon_optimization_mouse = [] - replacer_Mouse = mouse_triplets.get + replacer_mouse = mouse_triplets.get for amino_acid in range(len(protein_sequences)): - codon_optimization_mouse += [''.join([replacer_Mouse(n, n) for n in protein_sequences[amino_acid]])] + codon_optimization_mouse += [''.join([replacer_mouse(n, n) for n in protein_sequences[amino_acid]])] return codon_optimization_mouse else: raise ValueError('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' From 0bd3ae46868e1e6e6b3e097b565b689cb70e722a Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:53:32 +0200 Subject: [PATCH 69/88] Update protein_analysis_tool.py Renamed format to letter_format --- protein_analysis_tool.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 5c4171b..7ec1465 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -65,7 +65,7 @@ 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', 'W': 'TGG', 'Y': 
'TAC'} -def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:int = 1) -> list: +def protein_analysis(*args: str, procedure: str, cell_type:str = None, letter_format:int = 1) -> list: """ Function protein_analysis: - calculates predicted molecular weight of amino acid sequences in kDa (procedure name: molecular_weight) @@ -80,7 +80,7 @@ def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:in - one or multiple string of protein sequences written one letter or three letter code (not mixed) - name of procedure as string - cell type (required only for codon_optimization procedure) - - format of code for the protein sequences as int: 1 for one letter, 3 for three letter code + - letter_format of code for the protein sequences as int: 1 for one letter, 3 for three letter code Return: - molecular_weight procedure returns list of floats @@ -92,7 +92,7 @@ def protein_analysis(*args: str, procedure: str, cell_type:str = None, format:in """ aa_seqs = args - aa_seqs = name_transform(aa_seqs, format) + aa_seqs = name_transform(aa_seqs, letter_format) procedures = {'molecular_weight': molecular_weight, 'one_letter_to_three': one_letter_to_three, 'get_amino_acid_sum': get_amino_acid_sum, @@ -226,10 +226,11 @@ def length(seqs:list) -> list: return result -def name_transform(seqs:tuple, format:int) -> list: +def name_transform(seqs:tuple, letter_format:int) -> list: """ Transforms the amino acid sequences given to protein_analysis function from three-letter code to one-letter code, - makes sequences unified (for one-letter format all letters to upper and for three-letter format all letters to lower). + makes sequences unified (for one-letter letter_format all letters to upper and + for three-letter letter_format to lower). 
Arguments: - seqs (tuple): tuple of string with the protein sequences @@ -240,7 +241,7 @@ def name_transform(seqs:tuple, format:int) -> list: result = [] multiple_of_three = [] test_three_letters = [] - if format == 1: + if letter_format == 1: for seq in seqs: multiple_of_three.append(check_length(seq)) test_three_letters.append(check_amino_acid_three_letter(seq)) @@ -250,9 +251,9 @@ def name_transform(seqs:tuple, format:int) -> list: pass result.append(seq) if all(multiple_of_three) and all(test_three_letters): - print('Note: all your sequences are similar to three-letter ones. Check the format value') + print('Note: all your sequences are similar to three-letter ones. Check the letter_format value') return result - elif format == 3: + elif letter_format == 3: for seq in seqs: seq = seq.lower() seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] @@ -263,7 +264,7 @@ def name_transform(seqs:tuple, format:int) -> list: result.append(seq_transformed) return result else: - raise ValueError('Error unsupported format. Only formats 1 and 3 are supported') + raise ValueError('Error unsupported letter_format. Only letter_formats 1 and 3 are supported') def check_amino_acid(input_amino:str) -> bool: From 7df8663bc6f98ce2a93590c2052006b37e629931 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:58:23 +0300 Subject: [PATCH 70/88] Update README.md Update "how to use" --- README.md | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index c0e98ef..8c8ab73 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,32 @@ This project consists of one function "protein_analysis" that helps user to: python ## How to use: -**protein_analysis** -protein_analysis(**args, procedure, cell_type=None, format=1*)
-**Parametrs:**
- ***args** :
-Any number of lines with amino acid sequences
+**protein_analysis**
+protein_analysis(**args, procedure, cell_type=None, letter_format=1*)
+**Parameters:** +> ***args** : **sequence of str**  
+>     Any number of lines with amino acid sequences
**procedure** : ***str***
- описание процедур
- **cell_type** : ***str, defalut None***
- описание клеток
- **format** : ***int, defalut 1***
- описание форматов
- +>     The name of the operation you want to perform. The following types of procedures are supported:
+>> +>> - ***molecular_weight***: calculates predicted molecular weight of amino acid sequences in kDa +>> - ***one_letter_to_three***: translate aa sequences from one-letter to three-letter code +>> - ***get_amino_acid_sum***: calculates total amount of each amino acid in the sequences +>> - ***codon_optimization***: makes DNA based codon optimization for the introduced amino acid sequences, support 3 types of cells. Can only be used in conjunction with **cell_type**: `Esherichia coli`, `Pichia pastoris`, `Mouse` +>> - ***length***: calculates length of amino acid sequences +>> - ***brutto_count***: counts the number of atoms of each type in a sequence +>> +> **cell_type** : ***str, defalut None***
+>     The type of cells for which optimization is applied. Cell types supported:
+>> +>> - `Esherichia coli` *or* `E.coli` +>> - `Pichia pastoris` *or* `P.pastoris` +>> - `Mouse` *or* `mouse` +>> +> **letter_format** : ***int, default 1***  
+>     Specifies the format for receiving amino acid sequences. Either one-letter (**letter_format** = 1) or three-letter sequences (**letter_format** = 3)
+> + Call the "protein_analysis" funcion with following arguments. Requred arguments: - tuple of protein sequences written one letter or three letter code without stop codos. Please do not use sequences in different formats in the same function call! From 936eed20f0104810ffd1ae155f098f3123363122 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:07:40 +0200 Subject: [PATCH 71/88] Update README.md Add examples of use --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8c8ab73..7939e03 100644 --- a/README.md +++ b/README.md @@ -59,11 +59,15 @@ Optional argument: ## Example of use: ```python -protein_analysis("ACD", "AD", procedure="one_letter_to_three", format=1) # ['AlaCysAsp', 'AlaAsp'] -protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", format=3) # [0.37, 0.22] -protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, +protein_analysis("ACD", "AD", procedure="one_letter_to_three", letter_format=1) # ['AlaCysAsp', 'AlaAsp'] +protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", letter_format=3) # [0.37, 0.22] +protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", letter_format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, # {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] -protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', format=1) # ['GCGTGCGAT', 'GCGGAT'] +protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', letter_format=1) # 
['GCGTGCGAT', 'GCGGAT'] +protein_analysis("acDEFGHIKLMNPQRSTVwy", "ad", procedure="length", letter_format=1) # [20, 2] +protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto_count", letter_format=1) # [{'C': 54, 'H': 103, 'N': 15, 'O': 22, 'S': 1}, + # {'C': 48, 'H': 83, 'N': 23, 'O': 18, 'S': 3}, + # {'C': 11, 'H': 22, 'N': 4, 'O': 9, 'S': 0}] ``` From 6b001d07a2fa6273136a3d3d44c9e426d2a8cc3c Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:19:45 +0200 Subject: [PATCH 72/88] Update README.md Add more erros --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7939e03..a76bb07 100644 --- a/README.md +++ b/README.md @@ -74,8 +74,12 @@ protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto_count", lett ## Possible errors: ```python -> `ValueError`('Requested procedure is not defined') # Will occure if proc argument does not correspond to any listed procedure (see List of procedures). -> `ValueError`('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse) # Will occure if the cell type is incorrectly entered to optimize codons. +> `ValueError`('Requested procedure is not defined') # Will occur if proc argument does not correspond to any listed procedure (see List of procedures). +> `ValueError`('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse) # Will occur if the cell type is incorrectly entered to optimize codons. +> `ValueError`('Error unsupported letter_format. Only letter_formats 1 and 3 are supported') # Will oocur if invalid format of input is given. Please check that you have all the sequences written in the same format (one-letter ir three-letter code). Case of input is not important and can be given in upper, lower or mixed case. +> `ValueError`(Error {letter} is not an amino acid. 
Correct your input') # Will occur if at least one of the amino acids given in not valid one-letter amino acid (letter_format=1). +> `ValueError`(Error {triplet} is not an amino acid. Correct your input') # Will occur if at least one of the amino acids given in not valid three-letter amino acid (letter_format=3). +> `ValueError`(Error {input_amino} is incorrect form of amino acid notation. Correct your input') # Will occur if input amino acid sequences are not given in correct format. ``` ## Private policy and contacts From 3222f3b261b37ce8edf8aba71b5f943356e2f17a Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:36:52 +0300 Subject: [PATCH 73/88] Update protein_analysis_tool.py Code design --- protein_analysis_tool.py | 500 +++++++++++++++++++++++++-------------- 1 file changed, 324 insertions(+), 176 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 7ec1465..0e90029 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -1,85 +1,205 @@ -amino_short_names_dic = {'A':'Ala', 'R':'Arg', 'N':'Asn', - 'D':'Asp', 'V':'Val', 'H':'His', - 'G':'Gly', 'Q':'Gln', 'E':'Glu', - 'I':'Ile', 'L':'Leu', 'K':'Lys', - 'M':'Met', 'P':'Pro', 'S':'Ser', - 'Y':'Tyr', 'T':'Thr', 'W':'Trp', - 'F':'Phe', 'C':'Cys'} - -amino_names_dic = {'ala': 'A', 'arg': 'R', 'asn': 'N', - 'asp': 'D', 'val': 'V', 'his': 'H', - 'gly': 'G', 'gln': 'Q', 'glu': 'E', - 'ile': 'I', 'leu': 'L', 'lys': 'K', - 'met': 'M', 'pro': 'P', 'ser': 'S', - 'tyr': 'Y', 'thr': 'T', 'trp': 'W', - 'phe': 'F', 'cys': 'C'} - -amino_names_dic_reverse = {'Ala': 'A', 'Arg': 'R', 'Asn': 'N', - 'Asp': 'D', 'Val': 'V', 'His': 'H', - 'Gly': 'G', 'Gln': 'Q', 'Glu': 'E', - 'Ile': 'I', 'Leu': 'L', 'Lys': 'K', - 'Met': 'M', 'Pro': 'P', 'Ser': 'S', - 'Tyr': 'Y', 'Thr': 'T', 'Trp': 'W', - 'Phe': 'F', 'Cys': 'C'} - -aa_weights = {'A': 89.09, 'R': 174.20, 'N': 132.12, - 'D': 133.10, 'C': 121.16, 'E': 147.13, - 'Q': 146.15, 'G': 
75.07, 'H': 155.16, - 'I': 131.18, 'L': 131.18, 'K': 146.19, - 'M': 149.21, 'F': 165.19, 'P': 115.13, - 'S': 105.09, 'T': 119.12, 'W': 204.23, - 'Y': 181.19, 'V': 117.15} - -amino_brutto = {'A':(3,7,1,2,0), 'R':(6,14,4,2,0), - 'N':(4,8,2,3,0), 'D':(4,7,1,4,0), - 'V':(5,11,1,2,0), 'H':(6,9,3,2,0), - 'G':(2,5,1,2,0), 'Q':(5,10,2,3,0), - 'E':(5,9,1,4,0), 'I':(6,13,1,2,0), - 'L':(6,13,1,2,0), 'K':(6,14,2,2,0), - 'M':(5,11,1,2,1), 'P':(5,9,1,2,0), - 'S':(3,7,1,3,0), 'Y':(9,11,1,3,0), - 'T':(4,9,11,1,3,0), 'W':(11,12,2,2,0), - 'F':(9,11,1,2,0), 'C':(3,7,1,2,1)} - -ecoli_triplets = {'A': 'GCG', 'C': 'TGC', 'D': 'GAT', - 'E': 'GAA', 'F': 'TTT', 'G': 'GGC', - 'H': 'CAT', 'I': 'ATT', 'K': 'AAA', - 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', - 'P': 'CCG', 'Q': 'CAG', 'R': 'CGT', - 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', - 'W': 'TGG', 'Y': 'TAT'} - -ppastoris_triplets = {'A': 'GCT', 'C': 'TGT', 'D': 'GAT', - 'E': 'GAA', 'F': 'TTT', 'G': 'GGT', - 'H': 'CAT', 'I': 'ATT', 'K': 'AAG', - 'L': 'TTG', 'M': 'ATG', 'N': 'AAC', - 'P': 'CCA', 'Q': 'CAA', 'R': 'AGA', - 'S': 'TCT', 'T': 'ACT', 'V': 'GTT', - 'W': 'TGG', 'Y': 'TAC'} - -mouse_triplets = {'A': 'GCC', 'C': 'TGC', 'D': 'GAC', - 'E': 'GAG', 'F': 'TTC', 'G': 'GGC', - 'H': 'CAC', 'I': 'ATC', 'K': 'AAG', - 'L': 'CTG', 'M': 'ATG', 'N': 'AAC', - 'P': 'CCC', 'Q': 'CAG', 'R': 'CGG', - 'S': 'AGC', 'T': 'ACC', 'V': 'GTG', - 'W': 'TGG', 'Y': 'TAC'} - -def protein_analysis(*args: str, procedure: str, cell_type:str = None, letter_format:int = 1) -> list: - """ +amino_short_names_dic = { + "A": "Ala", + "R": "Arg", + "N": "Asn", + "D": "Asp", + "V": "Val", + "H": "His", + "G": "Gly", + "Q": "Gln", + "E": "Glu", + "I": "Ile", + "L": "Leu", + "K": "Lys", + "M": "Met", + "P": "Pro", + "S": "Ser", + "Y": "Tyr", + "T": "Thr", + "W": "Trp", + "F": "Phe", + "C": "Cys", +} + +amino_names_dic = { + "ala": "A", + "arg": "R", + "asn": "N", + "asp": "D", + "val": "V", + "his": "H", + "gly": "G", + "gln": "Q", + "glu": "E", + "ile": "I", + "leu": "L", + "lys": "K", + 
"met": "M", + "pro": "P", + "ser": "S", + "tyr": "Y", + "thr": "T", + "trp": "W", + "phe": "F", + "cys": "C", +} + +amino_names_dic_reverse = { + "Ala": "A", + "Arg": "R", + "Asn": "N", + "Asp": "D", + "Val": "V", + "His": "H", + "Gly": "G", + "Gln": "Q", + "Glu": "E", + "Ile": "I", + "Leu": "L", + "Lys": "K", + "Met": "M", + "Pro": "P", + "Ser": "S", + "Tyr": "Y", + "Thr": "T", + "Trp": "W", + "Phe": "F", + "Cys": "C", +} + +aa_weights = { + "A": 89.09, + "R": 174.20, + "N": 132.12, + "D": 133.10, + "C": 121.16, + "E": 147.13, + "Q": 146.15, + "G": 75.07, + "H": 155.16, + "I": 131.18, + "L": 131.18, + "K": 146.19, + "M": 149.21, + "F": 165.19, + "P": 115.13, + "S": 105.09, + "T": 119.12, + "W": 204.23, + "Y": 181.19, + "V": 117.15, +} + +amino_brutto = { + "A": (3, 7, 1, 2, 0), + "R": (6, 14, 4, 2, 0), + "N": (4, 8, 2, 3, 0), + "D": (4, 7, 1, 4, 0), + "V": (5, 11, 1, 2, 0), + "H": (6, 9, 3, 2, 0), + "G": (2, 5, 1, 2, 0), + "Q": (5, 10, 2, 3, 0), + "E": (5, 9, 1, 4, 0), + "I": (6, 13, 1, 2, 0), + "L": (6, 13, 1, 2, 0), + "K": (6, 14, 2, 2, 0), + "M": (5, 11, 1, 2, 1), + "P": (5, 9, 1, 2, 0), + "S": (3, 7, 1, 3, 0), + "Y": (9, 11, 1, 3, 0), + "T": (4, 9, 11, 1, 3, 0), + "W": (11, 12, 2, 2, 0), + "F": (9, 11, 1, 2, 0), + "C": (3, 7, 1, 2, 1), +} + +ecoli_triplets = { + "A": "GCG", + "C": "TGC", + "D": "GAT", + "E": "GAA", + "F": "TTT", + "G": "GGC", + "H": "CAT", + "I": "ATT", + "K": "AAA", + "L": "CTG", + "M": "ATG", + "N": "AAC", + "P": "CCG", + "Q": "CAG", + "R": "CGT", + "S": "AGC", + "T": "ACC", + "V": "GTG", + "W": "TGG", + "Y": "TAT", +} + +ppastoris_triplets = { + "A": "GCT", + "C": "TGT", + "D": "GAT", + "E": "GAA", + "F": "TTT", + "G": "GGT", + "H": "CAT", + "I": "ATT", + "K": "AAG", + "L": "TTG", + "M": "ATG", + "N": "AAC", + "P": "CCA", + "Q": "CAA", + "R": "AGA", + "S": "TCT", + "T": "ACT", + "V": "GTT", + "W": "TGG", + "Y": "TAC", +} + +mouse_triplets = { + "A": "GCC", + "C": "TGC", + "D": "GAC", + "E": "GAG", + "F": "TTC", + "G": "GGC", + "H": "CAC", + 
"I": "ATC", + "K": "AAG", + "L": "CTG", + "M": "ATG", + "N": "AAC", + "P": "CCC", + "Q": "CAG", + "R": "CGG", + "S": "AGC", + "T": "ACC", + "V": "GTG", + "W": "TGG", + "Y": "TAC", +} + + +def protein_analysis( + *args: str, procedure: str, cell_type: str = None, letter_format: int = 1 +) -> list: + """ Function protein_analysis: - calculates predicted molecular weight of amino acid sequences in kDa (procedure name: molecular_weight) - translate aa sequences from one-letter to three-letter code (procedure name: one_letter_to_three) - - calculates total amount of each amino acid in the sequences (procedure name: get_amino_acid_sum) - - makes DNA based codon optimization for the introduced amino acid sequences, support 3 types of cells: - Esherichia coli, Pichia pastoris, Mouse (procedure name: codon_optimization) + - calculates total amount of each amino acid in the sequences (procedure name: get_amino_acid_sum) + - makes DNA based codon optimization for the introduced amino acid sequences, support 3 types of cells: + Esherichia coli, Pichia pastoris, Mouse (procedure name: codon_optimization) - calculates length of amino acid sequences (procedure name: length) - counts the number of atoms of each type in a sequence (procedure name: brutto_count) - + Arguments: - one or multiple string of protein sequences written one letter or three letter code (not mixed) - name of procedure as string - - cell type (required only for codon_optimization procedure) + - cell type (required only for codon_optimization procedure) - letter_format of code for the protein sequences as int: 1 for one letter, 3 for three letter code Return: @@ -89,93 +209,104 @@ def protein_analysis(*args: str, procedure: str, cell_type:str = None, letter_fo - codon_optimization procedure returns list of strings - length procedure returns list of int values - brutto_count procedure returns list of dictionaries with counts of atoms in the sequence - """ - - aa_seqs = args - aa_seqs = name_transform(aa_seqs, 
letter_format) - procedures = {'molecular_weight': molecular_weight, - 'one_letter_to_three': one_letter_to_three, - 'get_amino_acid_sum': get_amino_acid_sum, - 'codon_optimization': codon_optimization, - 'length': length, - 'brutto_count': brutto_count} - + """ + aa_seqs = name_transform(args, letter_format) + procedures = { + "molecular_weight": molecular_weight, + "one_letter_to_three": one_letter_to_three, + "get_amino_acid_sum": get_amino_acid_sum, + "codon_optimization": codon_optimization, + "length": length, + "brutto_count": brutto_count, + } if procedure not in procedures.keys(): - raise ValueError('Requested procedure is not defined') - elif procedure == 'codon_optimization': + raise ValueError("Requested procedure is not defined") + elif procedure == "codon_optimization": return procedures.get(procedure)(aa_seqs, cell_type) - else: + else: return procedures.get(procedure)(aa_seqs) def molecular_weight(aa_seqs: list) -> list: - """ + """ Calculates predicated molecular weight of aa sequences. 
- + Arguments: - aa_seqs (list): list of string with the protein sequences - + Return: - List of of floats corresponding to the molecular weight in kDa - """ - + """ molecular_weights = [] for seq in aa_seqs: total_weight = 0 for aa in seq: aa = aa.upper() total_weight += aa_weights[aa] - molecular_weights.append(round(total_weight/1000, 2)) + molecular_weights.append(round(total_weight / 1000, 2)) return molecular_weights def one_letter_to_three(aa_seqs: list) -> list: - """ - Translates one-letter coded amino acid sequences to three-letter coded + """ + Translates one-letter coded amino acid sequences to three-letter coded Arguments: - aa_seqs (list): list of string with the protein sequences - + Return: - List of of strings with three-letter coded sequences - """ - + """ three_letters_seqs = [] for seq in aa_seqs: three_letters_seq = [] for aa in seq: aa = aa.upper() three_letters_seq.append(amino_short_names_dic[aa]) - three_letters_seqs.append(''.join(three_letters_seq)) + three_letters_seqs.append("".join(three_letters_seq)) return three_letters_seqs - + def get_amino_acid_sum(protein_sequences: list) -> list: - """ + """ Counts the amount of each amino acid in the injected protein sequences Arguments: - protein_sequences (list): list of injected protein sequence Return: - - List of dictionary with amino acid amount - """ + - List of dictionary with amino acid amount""" result = [] for protein_sequence in range(len(protein_sequences)): - amino_acid_count = {'A': 0, 'C': 0, 'D': 0, - 'E': 0, 'F': 0, 'G': 0, - 'H': 0, 'I': 0, 'K': 0, - 'L': 0, 'M': 0, 'N': 0, - 'P': 0, 'Q': 0, 'R': 0, - 'S': 0, 'T': 0, 'V': 0, - 'W': 0, 'Y': 0} + amino_acid_count = { + "A": 0, + "C": 0, + "D": 0, + "E": 0, + "F": 0, + "G": 0, + "H": 0, + "I": 0, + "K": 0, + "L": 0, + "M": 0, + "N": 0, + "P": 0, + "Q": 0, + "R": 0, + "S": 0, + "T": 0, + "V": 0, + "W": 0, + "Y": 0, + } for amino_acid in protein_sequences[protein_sequence]: amino_acid_count[amino_acid] += 1 
result.append(amino_acid_count) return result -def codon_optimization(protein_sequences: list, cell_type:str) -> list: - """ +def codon_optimization(protein_sequences: list, cell_type: str) -> list: + """ Makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse @@ -184,65 +315,72 @@ def codon_optimization(protein_sequences: list, cell_type:str) -> list: - cell_type (str): user-entered cell type for codon optimization Return: - - List of codon-optimized DNA - """ + - List of codon-optimized DNA""" - if cell_type == 'Esherichia coli' or cell_type == 'E.coli': + if cell_type == "Esherichia coli" or cell_type == "E.coli": codon_optimization_ecoli = [] replacer_ecoli = ecoli_triplets.get for amino_acid in range(len(protein_sequences)): - codon_optimization_ecoli += [''.join([replacer_ecoli(n, n) for n in protein_sequences[amino_acid]])] + codon_optimization_ecoli += [ + "".join([replacer_ecoli(n, n) for n in protein_sequences[amino_acid]]) + ] return codon_optimization_ecoli - if cell_type == 'Pichia pastoris' or cell_type == 'P.pastoris': + if cell_type == "Pichia pastoris" or cell_type == "P.pastoris": codon_optimization_ppastoris = [] replacer_ppastoris = ppastoris_triplets.get for amino_acid in range(len(protein_sequences)): - codon_optimization_ppastoris += [''.join([replacer_ppastoris(n, n) for n in protein_sequences[amino_acid]])] + codon_optimization_ppastoris += [ + "".join( + [replacer_ppastoris(n, n) for n in protein_sequences[amino_acid]] + ) + ] return codon_optimization_ppastoris - if cell_type == 'Mouse' or cell_type == 'mouse': + if cell_type == "Mouse" or cell_type == "mouse": codon_optimization_mouse = [] replacer_mouse = mouse_triplets.get for amino_acid in range(len(protein_sequences)): - codon_optimization_mouse += [''.join([replacer_mouse(n, n) for n in protein_sequences[amino_acid]])] + codon_optimization_mouse += [ + "".join([replacer_mouse(n, n) for n in 
protein_sequences[amino_acid]]) + ] return codon_optimization_mouse else: - raise ValueError('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris,' - 'Mouse') + raise ValueError( + "The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris," + "Mouse" + ) -def length(seqs:list) -> list: - """ +def length(seqs: list) -> list: + """ Counts total length of amino acid sequence. - + Arguments: - seqs (list): list of string with the protein sequences - + Return: - - list of int values corresponding to the length of sequences - """ + - list of int values corresponding to the length of sequences""" result = [len(seq) for seq in seqs] return result - -def name_transform(seqs:tuple, letter_format:int) -> list: - """ - Transforms the amino acid sequences given to protein_analysis function from three-letter code to one-letter code, - makes sequences unified (for one-letter letter_format all letters to upper and + +def name_transform(seqs: tuple, letter_format: int) -> list: + """ + Transforms the amino acid sequences given to protein_analysis function from three-letter code to one-letter code, + makes sequences unified (for one-letter letter_format all letters to upper and for three-letter letter_format to lower). - + Arguments: - seqs (tuple): tuple of string with the protein sequences - + Return: - - list of strings with the transformed sequences - """ + - list of strings with the transformed sequences""" result = [] multiple_of_three = [] test_three_letters = [] if letter_format == 1: - for seq in seqs: + for seq in seqs: multiple_of_three.append(check_length(seq)) test_three_letters.append(check_amino_acid_three_letter(seq)) seq = seq.upper() @@ -251,12 +389,14 @@ def name_transform(seqs:tuple, letter_format:int) -> list: pass result.append(seq) if all(multiple_of_three) and all(test_three_letters): - print('Note: all your sequences are similar to three-letter ones. 
Check the letter_format value') + print( + "Note: all your sequences are similar to three-letter ones. Check the letter_format value" + ) return result elif letter_format == 3: for seq in seqs: seq = seq.lower() - seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] + seq3 = [seq[i: i + 3] for i in range(0, len(seq), 3)] for triplet in seq3: if check_amino_acid(triplet): pass @@ -264,83 +404,91 @@ def name_transform(seqs:tuple, letter_format:int) -> list: result.append(seq_transformed) return result else: - raise ValueError('Error unsupported letter_format. Only letter_formats 1 and 3 are supported') + raise ValueError( + "Error unsupported letter_format. Only letter_formats 1 and 3 are supported" + ) -def check_amino_acid(input_amino:str) -> bool: - """ +def check_amino_acid(input_amino: str) -> bool: + """ Checks whether the entered string is an amino acid (either three-letter encoding or one-letter encoded). - + Arguments: - input_amino (str): string corresponding to one amino acid (in three-letter code or one-letter code) - + Return: - bool: True if amino acid is a valid amino acid, otherwise ValueError is amino acid is not correct - """ + """ if len(input_amino) == 1: letter = input_amino if letter not in amino_short_names_dic.keys(): - raise ValueError(f'Error {letter} is not an amino acid. Correct your input') - else: return True + raise ValueError(f"Error {letter} is not an amino acid. Correct your input") + else: + return True elif len(input_amino) == 3: triplet = input_amino if triplet not in amino_names_dic.keys(): - raise ValueError(f'Error {triplet} is not an amino acid. Correct your input') - else: return True + raise ValueError( + f"Error {triplet} is not an amino acid. Correct your input" + ) + else: + return True else: - raise ValueError(f'Error {input_amino} is incorrect form of amino acid notation. Correct your input') + raise ValueError( + f"Error {input_amino} is incorrect form of amino acid notation. 
Correct your input" + ) def brutto_count(seqs: list) -> list: - """ + """ Calculates the brutto formula of the amino acid sequences. - + Arguments: - seqs (list): list of string with the protein sequences - + Return: - - list of dictionaries with counts of each elemet included (elements C,H,N,O,S) - """ - elements = ['C', 'H', 'N', 'O', 'S'] + - list of dictionaries with counts of each elemet included (elements C,H,N,O,S)""" + elements = ["C", "H", "N", "O", "S"] result = [] for seq in seqs: brutto_list = [amino_brutto.get(letter) for letter in seq] brutto_pair = list(zip(*brutto_list)) brutto = [sum(i) for i in brutto_pair] - brutto_dict = dict(zip(elements, brutto)) + brutto_dict = dict(zip(elements, brutto)) result.append(brutto_dict) return result def check_length(seq: str) -> bool: - """ + """ Checks if the sequence is divisible by three. - + Arguments: - seq (str): string of protein sequence - + Return: - - bool: True if sequence is divisible by three, otherwise False - """ + - bool: True if sequence is divisible by three, otherwise False""" seq_len = len(seq) if seq_len % 3 == 0: return True - else: return False + else: + return False def check_amino_acid_three_letter(seq: str) -> bool: - """ + """ Checks whether all elements of a sequence are three-letter amino acid symbols. 
- + Arguments: - seq (str): string of protein sequence - - Return: + + Return: - bool: True if sequence is corresponding to the valid three-letter amino acid, otherwise False - """ - seq = seq.lower() - seq3 = [seq[i:i+3] for i in range(0, len(seq), 3)] - for triplet in seq3: - if triplet not in amino_names_dic.keys(): - return False - else: return True + """ + seq = seq.lower() + seq3 = [seq[i: i + 3] for i in range(0, len(seq), 3)] + for triplet in seq3: + if triplet not in amino_names_dic.keys(): + return False + else: + return True From d11588ab3ced2a2ef98c40eb6b040f604591d9ff Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:38:05 +0300 Subject: [PATCH 74/88] Update protein_analysis_tool.py Return debug --- protein_analysis_tool.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 0e90029..129d83d 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -423,16 +423,14 @@ def check_amino_acid(input_amino: str) -> bool: letter = input_amino if letter not in amino_short_names_dic.keys(): raise ValueError(f"Error {letter} is not an amino acid. Correct your input") - else: - return True + return True elif len(input_amino) == 3: triplet = input_amino if triplet not in amino_names_dic.keys(): raise ValueError( f"Error {triplet} is not an amino acid. Correct your input" ) - else: - return True + return True else: raise ValueError( f"Error {input_amino} is incorrect form of amino acid notation. 
Correct your input" From 935733afe59653d3aea42a7ac990319fa324ca95 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:44:00 +0200 Subject: [PATCH 75/88] Update protein_analysis_tool.py Renamed bool functions --- protein_analysis_tool.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 129d83d..f6f04d3 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -67,7 +67,7 @@ "Cys": "C", } -aa_weights = { +amino_weights = { "A": 89.09, "R": 174.20, "N": 132.12, @@ -210,7 +210,7 @@ def protein_analysis( - length procedure returns list of int values - brutto_count procedure returns list of dictionaries with counts of atoms in the sequence """ - aa_seqs = name_transform(args, letter_format) + amino_acid_seqs = name_transform(args, letter_format) procedures = { "molecular_weight": molecular_weight, "one_letter_to_three": one_letter_to_three, @@ -222,42 +222,42 @@ def protein_analysis( if procedure not in procedures.keys(): raise ValueError("Requested procedure is not defined") elif procedure == "codon_optimization": - return procedures.get(procedure)(aa_seqs, cell_type) + return procedures.get(procedure)(amino_acid_seqs, cell_type) else: - return procedures.get(procedure)(aa_seqs) + return procedures.get(procedure)(amino_acid_seqs) -def molecular_weight(aa_seqs: list) -> list: +def molecular_weight(amino_acid_seqs: list) -> list: """ Calculates predicated molecular weight of aa sequences. 
Arguments: - - aa_seqs (list): list of string with the protein sequences + - amino_acid_seqs (list): list of string with the protein sequences Return: - List of of floats corresponding to the molecular weight in kDa """ molecular_weights = [] - for seq in aa_seqs: + for seq in amino_acid_seqs: total_weight = 0 for aa in seq: aa = aa.upper() - total_weight += aa_weights[aa] + total_weight += amino_weights[aa] molecular_weights.append(round(total_weight / 1000, 2)) return molecular_weights -def one_letter_to_three(aa_seqs: list) -> list: +def one_letter_to_three(amino_acid_seqs: list) -> list: """ Translates one-letter coded amino acid sequences to three-letter coded Arguments: - - aa_seqs (list): list of string with the protein sequences + - amino_acid_seqs (list): list of string with the protein sequences Return: - List of of strings with three-letter coded sequences """ three_letters_seqs = [] - for seq in aa_seqs: + for seq in amino_acid_seqs: three_letters_seq = [] for aa in seq: aa = aa.upper() @@ -381,11 +381,11 @@ def name_transform(seqs: tuple, letter_format: int) -> list: test_three_letters = [] if letter_format == 1: for seq in seqs: - multiple_of_three.append(check_length(seq)) - test_three_letters.append(check_amino_acid_three_letter(seq)) + multiple_of_three.append(is_length_divisible_by_3(seq)) + test_three_letters.append(is_amino_acid_three_letter(seq)) seq = seq.upper() for letter in seq: - if check_amino_acid(letter): + if is_amino_acid(letter): pass result.append(seq) if all(multiple_of_three) and all(test_three_letters): @@ -398,7 +398,7 @@ def name_transform(seqs: tuple, letter_format: int) -> list: seq = seq.lower() seq3 = [seq[i: i + 3] for i in range(0, len(seq), 3)] for triplet in seq3: - if check_amino_acid(triplet): + if is_amino_acid(triplet): pass seq_transformed = "".join([amino_names_dic.get(seq) for seq in seq3]) result.append(seq_transformed) @@ -409,7 +409,7 @@ def name_transform(seqs: tuple, letter_format: int) -> list: ) -def 
check_amino_acid(input_amino: str) -> bool: +def is_amino_acid(input_amino: str) -> bool: """ Checks whether the entered string is an amino acid (either three-letter encoding or one-letter encoded). @@ -457,7 +457,7 @@ def brutto_count(seqs: list) -> list: return result -def check_length(seq: str) -> bool: +def is_length_divisible_by_3(seq: str) -> bool: """ Checks if the sequence is divisible by three. @@ -473,7 +473,7 @@ def check_length(seq: str) -> bool: return False -def check_amino_acid_three_letter(seq: str) -> bool: +def is_amino_acid_three_letter(seq: str) -> bool: """ Checks whether all elements of a sequence are three-letter amino acid symbols. From 5b852b70057b21f4af45c843ef9ff0732cc2dd7c Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:45:24 +0300 Subject: [PATCH 76/88] Update README.md Debug erros output --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a76bb07..e764580 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ Optional argument: ```python protein_analysis("ACD", "AD", procedure="one_letter_to_three", letter_format=1) # ['AlaCysAsp', 'AlaAsp'] protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", letter_format=3) # [0.37, 0.22] -protein_analysis("ACD", "AD", procedure="get_amino_acid_sum", letter_format=1) # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, +protein_analysis("ACD", "AD", procedure="get_amino_acid_sum") # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, # {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] protein_analysis("ACD", "AD", 
procedure="codon_optimization", cell_type = 'E.coli', letter_format=1) # ['GCGTGCGAT', 'GCGGAT'] protein_analysis("acDEFGHIKLMNPQRSTVwy", "ad", procedure="length", letter_format=1) # [20, 2] @@ -77,9 +77,9 @@ protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto_count", lett > `ValueError`('Requested procedure is not defined') # Will occur if proc argument does not correspond to any listed procedure (see List of procedures). > `ValueError`('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse) # Will occur if the cell type is incorrectly entered to optimize codons. > `ValueError`('Error unsupported letter_format. Only letter_formats 1 and 3 are supported') # Will oocur if invalid format of input is given. Please check that you have all the sequences written in the same format (one-letter ir three-letter code). Case of input is not important and can be given in upper, lower or mixed case. -> `ValueError`(Error {letter} is not an amino acid. Correct your input') # Will occur if at least one of the amino acids given in not valid one-letter amino acid (letter_format=1). -> `ValueError`(Error {triplet} is not an amino acid. Correct your input') # Will occur if at least one of the amino acids given in not valid three-letter amino acid (letter_format=3). -> `ValueError`(Error {input_amino} is incorrect form of amino acid notation. Correct your input') # Will occur if input amino acid sequences are not given in correct format. +> `ValueError`('Error {letter} is not an amino acid. Correct your input') # Will occur if at least one of the amino acids given in not valid one-letter amino acid (letter_format=1). +> `ValueError`('Error {triplet} is not an amino acid. Correct your input') # Will occur if at least one of the amino acids given in not valid three-letter amino acid (letter_format=3). +> `ValueError`('Error {input_amino} is incorrect form of amino acid notation. 
Correct your input') # Will occur if input amino acid sequences are not given in correct format. ``` ## Private policy and contacts From a829305eb8fadc116e3b19f29474d43cf9511133 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:51:59 +0200 Subject: [PATCH 77/88] Update README.md Changed functions names in contribution part --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e764580..b2726aa 100644 --- a/README.md +++ b/README.md @@ -102,10 +102,10 @@ Team photo: `Ivan Kozin` (team leader) worte functions: - length - brutto_count -- check_amino_acid +- is_amino_acid - name_transform -- check_amino_acid_three_letter -- check_length +- is_length_divisible_by_3 +- is_amino_acid_three_letter - managed work with guthub repository `Dasha Sokolova` wrote functions: From 7e9486fa9c7534510d937eaf52e5f26fc81eab11 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 13:08:32 +0300 Subject: [PATCH 78/88] Update protein_analysis_tool.py Redesign error message in codon_optimization --- protein_analysis_tool.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index f6f04d3..33cef1c 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -316,7 +316,7 @@ def codon_optimization(protein_sequences: list, cell_type: str) -> list: Return: - List of codon-optimized DNA""" - + if cell_type == "Esherichia coli" or cell_type == "E.coli": codon_optimization_ecoli = [] replacer_ecoli = ecoli_triplets.get @@ -347,8 +347,7 @@ def codon_optimization(protein_sequences: list, cell_type: str) -> list: return codon_optimization_mouse else: raise ValueError( - "The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris," - "Mouse" + f'This {cell_type} is not supported. 
The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse' ) From 1102dba2fb26dbefb04fb711deb0eb9926d75c18 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:14:27 +0200 Subject: [PATCH 79/88] Update protein_analysis_tool.py Change error message of codon optimisation --- protein_analysis_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 33cef1c..7a78487 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -347,7 +347,7 @@ def codon_optimization(protein_sequences: list, cell_type: str) -> list: return codon_optimization_mouse else: raise ValueError( - f'This {cell_type} is not supported. The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse' + f'Type {cell_type} is not supported. The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse' ) From 625fae021f3a1812135134e89ada8ebdb8faaaa4 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:20:10 +0200 Subject: [PATCH 80/88] Update protein_analysis_tool.py Change Note to Warning --- protein_analysis_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 7a78487..2755566 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -389,7 +389,7 @@ def name_transform(seqs: tuple, letter_format: int) -> list: result.append(seq) if all(multiple_of_three) and all(test_three_letters): print( - "Note: all your sequences are similar to three-letter ones. Check the letter_format value" + "Warning: all your sequences are similar to three-letter ones. 
Check the letter_format value" ) return result elif letter_format == 3: From 737c5e68f8a244282e7a3778995a150a787224b2 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 13:27:26 +0300 Subject: [PATCH 81/88] Update README.md Add new chapter --- README.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b2726aa..cbf1113 100644 --- a/README.md +++ b/README.md @@ -71,8 +71,29 @@ protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto_count", lett ``` +## Input requirements and possible errors: + - **It is important to indicate the type of operation. An error occurs when you enter an incorrect operation type** +```python +protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto", letter_format=1) +# ValueError: Requested procedure is not defined +``` + - **To perform the coden_optimization operation, you must enter cell_type (None by default). Otherwise an error message is displayed** +```python +protein_analysis('AlaCysAsp', 'AlaAsp', procedure="codon_optimization", cell_type='Rat', letter_format=3) +# ValueError: Type Rat is not supported. The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse +``` + - **By default, entering amino acid sequences in a single-letter format in any case is supported. To enter in three-letter format in any case, you need to specify letter_format = 3.
If an unknown format is entered, an error message is displayed.** +```python +protein_analysis("ACD", "AD", procedure="one_letter_to_three", cell_type='E.coli', letter_format=2) +# ValueError: Error unsupported letter_format. Only letter_formats 1 and 3 are supported +``` + - **If letter_format = 1 is specified, but all sequences are similar to the three-letter amino slot encoding, a notification will be displayed warning** +```python +protein_analysis("LYSlys", "HishisHis", procedure="get_amino_acid_sum", letter_format=1) +# Warning: all your sequences are similar to three-letter ones. Check the letter_format value +``` -## Possible errors: +### ```python > `ValueError`('Requested procedure is not defined') # Will occur if proc argument does not correspond to any listed procedure (see List of procedures). > `ValueError`('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse) # Will occur if the cell type is incorrectly entered to optimize codons. From d2874db8fc74745abe1dfe693d2db125b72d8619 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 13:35:18 +0300 Subject: [PATCH 82/88] Update README.md Add new errors message description --- README.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index cbf1113..47b9ed7 100644 --- a/README.md +++ b/README.md @@ -92,15 +92,17 @@ protein_analysis("ACD", "AD", procedure="one_letter_to_three", cell_type='E.coli protein_analysis("LYSlys", "HishisHis", procedure="get_amino_acid_sum", letter_format=1) # Warning: all your sequences are similar to three-letter ones. 
Check the letter_format value ``` - -### + - **If a single-letter amino acid input format is specified, but at least one amino acid slot is not standard or is written incorrectly, an error message is displayed** +```python +protein_analysis("BBB", procedure="get_amino_acid_sum", letter_format=1)) +# ValueError: Error B is not an amino acid. Correct your input +``` + - **If a three-letter amino acid input format is specified, but at least one amino acid slot is not standard or is written incorrectly, an error message is displayed** ```python -> `ValueError`('Requested procedure is not defined') # Will occur if proc argument does not correspond to any listed procedure (see List of procedures). -> `ValueError`('The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse) # Will occur if the cell type is incorrectly entered to optimize codons. -> `ValueError`('Error unsupported letter_format. Only letter_formats 1 and 3 are supported') # Will oocur if invalid format of input is given. Please check that you have all the sequences written in the same format (one-letter ir three-letter code). Case of input is not important and can be given in upper, lower or mixed case. -> `ValueError`('Error {letter} is not an amino acid. Correct your input') # Will occur if at least one of the amino acids given in not valid one-letter amino acid (letter_format=1). -> `ValueError`('Error {triplet} is not an amino acid. Correct your input') # Will occur if at least one of the amino acids given in not valid three-letter amino acid (letter_format=3). -> `ValueError`('Error {input_amino} is incorrect form of amino acid notation. Correct your input') # Will occur if input amino acid sequences are not given in correct format. +protein_analysis("Al", procedure="get_amino_acid_sum", letter_format=3) +# ValueError: Error al is incorrect form of amino acid notation. 
Correct your input +protein_analysis("AluLysArg", procedure="get_amino_acid_sum", letter_format=3) +# ValueError: Error alu is not an amino acid. Correct your input ``` ## Private policy and contacts @@ -129,11 +131,11 @@ Team photo: - is_amino_acid_three_letter - managed work with guthub repository -`Dasha Sokolova` wrote functions: +`Dasha Sokolova` (co-leader) wrote functions: - get_amino_acid_sum - codon_optimization functions -`Yulia Volkova` wrote functions: +`Yulia Volkova` (co-leader) wrote functions: - main (protein_analysis) - molecular_weight - one_letter_to_three functions From 07616dfb4b3a27086c40752cc9687e91bab7da14 Mon Sep 17 00:00:00 2001 From: yvolko <144178378+yvolko@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:38:16 +0200 Subject: [PATCH 83/88] Update README.md Correct general info --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 47b9ed7..07ce251 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,15 @@ # Protein Info +This tool supports standard 20 amino acids. Any modifications of amino acids are not supported. You can write amino acids in any case (lower, upper or mixed). This project consists of one function "protein_analysis" that helps user to: - predict molecular weight of amino acid (aa) sequences - translate aa sequences from one-letter to three-letter code - calculate total amount of each amino acid in the sequences - make DNA based codon optimization for the introduced amino acid sequences with the support for 3 cell types: Esherichia coli, Pichia pastoris, Mouse - calculate length of amino acid sequences -- count the number of atoms of each type in a sequence (brutto formula) +- count the number of atoms of each type in a sequence (brutto formula)
-## Technology: - -python +Tool is coded with Python. ## How to use: **protein_analysis**
From 1a60b29aeb47fd72e9d5af2e869c1755a12f6939 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 13:39:50 +0300 Subject: [PATCH 84/88] Update README.md last update --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 07ce251..4a5b56b 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,7 @@ This project consists of one function "protein_analysis" that helps user to: Tool is coded with Python. ## How to use: -**protein_analysis**
-protein_analysis(**args, procedure, cell_type=None, letter_format=1*)
+**protein_analysis**(**args, procedure, cell_type=None, letter_format=1*)
**Parametrs:** > ***args** : **sequence of str**
>     Any number of lines with amino acid sequences
@@ -64,9 +63,8 @@ protein_analysis("ACD", "AD", procedure="get_amino_acid_sum") # [{'A': 1, 'C': 1 # {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', letter_format=1) # ['GCGTGCGAT', 'GCGGAT'] protein_analysis("acDEFGHIKLMNPQRSTVwy", "ad", procedure="length", letter_format=1) # [20, 2] -protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto_count", letter_format=1) # [{'C': 54, 'H': 103, 'N': 15, 'O': 22, 'S': 1}, - # {'C': 48, 'H': 83, 'N': 23, 'O': 18, 'S': 3}, - # {'C': 11, 'H': 22, 'N': 4, 'O': 9, 'S': 0}] +protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto_count", letter_format=1) +# [{'C': 54, 'H': 103, 'N': 15, 'O': 22, 'S': 1}, # {'C': 48, 'H': 83, 'N': 23, 'O': 18, 'S': 3}, # {'C': 11, 'H': 22, 'N': 4, 'O': 9, 'S': 0}] ``` From b0f3b4936c2b7ca67a1ac9d7664b9b280416f36f Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sun, 1 Oct 2023 13:40:58 +0300 Subject: [PATCH 85/88] Update README.md Latest update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a5b56b..9fdff9d 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ protein_analysis("ACD", "AD", procedure="get_amino_acid_sum") # [{'A': 1, 'C': 1 protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', letter_format=1) # ['GCGTGCGAT', 'GCGGAT'] protein_analysis("acDEFGHIKLMNPQRSTVwy", "ad", procedure="length", letter_format=1) # [20, 2] protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto_count", letter_format=1) -# [{'C': 54, 'H': 103, 'N': 15, 'O': 22, 'S': 1}, # {'C': 48, 'H': 83, 'N': 23, 'O': 18, 'S': 3}, # {'C': 11, 'H': 22, 'N': 4, 'O': 9, 'S': 0}] +# [{'C': 54, 'H': 103, 'N': 15, 'O': 22, 'S': 1}, {'C': 48, 'H': 83, 'N': 23, 'O': 
18, 'S': 3}, {'C': 11, 'H': 22, 'N': 4, 'O': 9, 'S': 0}] ``` From ae0d1327d5a1faf102ac6106c69eec86d4954b72 Mon Sep 17 00:00:00 2001 From: Sokolova Dasha <144246645+stegodasha@users.noreply.github.com> Date: Sat, 7 Oct 2023 20:42:01 +0300 Subject: [PATCH 86/88] Update amino_acid_sum and codon_optimization --- protein_analysis_tool.py | 72 ++++++++++------------------------------ 1 file changed, 17 insertions(+), 55 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 2755566..62d19e3 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -277,28 +277,7 @@ def get_amino_acid_sum(protein_sequences: list) -> list: - List of dictionary with amino acid amount""" result = [] for protein_sequence in range(len(protein_sequences)): - amino_acid_count = { - "A": 0, - "C": 0, - "D": 0, - "E": 0, - "F": 0, - "G": 0, - "H": 0, - "I": 0, - "K": 0, - "L": 0, - "M": 0, - "N": 0, - "P": 0, - "Q": 0, - "R": 0, - "S": 0, - "T": 0, - "V": 0, - "W": 0, - "Y": 0, - } + amino_acid_count = dict([(key, 0) for key in amino_short_names_dic.keys()]) for amino_acid in protein_sequences[protein_sequence]: amino_acid_count[amino_acid] += 1 result.append(amino_acid_count) @@ -316,40 +295,23 @@ def codon_optimization(protein_sequences: list, cell_type: str) -> list: Return: - List of codon-optimized DNA""" - - if cell_type == "Esherichia coli" or cell_type == "E.coli": - codon_optimization_ecoli = [] - replacer_ecoli = ecoli_triplets.get - for amino_acid in range(len(protein_sequences)): - codon_optimization_ecoli += [ - "".join([replacer_ecoli(n, n) for n in protein_sequences[amino_acid]]) - ] - return codon_optimization_ecoli - - if cell_type == "Pichia pastoris" or cell_type == "P.pastoris": - codon_optimization_ppastoris = [] - replacer_ppastoris = ppastoris_triplets.get - for amino_acid in range(len(protein_sequences)): - codon_optimization_ppastoris += [ - "".join( - [replacer_ppastoris(n, n) for n in protein_sequences[amino_acid]] - ) - ] - 
return codon_optimization_ppastoris - - if cell_type == "Mouse" or cell_type == "mouse": - codon_optimization_mouse = [] - replacer_mouse = mouse_triplets.get - for amino_acid in range(len(protein_sequences)): - codon_optimization_mouse += [ - "".join([replacer_mouse(n, n) for n in protein_sequences[amino_acid]]) - ] - return codon_optimization_mouse + cell_types = {"Esherichia coli": ecoli_triplets, "E.coli": ecoli_triplets, + "Pichia pastoris" : ppastoris_triplets, "P.pastoris" : ppastoris_triplets, + "Mouse" : mouse_triplets, "mouse" : mouse_triplets} + list_cell_type = ["Esherichia coli", "E.coli","Pichia pastoris","P.pastoris", "Mouse","mouse"] + if cell_type in list_cell_type: + codon_optimization_post = [] + using_key = cell_types[cell_type] + for sequence in protein_sequences: + codon_optimization_pre = [] + for amino_acid in sequence: + codon_optimization_pre += using_key[amino_acid] + codon_optimization_post.append(''.join(codon_optimization_pre)) + return codon_optimization_post else: - raise ValueError( - f'Type {cell_type} is not supported. The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse' - ) - + raise ValueError( f'Type {cell_type} is not supported. ' + f'The following types of organisms are available for codon optimization: ' + f'Esherichia coli, Pichia pastoris, Mouse' ) def length(seqs: list) -> list: """ From 83ac5f9914c5c8a404fd910e13f7ae8db69c44ad Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 7 Oct 2023 23:51:09 +0300 Subject: [PATCH 87/88] Update protein_analysis_tool.py The outputs of the is_amino_acid function have been changed to True and False. 
Removed unimportant "elses" --- protein_analysis_tool.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index 62d19e3..e77664d 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -346,8 +346,10 @@ def name_transform(seqs: tuple, letter_format: int) -> list: test_three_letters.append(is_amino_acid_three_letter(seq)) seq = seq.upper() for letter in seq: - if is_amino_acid(letter): - pass + if not is_amino_acid(letter): + raise ValueError( + f"Error {letter} is not an amino acid. Correct your input" + ) result.append(seq) if all(multiple_of_three) and all(test_three_letters): print( @@ -359,8 +361,10 @@ def name_transform(seqs: tuple, letter_format: int) -> list: seq = seq.lower() seq3 = [seq[i: i + 3] for i in range(0, len(seq), 3)] for triplet in seq3: - if is_amino_acid(triplet): - pass + if not is_amino_acid(triplet): + raise ValueError( + f"Error {triplet} is not an amino acid. Correct your input" + ) seq_transformed = "".join([amino_names_dic.get(seq) for seq in seq3]) result.append(seq_transformed) return result @@ -383,19 +387,15 @@ def is_amino_acid(input_amino: str) -> bool: if len(input_amino) == 1: letter = input_amino if letter not in amino_short_names_dic.keys(): - raise ValueError(f"Error {letter} is not an amino acid. Correct your input") + return False return True elif len(input_amino) == 3: triplet = input_amino if triplet not in amino_names_dic.keys(): - raise ValueError( - f"Error {triplet} is not an amino acid. Correct your input" - ) + return False return True else: - raise ValueError( - f"Error {input_amino} is incorrect form of amino acid notation. 
Correct your input" - ) + return False def brutto_count(seqs: list) -> list: @@ -430,8 +430,7 @@ def is_length_divisible_by_3(seq: str) -> bool: seq_len = len(seq) if seq_len % 3 == 0: return True - else: - return False + return False def is_amino_acid_three_letter(seq: str) -> bool: @@ -449,5 +448,4 @@ def is_amino_acid_three_letter(seq: str) -> bool: for triplet in seq3: if triplet not in amino_names_dic.keys(): return False - else: - return True + return True From d1af9b47322507bf3ca29cc5bdf4070002a104a7 Mon Sep 17 00:00:00 2001 From: Ivan Kozin <63678919+ivandkoz@users.noreply.github.com> Date: Sat, 7 Oct 2023 23:57:18 +0300 Subject: [PATCH 88/88] Update protein_analysis_tool.py Changing case in constants to uppercase --- protein_analysis_tool.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/protein_analysis_tool.py b/protein_analysis_tool.py index e77664d..faa0894 100644 --- a/protein_analysis_tool.py +++ b/protein_analysis_tool.py @@ -1,4 +1,4 @@ -amino_short_names_dic = { +AMINO_SHORT_NAMES_DIC = { "A": "Ala", "R": "Arg", "N": "Asn", @@ -21,7 +21,7 @@ "C": "Cys", } -amino_names_dic = { +AMINO_NAMES_DIC = { "ala": "A", "arg": "R", "asn": "N", @@ -44,7 +44,7 @@ "cys": "C", } -amino_names_dic_reverse = { +AMINO_NAMES_DIC_REVERSE = { "Ala": "A", "Arg": "R", "Asn": "N", @@ -67,7 +67,7 @@ "Cys": "C", } -amino_weights = { +AMINO_WEIGHTS = { "A": 89.09, "R": 174.20, "N": 132.12, @@ -90,7 +90,7 @@ "V": 117.15, } -amino_brutto = { +AMINO_BRUTTO = { "A": (3, 7, 1, 2, 0), "R": (6, 14, 4, 2, 0), "N": (4, 8, 2, 3, 0), @@ -113,7 +113,7 @@ "C": (3, 7, 1, 2, 1), } -ecoli_triplets = { +ECOLI_TRIPLETS = { "A": "GCG", "C": "TGC", "D": "GAT", @@ -136,7 +136,7 @@ "Y": "TAT", } -ppastoris_triplets = { +PPASTORIS_TRIPLETS = { "A": "GCT", "C": "TGT", "D": "GAT", @@ -159,7 +159,7 @@ "Y": "TAC", } -mouse_triplets = { +MOUSE_TRIPLETS = { "A": "GCC", "C": "TGC", "D": "GAC", @@ -242,7 +242,7 @@ def 
molecular_weight(amino_acid_seqs: list) -> list: total_weight = 0 for aa in seq: aa = aa.upper() - total_weight += amino_weights[aa] + total_weight += AMINO_WEIGHTS[aa] molecular_weights.append(round(total_weight / 1000, 2)) return molecular_weights @@ -261,7 +261,7 @@ def one_letter_to_three(amino_acid_seqs: list) -> list: three_letters_seq = [] for aa in seq: aa = aa.upper() - three_letters_seq.append(amino_short_names_dic[aa]) + three_letters_seq.append(AMINO_SHORT_NAMES_DIC[aa]) three_letters_seqs.append("".join(three_letters_seq)) return three_letters_seqs @@ -277,7 +277,7 @@ def get_amino_acid_sum(protein_sequences: list) -> list: - List of dictionary with amino acid amount""" result = [] for protein_sequence in range(len(protein_sequences)): - amino_acid_count = dict([(key, 0) for key in amino_short_names_dic.keys()]) + amino_acid_count = dict([(key, 0) for key in AMINO_SHORT_NAMES_DIC.keys()]) for amino_acid in protein_sequences[protein_sequence]: amino_acid_count[amino_acid] += 1 result.append(amino_acid_count) @@ -295,9 +295,9 @@ def codon_optimization(protein_sequences: list, cell_type: str) -> list: Return: - List of codon-optimized DNA""" - cell_types = {"Esherichia coli": ecoli_triplets, "E.coli": ecoli_triplets, - "Pichia pastoris" : ppastoris_triplets, "P.pastoris" : ppastoris_triplets, - "Mouse" : mouse_triplets, "mouse" : mouse_triplets} + cell_types = {"Esherichia coli": ECOLI_TRIPLETS, "E.coli": ECOLI_TRIPLETS, + "Pichia pastoris" : PPASTORIS_TRIPLETS, "P.pastoris" : PPASTORIS_TRIPLETS, + "Mouse" : MOUSE_TRIPLETS, "mouse" : MOUSE_TRIPLETS} list_cell_type = ["Esherichia coli", "E.coli","Pichia pastoris","P.pastoris", "Mouse","mouse"] if cell_type in list_cell_type: codon_optimization_post = [] @@ -365,7 +365,7 @@ def name_transform(seqs: tuple, letter_format: int) -> list: raise ValueError( f"Error {triplet} is not an amino acid. 
Correct your input" ) - seq_transformed = "".join([amino_names_dic.get(seq) for seq in seq3]) + seq_transformed = "".join([AMINO_NAMES_DIC.get(seq) for seq in seq3]) result.append(seq_transformed) return result else: @@ -386,12 +386,12 @@ def is_amino_acid(input_amino: str) -> bool: """ if len(input_amino) == 1: letter = input_amino - if letter not in amino_short_names_dic.keys(): + if letter not in AMINO_SHORT_NAMES_DIC.keys(): return False return True elif len(input_amino) == 3: triplet = input_amino - if triplet not in amino_names_dic.keys(): + if triplet not in AMINO_NAMES_DIC.keys(): return False return True else: @@ -410,7 +410,7 @@ def brutto_count(seqs: list) -> list: elements = ["C", "H", "N", "O", "S"] result = [] for seq in seqs: - brutto_list = [amino_brutto.get(letter) for letter in seq] + brutto_list = [AMINO_BRUTTO.get(letter) for letter in seq] brutto_pair = list(zip(*brutto_list)) brutto = [sum(i) for i in brutto_pair] brutto_dict = dict(zip(elements, brutto)) @@ -446,6 +446,6 @@ def is_amino_acid_three_letter(seq: str) -> bool: seq = seq.lower() seq3 = [seq[i: i + 3] for i in range(0, len(seq), 3)] for triplet in seq3: - if triplet not in amino_names_dic.keys(): + if triplet not in AMINO_NAMES_DIC.keys(): return False return True