diff --git a/README.md b/README.md index f918170..9fdff9d 100644 --- a/README.md +++ b/README.md @@ -1,65 +1,143 @@ -# HW 4. Functions 2 -> *This is the repo for the fourth homework of the BI Python 2023 course* +# Protein Info + +This tool supports standard 20 amino acids. Any modifications of amino acids are not supported. You can write amino acids in any case (lower, upper or mixed). +This project consists of one function "protein_analysis" that helps user to: +- predict molecular weight of amino acid (aa) sequences +- translate aa sequences from one-letter to three-letter code +- calculate total amount of each amino acid in the sequences +- make DNA based codon optimization for the introduced amino acid sequences with the support for 3 cell types: Esherichia coli, Pichia pastoris, Mouse +- calculate length of amino acid sequences +- count the number of atoms of each type in a sequence (brutto formula)
+ +Tool is coded with Python. + +## How to use: +**protein_analysis**(**args, procedure, cell_type=None, letter_format=1*)
+**Parameters:** +> ***args** : **sequence of str**
+>     Any number of lines with amino acid sequences
+ **procedure** : ***str***
+>     The name of the operation you want to perform. The following types of procedures are supported:
+>> +>> - ***molecular_weight***: calculates predicted molecular weight of amino acid sequences in kDa +>> - ***one_letter_to_three***: translates aa sequences from one-letter to three-letter code +>> - ***get_amino_acid_sum***: calculates total amount of each amino acid in the sequences +>> - ***codon_optimization***: makes DNA based codon optimization for the introduced amino acid sequences, supports 3 types of cells. Can only be used in conjunction with **cell_type**: `Esherichia coli`, `Pichia pastoris`, `Mouse` +>> - ***length***: calculates length of amino acid sequences +>> - ***brutto_count***: counts the number of atoms of each type in a sequence +>> +> **cell_type** : ***str, default None***
+>     The type of cells for which optimization is applied. Cell types supported:
+>> +>> - `Esherichia coli` *or* `E.coli` +>> - `Pichia pastoris` *or* `P.pastoris` +>> - `Mouse` *or* `mouse` +>> +> **letter_format** : ***int, default 1***
+>     Specifies the format for receiving amino acid sequences. Either one-letter (**letter_format** = 1) or three-letter sequences (**letter_format** = 3)
+> + +Call the "protein_analysis" function with the following arguments. +Required arguments: +- one or more protein sequences written in one-letter or three-letter code without stop codons. Please do not use sequences in different formats in the same function call! +- name of procedure as string (see list of procedures) +- format of code for the protein sequences as int: 1 for one letter, 3 for three letter code (may be omitted, defaults to 1) +Optional argument: +- cell type (required only for codon_optimization procedure). Accepted cell types Esherichia coli, Pichia pastoris, Mouse + +## List of procedures: + +- `molecular_weight` — returns list of float values, that indicate predicted molecular weights of given aa sequences (in kDa) +- `one_letter_to_three` — will return list of strings, containing the same sequences written in three-letter code +- `get_amino_acid_sum` — counts the amount of each amino acid in the injected protein sequences +- `codon_optimization` — makes codon-optimized DNA based on the introduced amino acid sequences for 3 types of cells: Esherichia coli, Pichia pastoris, Mouse +- `length` — calculates length of amino acid sequences +- `brutto_count` — counts the number of atoms of each type in a sequence + +## Example of use: + +```python +protein_analysis("ACD", "AD", procedure="one_letter_to_three", letter_format=1) # ['AlaCysAsp', 'AlaAsp'] +protein_analysis("AlaAspLys", "AlaAsp", procedure="molecular_weight", letter_format=3) # [0.37, 0.22] +protein_analysis("ACD", "AD", procedure="get_amino_acid_sum") # [{'A': 1, 'C': 1, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}, + # {'A': 1, 'C': 0, 'D': 1, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}] +protein_analysis("ACD", "AD", procedure="codon_optimization", cell_type = 'E.coli', letter_format=1) # ['GCGTGCGAT', 'GCGGAT'] 
+protein_analysis("acDEFGHIKLMNPQRSTVwy", "ad", procedure="length", letter_format=1) # [20, 2] +protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto_count", letter_format=1) +# [{'C': 54, 'H': 103, 'N': 15, 'O': 22, 'S': 1}, {'C': 48, 'H': 83, 'N': 23, 'O': 18, 'S': 3}, {'C': 11, 'H': 22, 'N': 4, 'O': 9, 'S': 0}] +``` + + +## Input requirements and possible errors: + - **It is important to indicate the type of operation. An error occurs when you enter an incorrect operation type** +```python +protein_analysis("FGHIKLMNPQ", "PQRSTVwy", "adN", procedure="brutto", letter_format=1) +# ValueError: Requested procedure is not defined +``` + - **To perform the codon_optimization operation, you must specify a supported cell_type (None by default). Otherwise an error message is displayed** +```python +protein_analysis('AlaCysAsp', 'AlaAsp', procedure="codon_optimization", cell_type='Rat', letter_format=3) +# ValueError: Type Rat is not supported. The following types of organisms are available for codon optimization: Esherichia coli, Pichia pastoris, Mouse +``` + - **By default, entering amino acid sequences in a single-letter format in any case is supported. To enter in three-letter format in any case, you need to specify letter_format = 3.
 If an unknown format is entered, an error message is displayed.** +```python +protein_analysis("ACD", "AD", procedure="one_letter_to_three", cell_type='E.coli', letter_format=2) +# ValueError: Error unsupported letter_format. Only letter_formats 1 and 3 are supported +``` + - **If letter_format = 1 is specified, but all sequences look like three-letter amino acid encoding, a warning is displayed** +```python +protein_analysis("LYSlys", "HishisHis", procedure="get_amino_acid_sum", letter_format=1) +# Warning: all your sequences are similar to three-letter ones. Check the letter_format value +``` + - **If a single-letter amino acid input format is specified, but at least one amino acid slot is not standard or is written incorrectly, an error message is displayed** +```python +protein_analysis("BBB", procedure="get_amino_acid_sum", letter_format=1) +# ValueError: Error B is not an amino acid. Correct your input +``` + - **If a three-letter amino acid input format is specified, but at least one amino acid slot is not standard or is written incorrectly, an error message is displayed** +```python +protein_analysis("Al", procedure="get_amino_acid_sum", letter_format=3) +# ValueError: Error al is not an amino acid. Correct your input +protein_analysis("AluLysArg", procedure="get_amino_acid_sum", letter_format=3) +# ValueError: Error alu is not an amino acid. Correct your input +``` + +## Private policy and contacts +This tool can be freely distributed and used. +
+If you have any suggestions for improving the tool or if you find a bug, please contact us by email. +
+This tool was developed by the "workaholics" team: +
+Yulia Volkova volkova.yulia.leonidovna@gmail.com +
+Dasha Sokolova kalabanova_dasha@mail.ru +
+Team leader: Ivan Kozin ivan.d.kozin@gmail.com +
+Team photo: +![Снимок экрана 2023-09-29 210559_2](https://github.com/ivandkoz/HW4_Functions2_Kozin/assets/63678919/ad1302a1-d139-4c82-b7eb-d5b9ac1897e8) + +## Personal contribution +`Ivan Kozin` (team leader) wrote functions: +- length +- brutto_count +- is_amino_acid +- name_transform +- is_length_divisible_by_3 +- is_amino_acid_three_letter +- managed work with GitHub repository + +`Dasha Sokolova` (co-leader) wrote functions: +- get_amino_acid_sum +- codon_optimization functions + +`Yulia Volkova` (co-leader) wrote functions: +- main (protein_analysis) +- molecular_weight +- one_letter_to_three functions + +Writing the README, debugging the code and testing it were done by the joint efforts of the whole team. -### Homework description -На прошлой неделе вы делали утилиту для работы с последовательностями нуклеиновых кислот (с весьма строгим ТЗ). Пришло время для чего-то более самостоятельного. -#### Основное задание - - -Напишите утилиту для работы с последовательностями белков. Там должно быть минимум 5 различных операций, должна быть какая-то точка входа через которую пользователь будет всё это дело использовать. На этом, по сути, всё. Всё целиком зависит от вашей фантазии и креативности. Можете опираться на ДЗ №2 и №3. - -Самая главная часть задания - это файл `README.md`. Сделайте краткое введение, напишите описание тула, приведите документацию по использованию со списком аргументов. Добавьте примеры использования. Возможно, вы захотите сделать секцию Troubleshooting. ***Почему это нужно?*** В этот раз проверяющий не будет знать того, как должен работать ваш тул. Это ваш авторский код. Даже самая прекрасная функциональность, не будучи отраженной в README, скорее всего останется незамеченной. README - это ваш способ познакомить пользователя с тулом, показать всё лучше и обосновать, почему именно ваша команда должна получить наивысший балл. - -Есть люди которые, любят писать документации, а есть те - кто не любит. Найдите в вашей команде того, кто любит. 
И в будущем в своих рабочих проектах всегда держите рядом такого человек (или будьте им). - -Примеры некоторых README, которыми можно вдохновляться: - -- [MetaFX](https://github.com/ctlab/metafx), тул Артёма Иванова. Там еще и [wiki](https://github.com/ctlab/metafx/wiki) крутое. -- [samovar](https://github.com/nvaulin/samovar) -- [MetaGEM](https://github.com/franciscozorrilla/metaGEM) -- [Pharokka](https://github.com/gbouras13/pharokka) - -Типовые секции, на которые стоит обратить внимание: Title, Overview, Usage, Options, Examples, Troubleshooting, Contacts. - -**Tехническое требование к заданию.** - -Это задание будет выполняться в командах по 3 человека. Каждый из членов команды должен внести ***как минимум*** 2 функции. Каждое внесение функции должно сопровождаться коммитом с осмысленным описанием коммита. Ниже приведена последовательность действий для успешного выполнения задания (аналогично ДЗ №2): - -1. Посмотрите состав своей команды здесь ([**ССЫЛКА**](https://docs.google.com/spreadsheets/d/1KMBBBu8LqauRpDJb0v1ldPwpvzNn8-KakcHexAcqLsE/edit?usp=sharing)). -2. Тимлид делает форк данного репозитория. **В форке создает ветку `HW4_`, в ветке создает папку `HW4_`, в этой папке вы всё делаете.** -3. Члены команды могут либо делать свои форки, либо работать в репозитории тимлида в качестве колабораторов ("contributors"). В любом случае делаете клоны => пишите код локально => пушите. -4. В конце тимлид делайет pull-request из `HW4_` своего репозитория в `main` этого. - - -А также: -- Сопроводите программу лучшим `README.md` файлом в вашей жизни (на английском языке). -- В этом ДЗ проблемы с качеством кода (нейминги, пустые строки, анноатции типов, док.стринги, пробелы) могут привести к снижению балла. Воспользуйтесь линтерами чтобы себя обезопасить. IDE по типу PyCharm или VSCode имеют фунцонал по авто-исправлению многих проблем такого рода. 
- -Автотестов на GitHub в этом ДЗ нет, но вы можете прогнать линтеры на качество кода локально (как в ДЗ №3, подробнее читайте [тут](https://plausible-cannon-091.notion.site/Code-auto-checks-02b2ea69c1d545fca07b50ce5933ed5f?pvs=4)). - -- Программа должна сохранять регистр символов. -- Программа должна работать только с последовательностями белков. -- Запрещается использование сторонних модулей. - - -### Форма сдачи - -Прикрепите ссылку на pull-request тимлида в Google Class (можете сделать от лица каждого члена команды, но это не обязательно). - - -### Pазбалловка - -- За каждую из 5 операций - максимум **1.5 балла** -- За README - максимум **2.5 балла** -- Если вы не внесли как минимум 2 функции от себя, вы получаете 0 баллов (на баллы остальных членов команды это не влияет). -- За фото созвона в README можно получить 0.2 доп. балла (но не более 10 баллов суммарно) - - - -### **Предполагаемый учебный результат** - -Это задание позволит вам проявить креативность и учиться быть не только кодером, но и автором. Также это задание поможет окончательно закрепить материал по функциям который мы прошли. - -Удачи! 
"""Protein sequence analysis tool.

Single entry point: protein_analysis(). It validates and normalises the input
amino acid sequences (one-letter or three-letter code, any case) and dispatches
to one of the procedures defined in this module.
"""
from typing import Optional

# One-letter code -> three-letter code for the 20 standard amino acids.
AMINO_SHORT_NAMES_DIC = {
    "A": "Ala",
    "R": "Arg",
    "N": "Asn",
    "D": "Asp",
    "V": "Val",
    "H": "His",
    "G": "Gly",
    "Q": "Gln",
    "E": "Glu",
    "I": "Ile",
    "L": "Leu",
    "K": "Lys",
    "M": "Met",
    "P": "Pro",
    "S": "Ser",
    "Y": "Tyr",
    "T": "Thr",
    "W": "Trp",
    "F": "Phe",
    "C": "Cys",
}

# Lower-case three-letter code -> one-letter code (derived, so the tables
# cannot drift apart).
AMINO_NAMES_DIC = {
    three.lower(): one for one, three in AMINO_SHORT_NAMES_DIC.items()
}

# Capitalised three-letter code -> one-letter code.
AMINO_NAMES_DIC_REVERSE = {
    three: one for one, three in AMINO_SHORT_NAMES_DIC.items()
}

# Molecular weight of each free amino acid; sums are divided by 1000 and
# reported as kDa by molecular_weight().
AMINO_WEIGHTS = {
    "A": 89.09,
    "R": 174.20,
    "N": 132.12,
    "D": 133.10,
    "C": 121.16,
    "E": 147.13,
    "Q": 146.15,
    "G": 75.07,
    "H": 155.16,
    "I": 131.18,
    "L": 131.18,
    "K": 146.19,
    "M": 149.21,
    "F": 165.19,
    "P": 115.13,
    "S": 105.09,
    "T": 119.12,
    "W": 204.23,
    "Y": 181.19,
    "V": 117.15,
}

# Element order used by every AMINO_BRUTTO tuple and by brutto_count().
_BRUTTO_ELEMENTS = ("C", "H", "N", "O", "S")

# Atom counts (C, H, N, O, S) of each free amino acid.
AMINO_BRUTTO = {
    "A": (3, 7, 1, 2, 0),
    "R": (6, 14, 4, 2, 0),
    "N": (4, 8, 2, 3, 0),
    "D": (4, 7, 1, 4, 0),
    "V": (5, 11, 1, 2, 0),
    "H": (6, 9, 3, 2, 0),
    "G": (2, 5, 1, 2, 0),
    "Q": (5, 10, 2, 3, 0),
    "E": (5, 9, 1, 4, 0),
    "I": (6, 13, 1, 2, 0),
    "L": (6, 13, 1, 2, 0),
    "K": (6, 14, 2, 2, 0),
    "M": (5, 11, 1, 2, 1),
    "P": (5, 9, 1, 2, 0),
    "S": (3, 7, 1, 3, 0),
    "Y": (9, 11, 1, 3, 0),
    # Threonine is C4H9NO3. The previous 6-item tuple (4, 9, 11, 1, 3, 0)
    # shifted every count after H and reported N=11, O=1, S=3.
    "T": (4, 9, 1, 3, 0),
    "W": (11, 12, 2, 2, 0),
    "F": (9, 11, 1, 2, 0),
    "C": (3, 7, 1, 2, 1),
}

ECOLI_TRIPLETS = {
    "A": "GCG",
    "C": "TGC",
    "D": "GAT",
    "E": "GAA",
    "F": "TTT",
    "G": "GGC",
    "H": "CAT",
    "I": "ATT",
    "K": "AAA",
    "L": "CTG",
    "M": "ATG",
    "N": "AAC",
    "P": "CCG",
    "Q": "CAG",
    "R": "CGT",
    "S": "AGC",
    "T": "ACC",
    "V": "GTG",
    "W": "TGG",
    "Y": "TAT",
}

PPASTORIS_TRIPLETS = {
    "A": "GCT",
    "C": "TGT",
    "D": "GAT",
    "E": "GAA",
    "F": "TTT",
    "G": "GGT",
    "H": "CAT",
    "I": "ATT",
    "K": "AAG",
    "L": "TTG",
    "M": "ATG",
    "N": "AAC",
    "P": "CCA",
    "Q": "CAA",
    "R": "AGA",
    "S": "TCT",
    "T": "ACT",
    "V": "GTT",
    "W": "TGG",
    "Y": "TAC",
}

MOUSE_TRIPLETS = {
    "A": "GCC",
    "C": "TGC",
    "D": "GAC",
    "E": "GAG",
    "F": "TTC",
    "G": "GGC",
    "H": "CAC",
    "I": "ATC",
    "K": "AAG",
    "L": "CTG",
    "M": "ATG",
    "N": "AAC",
    "P": "CCC",
    "Q": "CAG",
    "R": "CGG",
    "S": "AGC",
    "T": "ACC",
    "V": "GTG",
    "W": "TGG",
    "Y": "TAC",
}

# Accepted cell_type spellings -> codon table. "Esherichia coli" is the
# historical (misspelled) key and is kept for backward compatibility;
# "Escherichia coli" is the correctly spelled alias.
_CELL_TYPE_TRIPLETS = {
    "Esherichia coli": ECOLI_TRIPLETS,
    "Escherichia coli": ECOLI_TRIPLETS,
    "E.coli": ECOLI_TRIPLETS,
    "Pichia pastoris": PPASTORIS_TRIPLETS,
    "P.pastoris": PPASTORIS_TRIPLETS,
    "Mouse": MOUSE_TRIPLETS,
    "mouse": MOUSE_TRIPLETS,
}


def protein_analysis(
    *args: str,
    procedure: str,
    cell_type: Optional[str] = None,
    letter_format: int = 1,
) -> list:
    """
    Run one analysis procedure on one or more amino acid sequences.

    Supported procedure names:
    - molecular_weight: predicted molecular weight of each sequence in kDa
    - one_letter_to_three: sequences rewritten in three-letter code
    - get_amino_acid_sum: count of every amino acid in each sequence
    - codon_optimization: codon-optimized DNA for the given cell_type
    - length: number of residues in each sequence
    - brutto_count: number of C, H, N, O, S atoms in each sequence

    Arguments:
    - args (str): one or more protein sequences, all in one-letter or all in
      three-letter code (not mixed), in any case
    - procedure (str): name of the procedure to run
    - cell_type (str): cell type, used only by codon_optimization
    - letter_format (int): 1 for one-letter input, 3 for three-letter input

    Return:
    - list with one result per input sequence (floats, strings, ints or
      dictionaries depending on the procedure)

    Raises:
    - ValueError: unknown procedure, unsupported letter_format, invalid
      amino acid in the input, or unsupported cell_type
    """
    procedures = {
        "molecular_weight": molecular_weight,
        "one_letter_to_three": one_letter_to_three,
        "get_amino_acid_sum": get_amino_acid_sum,
        "codon_optimization": codon_optimization,
        "length": length,
        "brutto_count": brutto_count,
    }
    # Check the cheap argument first so a wrong procedure name is reported
    # before any sequence validation (and before any warning is printed).
    if procedure not in procedures:
        raise ValueError("Requested procedure is not defined")
    amino_acid_seqs = name_transform(args, letter_format)
    if procedure == "codon_optimization":
        return codon_optimization(amino_acid_seqs, cell_type)
    return procedures[procedure](amino_acid_seqs)


def molecular_weight(amino_acid_seqs: list) -> list:
    """
    Calculates predicted molecular weight of amino acid sequences.

    The weight is the plain sum of the AMINO_WEIGHTS entries of the residues;
    no correction for water released in peptide bonds is applied.

    Arguments:
    - amino_acid_seqs (list): list of strings with one-letter protein sequences

    Return:
    - list of floats: molecular weight of each sequence in kDa, rounded to
      two decimal places
    """
    return [
        round(sum(AMINO_WEIGHTS[aa.upper()] for aa in seq) / 1000, 2)
        for seq in amino_acid_seqs
    ]


def one_letter_to_three(amino_acid_seqs: list) -> list:
    """
    Translates one-letter coded amino acid sequences to three-letter code.

    Arguments:
    - amino_acid_seqs (list): list of strings with one-letter protein sequences

    Return:
    - list of strings with the three-letter coded sequences
    """
    return [
        "".join(AMINO_SHORT_NAMES_DIC[aa.upper()] for aa in seq)
        for seq in amino_acid_seqs
    ]


def get_amino_acid_sum(protein_sequences: list) -> list:
    """
    Counts the amount of each amino acid in the given protein sequences.

    Arguments:
    - protein_sequences (list): list of strings with one-letter protein sequences

    Return:
    - list of dictionaries, one per sequence, mapping every one-letter amino
      acid code (in alphabetical order) to its count; absent residues map to 0
    """
    result = []
    for protein_sequence in protein_sequences:
        amino_acid_count = {key: 0 for key in sorted(AMINO_SHORT_NAMES_DIC)}
        for amino_acid in protein_sequence:
            # Upper-case like the sibling procedures so direct calls with
            # lower-case input work too.
            amino_acid_count[amino_acid.upper()] += 1
        result.append(amino_acid_count)
    return result


def codon_optimization(protein_sequences: list, cell_type: str) -> list:
    """
    Makes codon-optimized DNA based on the given amino acid sequences for
    3 types of cells: Esherichia coli, Pichia pastoris, Mouse.

    Arguments:
    - protein_sequences (list): list of strings with one-letter protein sequences
    - cell_type (str): cell type for codon optimization; accepted values are
      the keys of _CELL_TYPE_TRIPLETS

    Return:
    - list of strings with the codon-optimized DNA of each sequence

    Raises:
    - ValueError: cell_type is not one of the supported values
    """
    triplets = _CELL_TYPE_TRIPLETS.get(cell_type)
    if triplets is None:
        raise ValueError(
            f"Type {cell_type} is not supported. "
            f"The following types of organisms are available for codon optimization: "
            f"Esherichia coli, Pichia pastoris, Mouse"
        )
    return [
        "".join(triplets[amino_acid.upper()] for amino_acid in sequence)
        for sequence in protein_sequences
    ]


def length(seqs: list) -> list:
    """
    Counts total length of amino acid sequences.

    Arguments:
    - seqs (list): list of strings with one-letter protein sequences

    Return:
    - list of int values corresponding to the length of the sequences
    """
    return [len(seq) for seq in seqs]


def name_transform(seqs: tuple, letter_format: int) -> list:
    """
    Validates the sequences given to protein_analysis and converts them to
    upper-case one-letter code.

    For letter_format 1 the sequences are upper-cased and every letter is
    checked. If every non-empty sequence also reads as a valid three-letter
    sequence, a warning is printed. For letter_format 3 the sequences are
    lower-cased, split into triplets and translated to one-letter code.

    Arguments:
    - seqs (tuple): tuple of strings with the protein sequences
    - letter_format (int): 1 for one-letter input, 3 for three-letter input

    Return:
    - list of strings with the transformed sequences

    Raises:
    - ValueError: a residue is not a standard amino acid, or letter_format
      is neither 1 nor 3
    """
    if letter_format == 1:
        result = []
        for seq in seqs:
            upper_seq = seq.upper()
            for letter in upper_seq:
                if not is_amino_acid(letter):
                    raise ValueError(
                        f"Error {letter} is not an amino acid. Correct your input"
                    )
            result.append(upper_seq)
        # Only non-empty sequences can "look like" three-letter input; without
        # this guard the warning fired vacuously for no / empty sequences.
        non_empty = [seq for seq in seqs if seq]
        if non_empty and all(
            is_length_divisible_by_3(seq) and is_amino_acid_three_letter(seq)
            for seq in non_empty
        ):
            print(
                "Warning: all your sequences are similar to three-letter ones. Check the letter_format value"
            )
        return result
    if letter_format == 3:
        result = []
        for seq in seqs:
            lower_seq = seq.lower()
            triplets = [lower_seq[i: i + 3] for i in range(0, len(lower_seq), 3)]
            one_letter = []
            for triplet in triplets:
                # A trailing chunk shorter than 3 letters is never a key here,
                # so incomplete input is rejected by the same check.
                if triplet not in AMINO_NAMES_DIC:
                    raise ValueError(
                        f"Error {triplet} is not an amino acid. Correct your input"
                    )
                one_letter.append(AMINO_NAMES_DIC[triplet])
            result.append("".join(one_letter))
        return result
    raise ValueError(
        "Error unsupported letter_format. Only letter_formats 1 and 3 are supported"
    )


def is_amino_acid(input_amino: str) -> bool:
    """
    Checks whether the string is a single amino acid code.

    The check is case-sensitive: one-letter codes must be upper-case (keys of
    AMINO_SHORT_NAMES_DIC) and three-letter codes must be lower-case (keys of
    AMINO_NAMES_DIC).

    Arguments:
    - input_amino (str): string corresponding to one amino acid

    Return:
    - bool: True if the string is a known one-letter or three-letter code,
      otherwise False (including any string whose length is not 1 or 3)
    """
    if len(input_amino) == 1:
        return input_amino in AMINO_SHORT_NAMES_DIC
    if len(input_amino) == 3:
        return input_amino in AMINO_NAMES_DIC
    return False


def brutto_count(seqs: list) -> list:
    """
    Calculates the brutto formula of the amino acid sequences.

    The counts are the plain sum of the AMINO_BRUTTO entries of the residues;
    no correction for water released in peptide bonds is applied.

    Arguments:
    - seqs (list): list of strings with one-letter protein sequences

    Return:
    - list of dictionaries, one per sequence, with the count of each element
      (C, H, N, O, S); an empty sequence yields all zeros

    Raises:
    - KeyError: a letter is not a standard one-letter amino acid code
    """
    result = []
    for seq in seqs:
        totals = [0] * len(_BRUTTO_ELEMENTS)
        for letter in seq:
            for index, atoms in enumerate(AMINO_BRUTTO[letter.upper()]):
                totals[index] += atoms
        result.append(dict(zip(_BRUTTO_ELEMENTS, totals)))
    return result


def is_length_divisible_by_3(seq: str) -> bool:
    """
    Checks if the length of the sequence is divisible by three.

    Arguments:
    - seq (str): string of protein sequence

    Return:
    - bool: True if the length is divisible by three, otherwise False
    """
    return len(seq) % 3 == 0


def is_amino_acid_three_letter(seq: str) -> bool:
    """
    Checks whether the sequence, split into consecutive chunks of three
    letters (case-insensitive), consists only of three-letter amino acid codes.

    Arguments:
    - seq (str): string of protein sequence

    Return:
    - bool: True if every chunk is a valid three-letter code (also True for
      an empty string), otherwise False
    """
    lower_seq = seq.lower()
    return all(
        lower_seq[i: i + 3] in AMINO_NAMES_DIC
        for i in range(0, len(lower_seq), 3)
    )