diff --git a/HW4_Trofimov/README.md b/HW4_Trofimov/README.md
new file mode 100644
index 0000000..56c6659
--- /dev/null
+++ b/HW4_Trofimov/README.md
@@ -0,0 +1,69 @@
+
+
+# Das biotools.aminoacids
+> **The great and terrifying successor of biopython**
+
+Das biotools strikes again! Now it works only with aminoacid sequences!
+
+## Features
+
+- **get_pI()**: Gives isoelectric point value for each aminoacid individually.
+
+- **build_scoring_matri()**: Auxiliary function for needleman_wunsch. Build a scoring matrix for amino acid pairs, which can be used in sequence alignment algorithms.
+
+- **needleman_wunsch()**: Implement the Needleman-Wunsch algorithm for global sequence alignment of two amino acid sequences.
+
+- **convert_to_3L_code()**: Converts one letter animoacid sequence to three letter aminoacid sequence.
+
+- **protein_mass()**: Calculates molecular weight of the aminoacid sequence using monoisotopic masses.
+
+- **translate_protein_rna()**: Converts aminoacid sequence to RNA sequence. For those aminoacids that are coded with more than one codon, this function randomly chooses one codon from the set.
+
+- **calculate_aa_freq()**: Calculate the frequences of aminoacids in protein sequences.
+
+## Examples
+
+- **get_pI**
+
+```python
+calculate_pI('RAHP') -> "Sequence: RAHP. Isoelectric point of each aminoacid: [('R', 10.8), ('A', 6.0), ('H', 7.6), ('P', 6.3)]"
+```
+
+- **needleman_wunsch**
+
+```python
+needleman_wunsch('raHP','RAQQHP') -> 'ra--HP, RAQQHP, final score: 2'
+```
+
+- **convert_to_3L_code**
+
+```python
+convert_to_3L_code('ACDEF') -> 'Ala-Cys-Asp-Glu-Phe'
+```
+
+- **protein_mass**
+
+```python
+protein_mass('ACDEF') -> 565.184
+```
+
+- **translate_protein_rna**
+
+```python
+translate_protein_rna('ACDEF') -> 'GCCUGCGACGAGUUC'
+```
+
+- **calculate_aa_freq**
+
+```python
+calculate_aa_freq('ACDEF') -> {'A': 1, 'C': 1, 'D': 1, 'E': 1, 'F': 1}
+```
+
+## OUR TEAM
+
+
+Up to bottom, left to right:
+- Alisa Fedorenko: functions **main**, **calculate_aa_freq**
+- Michil Trofimov: functions **get_pI**, **needleman_wunsch** (teamlead)
+- Shakir Suleimanov: functions **convert_to_3L_code**, **protein_mass**, **translate_protein_rna**
+
diff --git a/HW4_Trofimov/das_protein_tools.py b/HW4_Trofimov/das_protein_tools.py
new file mode 100644
index 0000000..3ae64ed
--- /dev/null
+++ b/HW4_Trofimov/das_protein_tools.py
@@ -0,0 +1,361 @@
+# importing necessary modules
+import protein_dict as pd
+from random import choice
+
+
+AMINO_LETTERS = [
+ "A",
+ "C",
+ "D",
+ "E",
+ "F",
+ "G",
+ "H",
+ "I",
+ "K",
+ "L",
+ "M",
+ "N",
+ "P",
+ "Q",
+ "R",
+ "S",
+ "T",
+ "V",
+ "W",
+ "Y",
+]
+
+
+# Function to determine is the sequence is a protein or not
+def is_protein(seq: str) -> bool:
+ """
+ This function checks if the sequence is a protein or not
+
+ Arguments:
+ seq (str): A sequence of aminoacids
+
+ Output:
+ returns True or False
+ """
+ unique_chars = set(seq)
+ aminoacids = set(pd.aa_monoistopic_mass_dict.keys())
+ return bool(unique_chars <= aminoacids)
+
+
+# Function to get pI for each aa
+def get_pI(
+ sequence: str,
+ pI_values: dict = None,
+) -> str:
+ """
+ Gives isoelectric point value for each aminoacid individually
+
+ Args:
+ - sequence (str): sequence for which to calculate isoelectric point
+ - pI_values (dict): acid dissociation constants for each aminoacid
+
+ Return:
+ - str: string, which contains:
+ - an original sequence,
+ - list of tuple pairs of aminoacid and corresponding isoelectric point,
+ """
+
+ if pI_values is None:
+ # Default pKa_values if not provided
+ pI_values = pd.aa_pI
+
+ aminoacid_pIs = []
+
+ # Calculate pI for each amino acid in the sequence while preserving case
+ analysed_aa = []
+ for aa in sequence:
+ aa_upper = aa.upper()
+ if aa_upper not in analysed_aa:
+ if aa_upper in pI_values:
+ pI = pI_values[aa_upper]
+ analysed_aa.append(aa_upper)
+ if aa.isupper():
+ aminoacid_pIs.append((aa_upper, pI))
+ else:
+ aminoacid_pIs.append((aa, pI))
+ else:
+ continue
+
+ return f"Sequence: {sequence}. Isoelectric point of each aminoacid: {aminoacid_pIs}"
+
+
+# Function to build scoring matrix for needleman_wunsch function
+def build_scoring_matrix(
+ match_score: int,
+ mismatch_score: int,
+ amino_acid_alphabet: str = None,
+) -> dict:
+ """
+ Build a default scoring matrix, if not provided in needleman-wunsch function parameter
+
+ Args:
+ - match_score (int): integer value of a matching score of aminoacids
+ - mismatch_score (int): integer value of a mismatching score of aminoacids
+ - amino_acid_alphabet (str): upper case amino acid alphabet
+
+ Returns:
+ - a dictionary of dictionaries representing a scoring matrix for aminoacids paris. Key of a dictionary is an aminoacid and its value is a dictionary of scores
+ """
+
+ if amino_acid_alphabet is None:
+ # Default pKa_values if not provided
+ amino_acid_alphabet = "ACDEFGHIKLMNPQRSTVWY"
+
+ scoring_matrix = {}
+
+ for aa1 in amino_acid_alphabet:
+ scoring_matrix[aa1] = {}
+ for aa2 in amino_acid_alphabet:
+ scoring_matrix[aa1][aa2] = (
+ match_score if aa1.upper() == aa2.upper() else mismatch_score
+ )
+
+ return scoring_matrix
+
+
+# Function to perform alignment based on needleman_wunsch algorithm
+def needleman_wunsch(
+ seq1: str,
+ seq2: str,
+ scoring_matrix: dict = None,
+ gap_penalty: int = -1,
+ match_score: int = 1,
+ mismatch_score: int = -1,
+) -> str:
+ """
+ Uses Needleman-Wunsch algorithm to make a global alignment of two sequences.
+
+ Args:
+ - seq1 (str): first aminoacid sequence for alignment
+ - seq2 (str): second aminoacid sequence for alignment
+ - scoring_matrix (dict): A dictionary representing a scoring matrix for amino acid pairs
+ If not provided, a default scoring_matrix is generated based on match and mismatch scores
+ - gap_penalty (int): integer va;ue of a penalty score for introducing a gap in the alignment
+ - match_score (int): integer value of a matching score for matching aminoacids
+ - mismatch_score (int): integer value of a mismatching score for mismatched aminoacids
+
+ Returns:
+ - string: a string containing the aligned sequences (str), the aligned score (int)
+ """
+ if scoring_matrix is None:
+ # Default scoring matrix if not provided
+ scoring_matrix = build_scoring_matrix(match_score, mismatch_score)
+
+ seq1_upper = seq1.upper() # Convert seq1 to uppercase
+ seq2_upper = seq2.upper() # Convert seq2 to uppercase
+
+ m, n = len(seq1_upper), len(seq2_upper)
+
+ # Initialize matrices
+ dp = [[0] * (n + 1) for _ in range(m + 1)]
+ traceback = [[""] * (n + 1) for _ in range(m + 1)]
+
+ # Fill in the scoring matrix and traceback matrix
+ for i in range(1, m + 1):
+ for j in range(1, n + 1):
+ match = dp[i - 1][j - 1] + scoring_matrix.get(seq1_upper[i - 1], {}).get(
+ seq2_upper[j - 1], mismatch_score
+ )
+ delete = dp[i - 1][j] + gap_penalty
+ insert = dp[i][j - 1] + gap_penalty
+
+ dp[i][j] = max(match, delete, insert)
+
+ if dp[i][j] == match:
+ traceback[i][j] = "D" # Diagonal (indicates a match/mismatch)
+ elif dp[i][j] == delete:
+ traceback[i][j] = "U" # Up (indicates a gap in seq2)
+ else:
+ traceback[i][j] = "L" # Left (indicates a gap in seq1)
+
+ # Traceback to find the aligned sequences while preserving case
+ aligned_seq1, aligned_seq2 = [], []
+ i, j = m, n
+ while i > 0 or j > 0:
+ if i > 0 and j > 0 and traceback[i][j] == "D":
+ # check original case of amionacid in seq1
+ if seq1[i - 1].isupper():
+ aligned_seq1.append(seq1_upper[i - 1])
+ else:
+ aligned_seq1.append(seq1[i - 1])
+
+ # check original case of amionacid in seq2
+ if seq2[j - 1].isupper():
+ aligned_seq2.append(seq2_upper[j - 1])
+ else:
+ aligned_seq2.append(seq2[j - 1])
+
+ i -= 1
+ j -= 1
+ elif i > 0 and traceback[i][j] == "U":
+ # check original case of amionacid in seq1
+ if seq1[i - 1].isupper():
+ aligned_seq1.append(seq1_upper[i - 1])
+ else:
+ aligned_seq1.append(seq1[i - 1])
+ aligned_seq2.append("-")
+
+ i -= 1
+ else:
+ aligned_seq1.append("-")
+ # check original case of amionacid in seq2
+ if seq2[j - 1].isupper():
+ aligned_seq2.append(seq2_upper[j - 1])
+ else:
+ aligned_seq2.append(seq2[j - 1])
+
+ j -= 1
+
+ aligned_seq1 = "".join(reversed(aligned_seq1))
+ aligned_seq2 = "".join(reversed(aligned_seq2))
+
+ return f"{aligned_seq1}, {aligned_seq2}, final score: {dp[m][n]}"
+
+
+# Function to calculate frequency of unique aminoacid in the sequence
+def calculate_aa_freq(sequences: str) -> dict:
+ """
+ Calculates the frequency of each amino acid in a protein sequence or sequences.
+
+ :param sequences: protein sequence or sequences
+ :type sequences: str or list of str
+ :return: dictionary with the frequency of each amino acid
+ :rtype: dict
+ """
+
+ # Creating a dictionary with aminoacid frequencies:
+ amino_acid_frequency = {}
+
+ for amino_acid in sequences:
+ # If the aminoacid has been already in:
+ if amino_acid in amino_acid_frequency:
+ amino_acid_frequency[amino_acid] += 1
+ # If the aminoacid hasn't been already in:
+ else:
+ amino_acid_frequency[amino_acid] = 1
+
+ return amino_acid_frequency
+
+
+# Function to convert one-letter protein sequence to three-letter protein sequence
+def convert_to_3L_code(seq: str) -> str:
+ """
+ This function takes one letter aminoacids sequence and convert's it to three leter coding
+
+ Arguments:
+ seq (str): A sequence of aminoacids
+
+ Output:
+ same sequence but in three-letter coding
+ """
+ seq = seq.upper()
+ if is_protein(seq) is True:
+ sequence = "".join(pd.aa_one_to_three_letter.get(aa) for aa in seq)
+ return sequence[:-1]
+ else:
+ raise ValueError("Sequence is not a protein, input should be protein")
+
+
+# Function to calculate protein mass
+def protein_mass(seq: str) -> float:
+ """
+ This function takes aminoacids sequence and counts it's summary molecular weight using monoisotopic masses
+
+ Arguments:
+ seq (str): A sequence of aminoacids
+
+ Output:
+ returns molecular weight
+ """
+ seq = seq.upper()
+ if is_protein(seq) is True:
+ mass = sum(pd.aa_monoistopic_mass_dict.get(aa) for aa in seq)
+ return mass
+ else:
+ raise ValueError("Sequence is not a protein, input should be protein")
+
+
+# Function to translate Protein to RNA
+def translate_protein_rna(seq: str) -> str:
+ """
+ This function takes aminoacid sequence and translates in to the RNA.
+ As most of the aminoacids are coded with several different codons,
+ this function will take a random codon of the set for such aminoacids.
+
+ Arguments:
+ seq (str): A sequence of RNA molecule
+
+ Output:
+ returns sequence of aminoacids
+ """
+ seq = seq.upper()
+ if is_protein(seq) is True:
+ rna = ""
+ for aa in seq:
+ codon = choice(pd.aa_codon_dict.get(aa))
+ rna += codon
+ return rna
+ else:
+ raise ValueError("Sequence is not a protein, input should be a protein")
+
+
+def main(*args: str):
+ """
+ Main function to perform various actions on protein sequences.
+
+ Args:
+ - *args: Variable number of arguments. The first n-1 arguments should be protein sequences,
+ and the last argument should be a string specifying the action to be performed.
+
+ Returns:
+ - The result of the specified action on the input protein sequences.
+
+ Raises:
+ - ValueError: If the specified action is not supported or if there is an error in the number of sequences.
+ Also raised if the input sequences are not valid protein sequences.
+
+ Supported Actions:
+ - "get_pI": Calculate isoelectric points for each amino acid in the sequence.
+ - "needleman_wunsch": Perform global alignment of two sequences using the Needleman-Wunsch algorithm.
+ - "build_scoring_matrix": Build a scoring matrix for amino acid pairs.
+ - "calculate_aa_freq": Calculate the frequency of each amino acid in a protein sequence.
+ - "translate_protein_rna": Translate amino acid sequence to RNA, using random codons for each amino acid.
+ - "convert_to_3L_code": Convert one-letter amino acid sequence to three-letter coding.
+ - "protein_mass": Calculate the molecular weight of the protein sequence.
+ """
+
+ action = args[-1]
+ action_list = {
+ "get_pI": get_pI,
+ "needleman_wunsch": needleman_wunsch,
+ "build_scoring_matrix": build_scoring_matrix,
+ "calculate_aa_freq": calculate_aa_freq,
+ "translate_protein_rna": translate_protein_rna,
+ "convert_to_3L_code": convert_to_3L_code,
+ "protein_mass": protein_mass,
+ }
+
+ if action not in action_list:
+ raise ValueError(f"No such action: {action}")
+
+ if not (
+ action == "needleman_wunsch"
+ and len(args) == 3
+ or action != "needleman_wunsch"
+ and len(args) == 2
+ ):
+ raise ValueError("Error in number of sequences")
+
+ for sequence in args[:-1]:
+ if not all([letter.capitalize() in AMINO_LETTERS for letter in sequence]):
+ raise ValueError(f"The sequence is not protein sequence: {sequence}")
+
+ result = action_list[action](*args[:-1])
+
+ return result
diff --git a/HW4_Trofimov/protein_dict.py b/HW4_Trofimov/protein_dict.py
new file mode 100644
index 0000000..a7e05c2
--- /dev/null
+++ b/HW4_Trofimov/protein_dict.py
@@ -0,0 +1,97 @@
+# Dictionary: keys - single-letter amino acid designations; values - list of RNA codons
+aa_codon_dict = {
+ "G": ["GGA", "GGU", "GGC", "GGG"],
+ "R": ["AGA", "AGG", "CGA", "CGC", "CGG", "CGU"],
+ "S": ["AGC", "AGU", "UCA", "UCC", "UCG", "UCU"],
+ "E": ["GAA", "GAG"],
+ "P": ["CCA", "CCC", "CCG", "CCU"],
+ "L": ["CUA", "CUC", "CUG", "CUU", "UUA", "UUG"],
+ "V": ["GUA", "GUC", "GUG", "GUU"],
+ "T": ["ACA", "ACC", "ACG", "ACU"],
+ "A": ["GCA", "GCC", "GCG", "GCU"],
+ "I": ["AUA", "AUC", "AUU"],
+ "F": ["UUC", "UUU"],
+ "H": ["CAC", "CAU"],
+ "Y": ["UAC", "UAU"],
+ "Q": ["CAA", "CAG"],
+ "C": ["UGC", "UGU"],
+ "N": ["AAC", "AAU"],
+ "D": ["GAC", "GAU"],
+ "K": ["AAA", "AAG"],
+ "M": ["AUG"],
+ "W": ["UGG"],
+}
+
+
+# Dictionary: keys - single-letter amino acid designations; values - names of amino acids
+aa_one_to_three_letter = {
+ "A": "Ala-",
+ "C": "Cys-",
+ "D": "Asp-",
+ "E": "Glu-",
+ "F": "Phe-",
+ "G": "Gly-",
+ "H": "His-",
+ "I": "Ile-",
+ "K": "Lys-",
+ "L": "Leu-",
+ "M": "Met-",
+ "N": "Asn-",
+ "P": "Pro-",
+ "Q": "Gln-",
+ "R": "Arg-",
+ "S": "Ser-",
+ "T": "Thr-",
+ "V": "Val-",
+ "W": "Trp-",
+ "Y": "Tyr-",
+}
+
+
+# aminoacids mass dictionary
+aa_monoistopic_mass_dict = {
+ "A": 71.03711,
+ "C": 103.00919,
+ "D": 115.02694,
+ "E": 129.04259,
+ "F": 147.06841,
+ "G": 57.02146,
+ "H": 137.05891,
+ "I": 113.08406,
+ "K": 128.09496,
+ "L": 113.08406,
+ "M": 131.04049,
+ "N": 114.04293,
+ "P": 97.05276,
+ "Q": 128.05858,
+ "R": 156.10111,
+ "S": 87.03203,
+ "T": 101.04768,
+ "V": 99.06841,
+ "W": 186.07931,
+ "Y": 163.06333,
+}
+
+# aminoacids pI (isoelectric point) values dictionary
+aa_pI = {
+ "A": 6.0, # Alanine
+ "R": 10.8, # Arginine
+ "N": 5.4, # Asparagine
+ "D": 2.8, # Aspartic Acid
+ "C": 5.0, # Cysteine
+ "E": 3.2, # Glutamic Acid
+ "Q": 5.7, # Glutamine
+ "G": 6.1, # Glycine
+ "H": 7.6, # Histidine
+ "I": 6.0, # Isoleucine
+ "L": 6.0, # Leucine
+ "K": 9.7, # Lysine
+ "M": 5.7, # Methionine
+ "F": 5.5, # Phenylalanine
+ "P": 6.3, # Proline
+ "S": 5.7, # Serine
+ "T": 5.6, # Threonine
+ "W": 5.9, # Tryptophan
+ "Y": 5.7, # Tyrosine
+ "V": 6.0, # Valine
+}