-
Notifications
You must be signed in to change notification settings - Fork 0
Completed HW5 #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| from Modules.filter_fastq import run_filter_fastq | ||
| from Modules.dna_rna_tools import run_dna_rna_tools | ||
| from Modules.aminoacids_tools import run_aminoacid_tools | ||
|
|
||
def run_BioSeqTools(tool_name, *args):
    """Dispatch a call to one of the bundled sequence tools.

    Arguments:
    - tool_name: name of the tool to run: "run_aminoacid_tools",
      "run_dna_rna_tools" or "run_filter_fastq"
    - *args: positional arguments forwarded to the selected tool; for
      "run_aminoacid_tools" the last one is passed as the ``operation``
      keyword and the preceding ones as sequences
    Return:
    - whatever the selected tool returns
    Raises:
    - ValueError: if tool_name is not one of the supported names, or if
      fewer than two arguments are given for "run_aminoacid_tools"
    """
    if tool_name == "run_dna_rna_tools":
        return run_dna_rna_tools(*args)
    if tool_name == "run_filter_fastq":
        return run_filter_fastq(*args)
    if tool_name != "run_aminoacid_tools":
        raise ValueError("Invalid tool_name")

    # At least one sequence plus the operation name are required.
    if len(args) < 2:
        raise ValueError("Not enough arguments for run_aminoacid_tools")
    *sequences, operation = args
    return run_aminoacid_tools(*sequences, operation=operation)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,224 @@ | ||
from collections import Counter
from typing import Dict, Optional
|
|
||
|
|
||
def _amino_acid_percentages(seq: str) -> Dict[str, float]:
    """Return the share of each amino acid in seq, in percent.

    Values are rounded to 2 decimals; keys follow the order in which the
    amino acids first occur in seq. An empty seq yields an empty dict.
    """
    total_amino_acids = len(seq)
    # Counter is empty for an empty sequence, so the division never runs
    # with a zero denominator.
    return {
        amino_acid: round((count / total_amino_acids) * 100, 2)
        for amino_acid, count in Counter(seq).items()
    }


def calculate_percentage(seq: str) -> str:
    """
    Calculates the percentage of amino acids in the entered amino acid
    sequence
    Arguments:
    - seq (str): amino acid sequence to be analyzed
    Return:
    - str: a string with the percentage of each amino acid; use
      _amino_acid_percentages to get the underlying dictionary
    """
    amino_acid_percentages = _amino_acid_percentages(seq)
    return f'Amino acids percentage of the sequence {seq}: {amino_acid_percentages}'
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Всё-таки тут лучше возвращать именно словарь.
||
|
|
||
|
|
||
def _molecular_weight(seq: str) -> float:
    """Return the unrounded molecular weight (Da) of an amino acid sequence.

    Raises KeyError if seq contains a symbol that has no entry in the
    residue weight table.
    """
    amino_acid_weights = {
        'G': 57.051, 'A': 71.078, 'S': 87.077, 'P': 97.115, 'V': 99.131,
        'T': 101.104, 'C': 103.143, 'I': 113.158, 'L': 113.158,
        'N': 114.103, 'D': 115.087, 'Q': 128.129, 'K': 128.172,
        'E': 129.114, 'M': 131.196, 'H': 137.139, 'F': 147.174,
        'R': 156.186, 'Y': 163.173, 'W': 186.210,
    }
    weight = 18.02  # for the H and OH at the termini
    # Plain accumulation keeps the floating-point result identical to a
    # left-to-right sum of the residue weights.
    for amino_acid in seq:
        weight += amino_acid_weights[amino_acid]
    return weight


def calculate_molecular_weight(seq: str) -> str:
    """
    Calculates the molecular weight of entered amino acid sequence
    Arguments:
    - seq (str): amino acid sequence to be analyzed
    Return:
    - str: a string with the molecular weight value (rounded to 2
      decimals) for the amino acid sequence; use _molecular_weight to get
      the number itself
    Raises:
    - KeyError: if seq contains a symbol missing from the weight table
    """
    weight = _molecular_weight(seq)
    return f'Molecular weight of the sequence {seq}: {round(weight, 2)} Da'
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Та же история: тут лучше возвращать число. Для отображения можно завести другую функцию.
||
|
|
||
|
|
||
def _hydrophobicity_score(sequence: str) -> float:
    """Return the sum of the per-residue scale values over sequence."""
    # Amino acid hydrophilicity/hydrophobicity scale by Eisenberg
    hydrophobicity_values = {
        'A': 0.5, 'R': 0.65, 'N': 1.0, 'D': 1.3, 'C': -0.15,
        'Q': 1.0, 'E': 1.5, 'G': 0.75, 'H': 0.7, 'I': -1.3,
        'L': -1.3, 'K': 0.75, 'M': -1.1, 'F': -1.9, 'P': 0.55,
        'S': 0.6, 'T': 0.3, 'W': -0.5, 'Y': -1.65, 'V': -0.9,
    }
    # Symbols that are not in the scale contribute 0 instead of raising,
    # so the function stays usable on unvalidated input.
    return sum(hydrophobicity_values.get(aa, 0) for aa in sequence)


def calculate_hydrophobicity_eisenberg(sequence: str) -> str:
    """
    Classifies an amino acid sequence by the sign of its summed scale values
    Arguments:
    - sequence (str): amino acid sequence to be analyzed
    Return:
    - str: "Sequence <sequence>: Hydrophilic" for a positive sum,
      "... Hydrophobic" for a negative sum and "... Neutral" for a sum of
      exactly zero (which includes the empty sequence); use
      _hydrophobicity_score to get the number itself
    """
    hydrophobicity_sum = _hydrophobicity_score(sequence)
    if hydrophobicity_sum > 0:
        verdict = "Hydrophilic"
    elif hydrophobicity_sum < 0:
        verdict = "Hydrophobic"
    else:
        verdict = "Neutral"
    return f"Sequence {sequence}: {verdict}"
|
Comment on lines
+65
to
+70
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Опять же история с отображением |
||
|
|
||
|
|
||
def _isoelectric_point(sequence: str) -> Optional[float]:
    """Return the averaged-pK estimate of the isoelectric point, or None.

    None is returned when sequence contains no symbol from the pK table
    (which includes the empty sequence).
    """
    # pK values per amino acid in the order (COO-, NH3+, R); the third value
    # is present only for amino acids with an ionizable side chain.
    # Source: http://www.sev-chem.narod.ru/spravochnik/piaminoacid.htm
    pK_values = {
        'A': (2.34, 9.60),
        'R': (2.17, 9.04, 12.48),
        'N': (2.02, 8.80),
        'D': (2.09, 9.82, 3.86),
        'C': (1.71, 8.33, 10.30),
        'Q': (2.17, 9.13),
        'E': (2.19, 9.76, 4.25),
        'G': (2.34, 9.60),
        'H': (1.82, 9.17, 6.00),
        'I': (2.32, 9.76),
        'L': (2.36, 9.60),
        'K': (2.18, 8.95, 10.5),
        'M': (2.28, 9.21),
        'F': (2.58, 9.24),
        'P': (2.00, 10.60),
        'S': (2.21, 9.15),
        'T': (2.63, 10.43),
        'W': (1.22, 9.39),
        'Y': (2.20, 9.11, 10.10),
        'V': (2.29, 9.72),
    }

    # Symbols without a table entry are ignored.
    known_pKs = [pK_values[aa] for aa in sequence if aa in pK_values]
    if not known_pKs:
        return None

    # Termini: second pK (NH3+) of the first known residue and first pK
    # (COO-) of the last known residue.
    total_pK = known_pKs[0][1] + known_pKs[-1][0]
    count = 2

    # Every side-chain pK (third value) along the sequence joins the average.
    for pK_list in known_pKs:
        if len(pK_list) >= 3:
            total_pK += pK_list[2]
            count += 1

    return total_pK / count


def calculate_pI(sequence: str) -> Optional[str]:
    """
    Estimates the isoelectric point of an amino acid sequence as the mean of
    the terminal pK values and all side-chain pK values
    Arguments:
    - sequence (str): amino acid sequence to be analyzed
    Return:
    - str: a string with the isoelectric point of the sequence, or None if
      the sequence contains no known amino acid; use _isoelectric_point to
      get the number itself
    """
    pI = _isoelectric_point(sequence)
    if pI is None:
        return None
    return f"Isoelectric point for the sequence {sequence}: {pI}"
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Проблема с отображением )) |
||
|
|
||
|
|
||
def find_cleavage_sites(seq: str, motif: list) -> list:
    """Find cleavage sites for motif-specific proteases.
    Arguments:
    - seq - string sequence to be analyzed
    - motif - subsequence to be found in a sequence. Subsequence is
    specified as list of lists.
    Each nested list means more than one possible aminoacid at a single
    position (checked by OR condition).
    Return:
    - list of cleavage sites coordinates (C-end aminoacid of *potentially*
    cleaved sequence); empty if the motif is empty or longer than seq
    """
    motif_length = len(motif)
    if motif_length == 0:
        return []

    cleavage_sites = []
    # Only start positions that leave room for the whole motif are tried, so
    # a partial match at the end of seq can never index past the string.
    for start in range(len(seq) - motif_length + 1):
        window = seq[start:start + motif_length]
        if all(char in allowed for char, allowed in zip(window, motif)):
            cleavage_sites.append(start + motif_length)
    return cleavage_sites
|
Comment on lines
+148
to
+164
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
|
|
||
# Recognition motifs of the supported proteases. Each motif is a list with
# one entry per position; an entry lists the residues accepted there.
motif_dict = {
    protease: [list(accepted) for accepted in positions]
    for protease, positions in (
        ('Caspase 3', ('D', 'M', 'Q', 'D')),
        ('Caspase 6', ('V', 'E', 'HI', 'D')),
        ('Caspase 7', ('D', 'E', 'V', 'D')),
        ('Enterokinase', ('DE', 'DE', 'DE', 'K')),
    )
}
|
|
||
|
|
||
def get_cleavage_sites(seq: str) -> str:
    """Return amount and coordinates of cleavage sites for proteases, specified in motif_dict.

    The first line of the result is the sequence itself; one line per
    protease follows, in motif_dict order.
    """
    report_lines = [f'{seq}\n']
    for protease, motif in motif_dict.items():
        positions = find_cleavage_sites(seq, motif)
        report_lines.append(
            f'{len(positions)} protease cleavage site(s) for {protease}: {positions}\n'
        )
    return ''.join(report_lines)
|
|
||
|
|
||
# One-letter codes of the amino acids accepted by is_peptide.
all_aminoacids = set('ARNDCHGQEILKMPSYTWFV')
|
|
||
|
|
||
def is_peptide(seq: str) -> bool:
    """Check that the incoming sequence consists only of symbols from all_aminoacids.

    Returns True on success and raises ValueError otherwise; it never
    returns False.
    """
    if not set(seq) <= all_aminoacids:
        raise ValueError(f'Incoming sequence {seq} is not a peptide')
    return True
|
Comment on lines
+190
to
+194
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
|
|
||
# Maps an operation name accepted by run_aminoacid_tools to the function that
# implements it. Insertion order matters: it is shown in the error message
# for an unsupported operation.
operation_dict = dict(
    get_cleavage_sites=get_cleavage_sites,
    calculate_molecular_weight=calculate_molecular_weight,
    calculate_percentage=calculate_percentage,
    calculate_pI=calculate_pI,
    calculate_hydrophobicity_eisenberg=calculate_hydrophobicity_eisenberg,
)
|
|
||
|
|
||
def run_aminoacid_tools(*seqs: str, operation: str) -> str:
    """Run AminoAcid Tools
    Arguments:
    - *seqs - one or more string sequences to be analyzed
    - operation - name of the action to be done with the sequence(s); must
    be a key of operation_dict
    Return:
    - string with the result of the operation for every sequence, each
    followed by an empty line
    Raises:
    - ValueError - if operation is empty or unsupported, or if any sequence
    fails the is_peptide check"""
    if operation == '':
        raise ValueError('Operation value is not specified')
    handler = operation_dict.get(operation)
    if handler is None:
        raise ValueError(f'Incorrect operation value\nSupported operations: {list(operation_dict.keys())}')

    # Validate every sequence first so nothing is computed for a bad batch.
    for candidate in seqs:
        is_peptide(candidate)

    return ''.join(handler(candidate) + '\n\n' for candidate in seqs)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
# Swaps T<->U (and t<->u); every other character is left untouched.
_TRANSCRIPTION_TABLE = str.maketrans('TtUu', 'UuTt')


def transcribe(dna_sequence: str) -> str:
    """Swap thymine and uracil in a nucleic acid sequence, preserving case.

    Arguments:
    - dna_sequence (str): sequence to convert
    Return:
    - str: the sequence with T replaced by U and U replaced by T (same for
      lowercase); all other characters are copied unchanged
    """
    return dna_sequence.translate(_TRANSCRIPTION_TABLE)
|
|
||
def reverse(sequence):
    """Return the sequence read from its last element to its first."""
    backwards = slice(None, None, -1)
    return sequence[backwards]
|
|
||
# A<->T and C<->G in both cases; characters outside the table (for example
# U) are passed through unchanged.
_COMPLEMENT_TABLE = str.maketrans('ATCGatcg', 'TAGCtagc')


def complement(sequence: str) -> str:
    """Return the complement of a nucleic acid sequence, preserving case.

    Arguments:
    - sequence (str): sequence to complement
    Return:
    - str: the sequence with A/T and C/G exchanged; any other character is
      copied unchanged
    """
    return sequence.translate(_COMPLEMENT_TABLE)
|
|
||
def reverse_complement(dna_sequence: str) -> str:
    """Return the reversed complement of a nucleic acid sequence.

    Built from complement and reverse so the base-pairing rules live in a
    single place.

    Arguments:
    - dna_sequence (str): sequence to process
    Return:
    - str: the complement of dna_sequence, read back to front
    """
    return reverse(complement(dna_sequence))
|
Comment on lines
+26
to
+32
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Есть же функции для reverse и complement.
||
|
|
||
def run_dna_rna_tools(*args):
    """Apply one action to one or more nucleic acid sequences.

    Arguments:
    - *args: the sequences followed by the action name as the last
      argument; the action is matched case-insensitively against
      'transcribe', 'reverse', 'complement' and 'reverse_complement'
    Return:
    - a message string if called without arguments
    - the single result if exactly one sequence was given, otherwise a list
      with one result per sequence; for a sequence with characters outside
      'ACGTU' (case-insensitive) or for an unknown action the result is a
      message string instead of a processed sequence
    """
    if not args:
        return "There are no function arguments"

    *sequences, action = args
    action = action.lower()
    handlers = {
        'transcribe': transcribe,
        'reverse': reverse,
        'complement': complement,
        'reverse_complement': reverse_complement,
    }
    handler = handlers.get(action)

    results = []
    for sequence in sequences:
        # Sequence validity is reported before an unknown action.
        if any(base not in 'ACGTU' for base in sequence.upper()):
            results.append(f"Invalid sequence: {sequence}")
        elif handler is None:
            results.append(f"Invalid action: {action}")
        else:
            results.append(handler(sequence))

    if len(results) == 1:
        return results[0]
    return results
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
def run_filter_fastq(seqs, gc_bounds=(0, 100), length_bounds=(0, 2**32), quality_threshold=0):
    """Filter reads by GC content, length and average quality.

    Arguments:
    - seqs: dict mapping a read name to a (sequence, quality) pair of strings
    - gc_bounds: (lower, upper) inclusive bounds for the GC content in
      percent; only uppercase 'G' and 'C' are counted
    - length_bounds: (lower, upper) inclusive bounds for the sequence length
    - quality_threshold: minimal average quality, where the score of a
      quality character is its code point minus 33
    Return:
    - dict with the same name -> (sequence, quality) entries for the reads
      that satisfy all three conditions
    """
    filtered_seqs = {}

    for name, (sequence, quality) in seqs.items():
        length = len(sequence)

        # An empty read has nothing to average: use 0 instead of dividing
        # by zero, and let the bounds decide whether it is kept.
        if length:
            gc_content = (sequence.count('G') + sequence.count('C')) / length * 100
        else:
            gc_content = 0.0

        if quality:
            avg_quality = sum(ord(q) - 33 for q in quality) / len(quality)
        else:
            avg_quality = 0.0

        # Filtering by user's conditions
        if (
            gc_bounds[0] <= gc_content <= gc_bounds[1] and
            length_bounds[0] <= length <= length_bounds[1] and
            avg_quality >= quality_threshold
        ):
            filtered_seqs[name] = (sequence, quality)

    return filtered_seqs
|
|


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Честно говоря, из названия не оч понятно, что за процент