Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions BioSeqTools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from Modules.filter_fastq import run_filter_fastq
from Modules.dna_rna_tools import run_dna_rna_tools
from Modules.aminoacids_tools import run_aminoacid_tools

def run_BioSeqTools(tool_name, *args):
    """Dispatch a call to one of the BioSeqTools entry points.

    Arguments:
    - tool_name (str): name of the tool to run; one of
      "run_aminoacid_tools", "run_dna_rna_tools" or "run_filter_fastq"
    - *args: positional arguments forwarded to the selected tool. For
      "run_aminoacid_tools" the last argument is passed as the
      ``operation`` keyword and all preceding ones as sequences.
    Return:
    - whatever the selected tool returns
    Raises:
    - ValueError: if tool_name is not supported, or if fewer than two
      arguments are supplied for "run_aminoacid_tools"
    """
    if tool_name == "run_aminoacid_tools":
        if len(args) < 2:
            raise ValueError("Not enough arguments for run_aminoacid_tools")
        # The operation name travels last; everything before it is a sequence.
        *sequences, operation = args
        return run_aminoacid_tools(*sequences, operation=operation)
    if tool_name == "run_dna_rna_tools":
        return run_dna_rna_tools(*args)
    if tool_name == "run_filter_fastq":
        return run_filter_fastq(*args)

    supported_tools = [
        "run_aminoacid_tools",
        "run_dna_rna_tools",
        "run_filter_fastq",
    ]
    # Name the offending value and the valid choices, as
    # run_aminoacid_tools does for unsupported operations.
    raise ValueError(
        f"Invalid tool_name: {tool_name!r}. Supported tools: {supported_tools}"
    )
Binary file added Modules/.DS_Store
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added Modules/__pycache__/dna_rna_tools.cpython-310.pyc
Binary file not shown.
Binary file added Modules/__pycache__/dna_rna_tools.cpython-311.pyc
Binary file not shown.
Binary file added Modules/__pycache__/filter_fastq.cpython-310.pyc
Binary file not shown.
Binary file added Modules/__pycache__/filter_fastq.cpython-311.pyc
Binary file not shown.
224 changes: 224 additions & 0 deletions Modules/aminoacids_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
from collections import Counter
from typing import Dict, Optional


def _amino_acid_percentages(seq: str) -> Dict[str, float]:
    """Return the share (in percent) of every residue occurring in seq.

    Keys appear in order of first occurrence in seq; values are rounded
    to two decimal places. An empty sequence yields an empty dict.
    """
    total_amino_acids = len(seq)
    return {
        amino_acid: round((count / total_amino_acids) * 100, 2)
        for amino_acid, count in Counter(seq).items()
    }


def calculate_percentage(seq: str) -> str:
    """
    Calculates the amino acid composition of the entered sequence, i.e.
    the percentage of the sequence length taken by each amino acid
    Arguments:
    - seq (str): amino acid sequence to be analyzed
    Return:
    - str: a string with the percentage of each amino acid
    """
    # The numeric result lives in _amino_acid_percentages so it can be
    # reused by other calculations; this function only formats it.
    amino_acid_percentages = _amino_acid_percentages(seq)
    return f'Amino acids percentage of the sequence {seq}: {amino_acid_percentages}'

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Всё-таки тут лучше возвращать именно словарь (amino_acid_percentages), т.к. он может понадобиться при расчете чего-то другого. Отображение можно сделать в другой функции, которая как раз принимала бы этот словарь



# Per-residue weights (Da) summed by calculate_molecular_weight.
_AMINO_ACID_WEIGHTS = {
    'G': 57.051, 'A': 71.078, 'S': 87.077, 'P': 97.115, 'V': 99.131,
    'T': 101.104, 'C': 103.143, 'I': 113.158, 'L': 113.158, 'N': 114.103,
    'D': 115.087, 'Q': 128.129, 'K': 128.172, 'E': 129.114, 'M': 131.196,
    'H': 137.139, 'F': 147.174, 'R': 156.186, 'Y': 163.173, 'W': 186.210,
}
# Added once per chain for the H and OH at the termini.
_TERMINI_WEIGHT = 18.02


def _molecular_weight(seq: str) -> float:
    """Return the unrounded molecular weight (Da) of seq.

    Raises KeyError if seq contains a character missing from the weight
    table.
    """
    weight = _TERMINI_WEIGHT
    for amino_acid in seq:
        weight += _AMINO_ACID_WEIGHTS[amino_acid]
    return weight


def calculate_molecular_weight(seq: str) -> str:
    """
    Calculates the molecular weight of entered amino acid sequence
    Arguments:
    - seq (str): amino acid sequence to be analyzed
    Return:
    - str: a string with the molecular weight value (rounded to two
    decimal places) for the amino acid sequence
    Raises:
    - KeyError: if seq contains a character that is not one of the 20
    one-letter amino acid codes in the weight table
    """
    # The number itself is produced by _molecular_weight so that it can be
    # reused; this function is only responsible for the presentation.
    return f'Molecular weight of the sequence {seq}: {round(_molecular_weight(seq), 2)} Da'

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Та же история, что тут лучше возвращать число. Для отображения можно завести другую функцию



# Amino acid hydrophilicity/hydrophobicity scale by Eisenberg, as used by
# calculate_hydrophobicity_eisenberg (positive sum -> "Hydrophilic").
_EISENBERG_SCALE = {
    'A': 0.5, 'R': 0.65, 'N': 1.0, 'D': 1.3, 'C': -0.15,
    'Q': 1.0, 'E': 1.5, 'G': 0.75, 'H': 0.7, 'I': -1.3,
    'L': -1.3, 'K': 0.75, 'M': -1.1, 'F': -1.9, 'P': 0.55,
    'S': 0.6, 'T': 0.3, 'W': -0.5, 'Y': -1.65, 'V': -0.9,
}


def _hydrophobicity_sum(sequence: str) -> float:
    """Return the sum of the scale values of all residues in sequence.

    Characters missing from the scale contribute 0 instead of raising.
    """
    return sum(_EISENBERG_SCALE.get(aa, 0) for aa in sequence)


def calculate_hydrophobicity_eisenberg(sequence: str) -> str:
    """
    Classifies the entered amino acid sequence as hydrophilic, hydrophobic
    or neutral by the sign of the summed per-residue scale values
    Arguments:
    - sequence (str): amino acid sequence to be analyzed
    Return:
    - str: a string with the sequence and its class: "Hydrophilic" for a
    positive sum, "Hydrophobic" for a negative sum, "Neutral" for zero
    """
    hydrophobicity_sum = _hydrophobicity_sum(sequence)

    # Determine hydrophilicity/hydrophobicity of sequence
    if hydrophobicity_sum > 0:
        return f"Sequence {sequence}: Hydrophilic"
    if hydrophobicity_sum < 0:
        return f"Sequence {sequence}: Hydrophobic"
    return f"Sequence {sequence}: Neutral"
Comment on lines +65 to +70

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Опять же история с отображением



def calculate_pI(sequence: str) -> Optional[str]:
    """
    Estimates the isoelectric point of the entered amino acid sequence as
    the mean of: the NH3+ pK of the first residue, the COO- pK of the last
    residue and the side-chain (R) pK of every residue that has one
    Arguments:
    - sequence (str): amino acid sequence to be analyzed; characters that
    are not in the pK table are ignored
    Return:
    - str: a string with the sequence and its pI rounded to two decimal
    places, or None if the sequence contains no known amino acid
    """
    # pK values as (COO-, NH3+[, R]); information taken from source
    # http://www.sev-chem.narod.ru/spravochnik/piaminoacid.htm
    pK_values = {
        'A': (2.34, 9.60),
        'R': (2.17, 9.04, 12.48),
        'N': (2.02, 8.80),
        'D': (2.09, 9.82, 3.86),
        'C': (1.71, 8.33, 10.30),
        'Q': (2.17, 9.13),
        'E': (2.19, 9.76, 4.25),
        'G': (2.34, 9.60),
        'H': (1.82, 9.17, 6.00),
        'I': (2.32, 9.76),
        'L': (2.36, 9.60),
        'K': (2.18, 8.95, 10.5),
        'M': (2.28, 9.21),
        'F': (2.58, 9.24),
        'P': (2.00, 10.60),
        'S': (2.21, 9.15),
        'T': (2.63, 10.43),
        'W': (1.22, 9.39),
        'Y': (2.20, 9.11, 10.10),
        'V': (2.29, 9.72),
    }

    # pK tuples of the recognised residues, in sequence order
    residue_pKs = [pK_values[aa] for aa in sequence if aa in pK_values]

    # If no amino acid sequence is specified - return None
    if not residue_pKs:
        return None

    # Leftmost residue contributes its NH3+ pK (second value), rightmost
    # residue its COO- pK (first value)
    total_pK = residue_pKs[0][1] + residue_pKs[-1][0]
    count = 2

    # Also add pK of AA radicals - present only in the 3-value entries
    for pK_list in residue_pKs:
        if len(pK_list) >= 3:
            total_pK += pK_list[2]
            count += 1

    # Rounded to two decimals like the other calculations of this module,
    # which also hides floating-point noise digits
    pI = round(total_pK / count, 2)
    return f"Isoelectric point for the sequence {sequence}: {pI}"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Проблема с отображением ))



def find_cleavage_sites(seq: str, motif: list) -> list:
    """Find cleavage sites for motif-specific proteases.
    Arguments:
    - seq - string sequence to be analyzed
    - motif - subsequence to be found in a sequence. Subsequence is
    specified as list of lists.
    Each nested list means more than one possible aminoacid at a single
    position (checked by OR condition).
    Return:
    - list of cleavage sites coordinates (C-end aminoacid of *potentially*
    cleaved sequence), i.e. match start index + motif length. Overlapping
    matches are all reported. An empty motif or a sequence shorter than
    the motif gives an empty list.
    """
    motif_length = len(motif)
    if motif_length == 0:
        return []

    cleavage_sites = []
    # Only start positions where the whole motif still fits are tried, so
    # a partial match at the end of seq can never index past its end.
    for start in range(len(seq) - motif_length + 1):
        window = seq[start:start + motif_length]
        if all(char in allowed for char, allowed in zip(window, motif)):
            cleavage_sites.append(start + motif_length)
    return cleavage_sites
Comment on lines +148 to +164

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В предыдущем ревью этого ДЗ писал, как лучше изменить :(
image



# Recognition motifs iterated by get_cleavage_sites: protease name ->
# motif in the format expected by find_cleavage_sites (one inner list per
# motif position, listing the amino acids accepted at that position).
motif_dict = {
'Caspase 3': [['D'], ['M'], ['Q'], ['D']],
'Caspase 6': [['V'], ['E'], ['H', 'I'], ['D']],
'Caspase 7': [['D'], ['E'], ['V'], ['D']],
'Enterokinase': [['D', 'E'], ['D', 'E'], ['D', 'E'], ['K']]
}


def get_cleavage_sites(seq: str) -> str:
    """Report cleavage sites of seq for every protease in motif_dict.

    Arguments:
    - seq - string sequence to be analyzed
    Return:
    - multi-line string: the sequence itself on the first line, then one
    line per protease with the number of sites and their coordinates;
    every line ends with a newline
    """
    report_lines = [seq]
    for protease, pattern in motif_dict.items():
        found = find_cleavage_sites(seq, pattern)
        report_lines.append(
            f'{len(found)} protease cleavage site(s) for {protease}: {found}'
        )
    return '\n'.join(report_lines) + '\n'


# One-letter codes accepted by is_peptide as valid sequence characters
# (20 upper-case letters; lower-case input is not covered by this set).
all_aminoacids = {
'A', 'R', 'N', 'D', 'C', 'H', 'G', 'Q', 'E', 'I',
'L', 'K', 'M', 'P', 'S', 'Y', 'T', 'W', 'F', 'V'
}


def is_peptide(seq: str) -> bool:
    """Validate that seq consists only of characters from all_aminoacids.

    Arguments:
    - seq - string sequence to be checked
    Return:
    - True when every character of seq is in all_aminoacids
    Raises:
    - ValueError: when seq contains any other character
    """
    if not set(seq) <= all_aminoacids:
        raise ValueError(f'Incoming sequence {seq} is not a peptide')
    return True
Comment on lines +190 to +194

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Из предыдущего ревью:
image



# Operation names accepted by run_aminoacid_tools, mapped to the
# single-sequence functions that implement them.
operation_dict = {
'get_cleavage_sites': get_cleavage_sites,
'calculate_molecular_weight': calculate_molecular_weight,
'calculate_percentage': calculate_percentage,
'calculate_pI': calculate_pI,
'calculate_hydrophobicity_eisenberg':
calculate_hydrophobicity_eisenberg
}


def run_aminoacid_tools(*seqs: str, operation: str) -> str:
    """Apply one AminoAcid Tools operation to every given sequence.

    Arguments:
    - *seqs - one or more string sequences to be analyzed
    - operation - name of the action (a key of operation_dict) to be done
    with the sequence(s)
    Return:
    - string with the result of the operation for each sequence, each
    result followed by an empty line
    Raises:
    - ValueError: if operation is empty or unknown, or (via is_peptide)
    if any sequence is not a peptide
    """
    if operation == '':
        raise ValueError('Operation value is not specified')
    if operation not in operation_dict:
        raise ValueError(f'Incorrect operation value\nSupported operations: {list(operation_dict.keys())}')

    # Validate everything up front so no partial output is built for a
    # batch that contains an invalid sequence.
    for candidate in seqs:
        is_peptide(candidate)

    handler = operation_dict[operation]
    return ''.join(handler(peptide) + '\n\n' for peptide in seqs)
53 changes: 53 additions & 0 deletions Modules/dna_rna_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# T <-> U swap in both directions, preserving letter case.
_TRANSCRIPTION_TABLE = str.maketrans('TtUu', 'UuTt')


def transcribe(dna_sequence: str) -> str:
    """Swap thymine and uracil in a nucleic acid sequence.

    T/t become U/u (DNA -> RNA) and U/u become T/t (RNA -> DNA); every
    other character is returned unchanged and letter case is preserved.

    Arguments:
    - dna_sequence (str): sequence to convert
    Return:
    - str: converted sequence of the same length
    """
    # One C-level pass instead of per-character string concatenation.
    return dna_sequence.translate(_TRANSCRIPTION_TABLE)

def reverse(sequence):
    """Return sequence with its elements in reverse order."""
    backwards = slice(None, None, -1)
    return sequence[backwards]

# Complement tables, case-preserving. The RNA table is used when the
# sequence contains U/u, so that A pairs with U instead of T.
_DNA_COMPLEMENT_TABLE = str.maketrans('ATCGatcg', 'TAGCtagc')
_RNA_COMPLEMENT_TABLE = str.maketrans('AUTCGautcg', 'UAAGCuaagc')


def complement(sequence: str) -> str:
    """Return the complementary strand of a DNA or RNA sequence.

    A sequence containing U/u is treated as RNA (A<->U, C<->G); any other
    sequence is treated as DNA (A<->T, C<->G). Letter case is preserved
    and characters outside the table are returned unchanged.

    Arguments:
    - sequence (str): nucleic acid sequence
    Return:
    - str: complementary sequence, same length and orientation
    """
    is_rna = 'U' in sequence or 'u' in sequence
    table = _RNA_COMPLEMENT_TABLE if is_rna else _DNA_COMPLEMENT_TABLE
    return sequence.translate(table)


def reverse_complement(dna_sequence: str) -> str:
    """Return the complement of the sequence, read in reverse order.

    Arguments:
    - dna_sequence (str): nucleic acid sequence
    Return:
    - str: reverse-complementary sequence
    """
    # Delegates to complement() so both functions share one mapping.
    return complement(dna_sequence)[::-1]
Comment on lines +26 to +32

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Есть же функции для reverse и complement


def run_dna_rna_tools(*args):
    """Apply one nucleic-acid operation to each given sequence.

    Arguments:
    - *args: sequences followed by the action name as the last argument.
      The action is matched case-insensitively against 'transcribe',
      'reverse', 'complement' and 'reverse_complement'.
    Return:
    - the string "There are no function arguments" when called without
      arguments;
    - otherwise one entry per sequence: the operation result, or
      "Invalid sequence: ..." when the sequence has characters other than
      A, C, G, T, U (any case), or "Invalid action: ..." for an unknown
      action. A single entry is returned bare, several (or none) as a list.
    """
    if len(args) == 0:
        return "There are no function arguments"

    *sequences, raw_action = args
    action = raw_action.lower()

    def process(sequence):
        # The alphabet check comes first, so an invalid sequence is
        # reported even if the action is unknown as well.
        if any(base not in 'ACGTU' for base in sequence.upper()):
            return f"Invalid sequence: {sequence}"
        if action == 'transcribe':
            return transcribe(sequence)
        if action == 'reverse':
            return reverse(sequence)
        if action == 'complement':
            return complement(sequence)
        if action == 'reverse_complement':
            return reverse_complement(sequence)
        return f"Invalid action: {action}"

    results = [process(sequence) for sequence in sequences]
    if len(results) == 1:
        return results[0]
    return results
20 changes: 20 additions & 0 deletions Modules/filter_fastq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
def _normalize_bounds(bounds):
    """Turn a single number into (0, number); leave a pair untouched."""
    if isinstance(bounds, (int, float)):
        return (0, bounds)
    return bounds


def run_filter_fastq(seqs, gc_bounds=(0, 100), length_bounds=(0, 2**32), quality_threshold=0):
    """Filter reads by GC content, length and mean quality.

    Arguments:
    - seqs (dict): read name -> (sequence, quality string)
    - gc_bounds: inclusive (lower, upper) GC-content limits in percent,
      or a single number meaning the upper limit with lower limit 0
    - length_bounds: inclusive (lower, upper) read-length limits, or a
      single number meaning the upper limit with lower limit 0
    - quality_threshold: minimal mean quality; the score of a character
      is its code point minus 33
    Return:
    - dict with the same name -> (sequence, quality) entries for the reads
      that satisfy all three conditions
    """
    gc_bounds = _normalize_bounds(gc_bounds)
    length_bounds = _normalize_bounds(length_bounds)
    filtered_seqs = {}

    for name, (sequence, quality) in seqs.items():
        # GC content is counted case-insensitively; an empty read counts
        # as 0 % instead of dividing by zero.
        upper_sequence = sequence.upper()
        if sequence:
            gc_count = upper_sequence.count('G') + upper_sequence.count('C')
            gc_content = gc_count / len(sequence) * 100
        else:
            gc_content = 0.0

        # Mean quality; an empty quality string counts as 0.
        if quality:
            avg_quality = sum(ord(q) - 33 for q in quality) / len(quality)
        else:
            avg_quality = 0.0

        # Filtering by user's conditions
        if (
            gc_bounds[0] <= gc_content <= gc_bounds[1]
            and length_bounds[0] <= len(sequence) <= length_bounds[1]
            and avg_quality >= quality_threshold
        ):
            filtered_seqs[name] = (sequence, quality)

    return filtered_seqs

Loading