Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions CEP170B.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import os
from abc import ABC, abstractmethod
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
from typing import Tuple

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Насколько мне известно, после python 3.9 не рекомендуется использовать Tuple, хотя работе кода это не мешает.



def filter_fastq(input_path: str,
output_filename: str = None,
gc_bounds: Tuple[int, int] = (0, 100),
length_bounds: Tuple[int, int] = (0, 2**32),
Comment on lines +8 to +11

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Есть некоторое нарушение pep8: не следует оставлять пробелы на концах строк.

quality_threshold: int = 0) -> None:
'''
Filter FASTQ-sequences based on entered requirements.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Следует убрать \t.

Arguments:
- input_path (str): path to the file with FASTQ-sequences
- output_filename (str): name of the output file with
filtered FASTQ-sequences
- gc_bounds (tuple or int, default = (0, 100)): GC-content
interval (percentage) for filtering. Tuple if contains
lower and upper bounds, int if only contains an upper bound.
- length_bounds (tuple or int, default = (0, 2**32)): length
interval for filtering. Tuple if contains lower and upper
bounds, int if only contains an upper bound.
- quality_threshold (int, default = 0): threshold value of average
read quality for filtering.

Note: the output file is saved to the /fastq_filtrator_results
directory. The default output file name is the name of the input file.
Comment on lines +16 to +30
Copy link

@zmitserbio zmitserbio Mar 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В некоторых строках имеются пробелы на концах.
Гораздо более существенно то, что функция не соответствует заявленному функционалу. Если подать int аргументам gc_bounds и length_bounds, то падает с ошибкой, т.к. не реализован перевод этого int в соответствующий tuple. Это можно было реализовать, например, так:

if isinstance(gc_bounds, int):
    gc_bounds = tuple([0, gc_bounds])
if isinstance(length_bounds, int):
    length_bounds = tuple([0, length_bounds])

Возможно, в будущем имеет смысл писать себе pass или комментарии, чтобы не забывать.

'''
with open(input_path, "r") as handle:
records = [record for record in SeqIO.parse(handle, "fastq")]

filtered_records = []
for record in records:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Здесь была возможность использовать SeqIO:

Suggested change
for record in records:
for i, record in enumerate(SeqIO.parse(input_path, "fastq")):

gc_content = gc_fraction(record.seq) * 100
seq_length = len(record.seq)
avg_quality = sum(record.letter_annotations["phred_quality"]) / seq_length

if (gc_bounds[0] <= gc_content <= gc_bounds[1] and
length_bounds[0] <= seq_length <= length_bounds[1] and
avg_quality >= quality_threshold):
filtered_records.append(record)

if not os.path.exists("fastq_filtrator_results"):
os.makedirs("fastq_filtrator_results")

if output_filename is None:
output_filename = input_path.split("/")[-1].split(".")[0] + "_filtered.fastq"

output_path = "fastq_filtrator_results/" + output_filename

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Отличный вариант с добавлением папки для результатов. Но можно проверить ее существование перед созданием.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Это очень очень хорошее замечание. Папку при чем можно было бы делать где нибудь отдельно в начале специальной функцией для создания папок

SeqIO.write(filtered_records, output_path, "fastq")

print(f"Filtered sequences saved to {output_path}")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Небольшое дополнение, но помогает сориентироваться, куда записался файл, спасибо.



class BiologicalSequence(ABC):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Насколько мне известно, классы более принято располагать перед функциями.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Все так

def __init__(self, seq: str = None):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хотя это и не запрещено, в лекции/консультации упоминалось, что лучше не писать код в абстрактном классе.

self.seq = seq

@abstractmethod
def __len__(self):
pass

@abstractmethod
def __getitem__(self, index):
pass

@abstractmethod
def __repr__(self):
pass

@abstractmethod
def check_alphabet(self):
pass


class NucleicAcidSequence(BiologicalSequence):
def __len__(self):
return len(self.seq)

def __getitem__(self, index):
return self.seq[index]

def __repr__(self):
return self.seq

def check_alphabet(self):
return set(self.seq.upper()).issubset(self.ALPHABET)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Считаю это достаточно изящным решением.


def complement(self):
comp_seq = self.seq.translate(str.maketrans(self.COMPLEMENT_MAP))
return type(self)(comp_seq)

def reverse_complement(self):
reverse_seq = self.complement().seq[::-1]
return type(self)(reverse_seq)

def gc_content(self):
gc_count = self.seq.count('G') + self.seq.count('C')
return gc_count / len(self.seq) * 100

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Можно учесть деление на 0.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Верно подмечено



class DNASequence(NucleicAcidSequence):
ALPHABET = {'A', 'T', 'G', 'C'}
COMPLEMENT_MAP = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
'a': 't', 't': 'a', 'g': 'c', 'c': 'g'}

def transcribe(self):
return RNASequence(self.seq.replace('T', 'U'))


class RNASequence(NucleicAcidSequence):
ALPHABET = {'A', 'G', 'C', 'U'}
COMPLEMENT_MAP = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G',
'a': 'u', 'u': 'a', 'g': 'c', 'c': 'g'}


class AminoAcidSequence(BiologicalSequence):
ALPHABET = {"A", "C", "D", "E", "F", "G", "H", "I","K", "L",
"M", "N","P", "Q", "R", "S", "T", "V", "W", "Y"}
Comment on lines +121 to +122

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ALPHABET = {"A", "C", "D", "E", "F", "G", "H", "I","K", "L",
"M", "N","P", "Q", "R", "S", "T", "V", "W", "Y"}
ALPHABET = {"A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
"M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"}

Стоит добавить пробелы. Помимо этого, есть еще некоторое количество пробелов на концах строк и \t в пустых строках, но я полагаю, не имеет смысла на каждом останавливаться. Стоит проверять код линтером, так можно выявить те огрехи, которые невооруженному глазу плохо заметны.

HYDROPHOBIC_AMINOACIDS = {"A", "V", "L", "I", "M", "F", "Y", "W"}

def __len__(self):
return len(self.seq)

def __getitem__(self, index):
return self.seq[index]

def __repr__(self):
return self.seq

def check_alphabet(self):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я думаю, что проверять алфавит разумно было бы в init в обязательном порядке.

return set(self.seq.upper()).issubset(self.ALPHABET)

def compute_hydrophobicity(self):
hydrophobic_count = sum(1 for aa in self.seq if aa.upper() in self.HYDROPHOBIC_AMINOACIDS)
return (hydrophobic_count / len(self.seq)) * 100