Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
376 changes: 376 additions & 0 deletions TAOK3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,376 @@
import os
from abc import ABC, abstractmethod
from typing import Optional

import numpy as np
from Bio import SeqIO, SeqRecord, SeqUtils


def is_in_gc_bounds(seq_record: SeqRecord, gc_bounds: tuple) -> bool:
"""
Check if the sequence is in the range of GC-content bounds

Arguments:
- seq_record(SeqRecord): the sequence object to check
- gc_bounds(tuple): contain min and max GC-content bounds

Return:
- bool: the result of the check
"""
gc_min, gc_max = gc_bounds[0], gc_bounds[1]
gc_content = SeqUtils.GC123(seq_record.seq)[0]
return gc_min <= gc_content <= gc_max


def is_in_length_bounds(seq_record: SeqRecord, length_bounds: tuple) -> bool:
"""
Check if the sequence is in the range of length bounds

Arguments:
- seq(SeqRecord): the sequence object to check
- length_bounds(tuple): contain min and max length bounds

Return:
- bool: the result of the check
"""
length_min, length_max = length_bounds[0], length_bounds[1]
return length_min <= len(seq_record) <= length_max


def is_above_quality_threshold(seq_record: SeqRecord,
quality_threshold: float) -> bool:
"""
Check if the mean of the sequence quality values in FASTQ exceeds
the quality threshold of the interest

To convert quality values, the function uses phred+33

Arguments:
- seq_record(Bio.SeqRecord): the sequence object to check
- quality_threshold(int or float): the quality threshold of the interest

Return:
- bool: the result of the check
"""
quality = np.mean(seq_record.letter_annotations["phred_quality"])
return quality >= quality_threshold


def filter_fastq(input_path: str,
output_filename: Optional[str] = None,
gc_bounds=(0, 100),
length_bounds=(0, 2**32),
quality_threshold=0) -> None:
"""
Main function to select reads in FASTQ format according
to three main requirements:

-correspondence to the range of GC-content bounds
The range of GC-content bounds is determined with gc_bounds argument

-correspondence to the range of length bounds
The range of length bounds is determined with length_bounds argument

-exceeds the quality threshold of the interest
The quality threshold is determined with quality_threshold argument

Function takes the path to the file in input_path argument.
Use files with fastq extension only.

Function output the result of checking in a file that is named according
to output_filename(Optional).

The output file also has fastq extension.

The output file is saved in the 'fastq_filtrator_results' directory.

If 'fastq_filtrator_results' directory doesn't exist the program creates it
in a current directory.

Without full names use arguments in a certain order
Example: filter_fastq(input_path, output_filename, (0,100), (0,200), 0)
# filter_fastq(input_path, output_filename, gc_bounds=(0,100),
length_bounds=(0,200), quality_threshold=0)

In case of changing only one argument, provide its full name!
Example: filter_fastq(input_path, output_filename,
length_bounds=(50, 100))

Arguments:

- input_path(str): the path to the file

- output_filename(str): the name for output file with obtained result
By default output_filename=None
Without output_filename argument the output file is named as input file
Name without fastq extention is acceptible.
Example: output_filename='result' # 'result.fastq'
output_filename='result.fastq'
Comment on lines +107 to +108

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Очень хорошая идея с примерами, порой их очень не хватает


- gc_bounds(tuple or int or float): contain min and max GC-content bounds
By default gc_bounds=(0,100)
If input contains one number the function accepts it as a maximum bound
Examples: gc_bounds=(20,40)
gc_bounds=40 # (0,40)

- length_bounds(tuple or int or float): contain mini and max length bounds
By default length_bounds=(0,4294967296)
If input contains one number the function accepts it as a maximum bound
Examples: length_bounds=(10,90)
length_bounds=90 # (0,90)

- quality_threshold(int or float): the quality threshold of the interest
By default quality_threshold=0
Examples: quality_threshold=10

There are three functions that are used in the main function:

- is_in_gc_bounds(seq_record, gc_bounds):
Check if the sequence falls in the range of GC-content bounds

- is_in_length_bounds(seq_record, length_bounds):
Check if the sequence falls in the range of length bounds

- is_above_quality_threshold(seq_record, quality_threshold):
Check if the mean of quality values exceeds the quality threshold

Return:
- file: file with fastq extension containing selected fragments.

For more information please see README

"""
if type(length_bounds) is int:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Можно еще проверить на None, если вдруг пользователь захочет передать в функцию такое значение параметров и в случае None присвоить значение по умолчанию

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Да, об этом не подумала

length_bounds = 0, length_bounds
if type(gc_bounds) is int or type(gc_bounds) is float:
gc_bounds = 0, gc_bounds
Comment on lines +143 to +146

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Круто, что была сделана проверка на типы!

Важно: Если в gc_bounds пользователь укажет tuple проверка сломается, нужно придумать, что делать в этом случае...
Неважно: тут так и просится isinstance)

Suggested change
if type(length_bounds) is int:
length_bounds = 0, length_bounds
if type(gc_bounds) is int or type(gc_bounds) is float:
gc_bounds = 0, gc_bounds
if isinstance(length_bounds, int):
length_bounds = 0, length_bounds
if isinstance(gc_bounds, int) or isinstance(gc_bounds, float):
gc_bounds = 0, gc_bounds

Comment on lines +143 to +146

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if type(length_bounds) is int:
length_bounds = 0, length_bounds
if type(gc_bounds) is int or type(gc_bounds) is float:
gc_bounds = 0, gc_bounds
if isinstance(length_bounds, int):
length_bounds = 0, length_bounds
if isinstance(gc_bounds, (int, float)):
gc_bounds = 0, gc_bounds

Так попроще будет

sequences = SeqIO.parse(input_path, "fastq")
good_reads = []
for seq_record in sequences:
if (is_in_gc_bounds(seq_record, gc_bounds) and
is_in_length_bounds(seq_record, length_bounds) and
is_above_quality_threshold(seq_record, quality_threshold)):
good_reads += [seq_record]
if len(good_reads) == 0:
raise ValueError('There are no sequences suited to requirements')
Comment on lines +154 to +155

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Кажется, что лучше просто вывести предупреждение в stdout о конкретном файле. Было бы неплохо прогонять фильтр по нескольким файлам в цикле... Но если хоть в одном из них не будет хорошей последовательности, то цикл остановится с ошибкой - неприятно

Suggested change
if len(good_reads) == 0:
raise ValueError('There are no sequences suited to requirements')
if len(good_reads) == 0:
print(f'In {input_path} there is no sequences suited to requirements')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Очень здорово, что есть ошибка на такой случай

if output_filename is None:
input_filename = os.path.split(input_path)[-1]
output_filename = input_filename
if not (output_filename.endswith('.fastq')):
output_filename = output_filename + '.fastq'

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Тоже хорошая проверка

current_directory = os.getcwd()
path = os.path.join(current_directory, 'fastq_filtrator_results')
if not (os.path.exists(path)):
os.mkdir(path)
output_path = os.path.join(path, output_filename)
if os.path.exists(output_path):
error = 'File with such name exists! Change output_filename arg!'

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Еще одна прекрасная проверка!

raise ValueError(error)
SeqIO.write(good_reads, output_path, "fastq")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хорошая работа!

Comment on lines +156 to +169

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💥💥💥🔥🔥🔥



class BiologicalSequence(ABC):
"""
The abstract class for biological sequences
"""
@abstractmethod
def __len__(self):
pass

@abstractmethod
def __getitem__(self, item):
pass

@abstractmethod
def __str__(self):
pass

def __repr__(self):
pass

@abstractmethod
def is_alphabet_correct(self):
pass


class NucleicAcidSequence(BiologicalSequence):
"""
Class for nucleic acids
"""
def __init__(self, seq):
self.seq = seq
Comment on lines +200 to +201

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Для стандартизации тут неплохо было бы сделать self.seq = seq.upper() (или .lower()) - не придется приводить к определенному регистру в будущем/сравнивать несколько регистров как на строчке 246

Suggested change
def __init__(self, seq):
self.seq = seq
def __init__(self, seq):
self.seq = seq.upper()


def __len__(self):
return len(self.seq)

def __getitem__(self, item):
return self.seq[item]

def __str__(self):
return self.seq

def __repr__(self):
return self.seq

def is_alphabet_correct(self) -> bool:
"""
The function check does the sequence contain
standard A, G, C, T, U nucleotides

Returns: bool - the result of check
"""
return type(self).is_correct(self)

def complement(self) -> BiologicalSequence:
"""
Output the complementary sequence
The complementarity rule could be found here:
https://en.wikipedia.org/wiki/Complementarity_(molecular_biology)

Arguments:
- self: the sequence obj to change

Return: DNAsequence or RNAsequence - the result sequence object
"""
complement = self.seq.translate(type(self).rule_complement)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хотелось бы еще как то справляться с ошибкой в случае, если complement вызывается на самой NucleicAcidSequence (NotImplementedError или что нибудь такое, указывающее, что пользователю нужно выбрать между DNASequence и RNASequence)

complement_seq = type(self)(complement)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Можно как альтернативу использовать: self.__class__(complement) (две пары кавычек немного смущают, но это дело вкуса)

Suggested change
complement_seq = type(self)(complement)
complement_seq = self.__class__(complement)

return complement_seq

def gc_content(self) -> float:
"""
Check the gc content in the sequence object
Returns: int - the percentage of GC content
"""
gc_counter = 0
for nucl in self.seq:
if nucl in ('G', 'C', 'g', 'c'):
gc_counter += 1

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Здесь можно было бы использовать метод .upper() или .lower(), чтобы не сравнивать регистры или вынести эти значения в константы, чтобы каждый раз не создавался новый tuple при каждой итерации

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Да, абсолютно согласна)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Не поспорить)

gc_share = gc_counter / len(self.seq) * 100
return gc_share


class RNASequence(NucleicAcidSequence):
"""
The class for RNA sequences
"""
rule_complement = 'AUCG'.maketrans('AaUuCcGg', 'UuAaGgCc')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Интересная реазизация!

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Спасибо)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Возьму на заметку, хороший метод!


def __init__(self, seq):
super().__init__(seq)
if not (super().is_alphabet_correct()):
raise ValueError('The sequence does not correspond to RNA')

def is_correct(self) -> bool:
"""
Check if the sequence is RNA

Arguments:
- self: the sequence object to check

Return:
- bool: the result of the check
"""
return set(self.seq) <= set('AaGgCcUu')


class DNASequence(NucleicAcidSequence):
"""
The class for DNA sequences
"""
rule_complement = 'ATCG'.maketrans('AaTtCcGg', 'TtAaGgCc')
rule_transcription = 'AUCG'.maketrans('Tt', 'Uu')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вааааау, это очень классно!

def __init__(self, seq):
super().__init__(seq)
if not (super().is_alphabet_correct()):
raise ValueError('The sequence does not correspond to DNA')

def is_correct(self) -> bool:
"""
Check if the sequence is DNA

Arguments:
- self: the sequence object to check

Return:
- bool: the result of the check
"""
return set(self.seq) <= set('AaGgCcTt')

def transcribe(self) -> RNASequence:
"""
Transcribe DNA sequence to RNA

Arguments:
- self: the sequence object to change

Return: str: RNASequence object
"""
transcribe_seq = self.seq.translate(self.rule_transcription)
rna_seq = RNASequence(transcribe_seq)
return rna_seq

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Очень красиво!



class AminoAcidSequence(BiologicalSequence):
"""
The class for amino acid sequences
"""
aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY'

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Константы обозначаем большими буквами

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А вот это точно константы? Я тоже об этом думала, но в данном случае наши константы стали атрибутами. Как будто они всегда записываются в нижнем регистре?

Copy link

@nekrasovadasha22 nekrasovadasha22 Mar 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хм, наверное это вопрос уже вкуса. Мне кажется, что если есть какое-то значение, которое ты задаешь единожды и не изменяешь, то это все-таки константа. А начальные какие-то значения, которые ты передаешь в def init, например - это уже атрибуты, которые ты определяешь.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Константами в питоне называются конкретно штуки вынсенные в начало модуля. Тут это просто классовые атрибуты, поэтому да, просто в нижнем регистре
  2. Обрати внимание, когда пишешь какой то код в PR, то его кушает маркдаун. У тебя __init__ стал жирным init. Чтобы он оставался собой, надо добавлять штрихи (на букве ё).


def __init__(self, seq):
self.seq = seq
if not (self.is_alphabet_correct()):
error = 'The sequence does not match standard protein code'
raise ValueError(error)

def __len__(self):
return len(self.seq)

def __getitem__(self, item):
return self.seq[item]

def __str__(self):
return self.seq

def __repr__(self):
return self.seq

def is_alphabet_correct(self) -> bool:
"""
The function checks if the sequence object contains
standard amino acid code
Returns: list of alternative frames (AminoAcidSequence)
"""
return set(self.seq.upper()) <= set(self.aa_alphabet)

def search_for_alt_frames(self, alt_start_aa='M') -> list:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хорошая функция)

"""
Search for alternative frames in a protein sequences
Use only one-letter code

Search is not sensitive for letter case
Without an alt_start_aa argument search for
frames that start with methionine ('M')
To search frames with alternative start codon
add alt_start_aa argument

The function ignores the last three amino acids in sequences

Arguments:
- self: sequence object to check
- alt_start_aa (str): the name of an amino acid that
is encoded by alternative start AA (Optional)
Default: alt_start_aa='I'

Return: the list of alternative frames
"""
alternative_frames = []
num_position = 0
for amino_acid in self.seq[1:-3]:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Здесь так и задумано, что мы пропускаем первую аминокислоту?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Да, здесь так задумано) С первой амк начинается стандартная рамка считывания, а мы ищем альтернативные)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Отлично!

alt_frame = ''
num_position += 1
if amino_acid.upper() == alt_start_aa:
alt_frame += self.seq[num_position:]
alt_frame = AminoAcidSequence(alt_frame)
alternative_frames.append(alt_frame)
return alternative_frames