Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 263 additions & 0 deletions TOMM20.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
import os

ALPHABET_FOR_DNA = {'A', 'T', 'G', 'C', 'a', 't', 'g', 'c'}
ALPHABET_FOR_RNA = {'A', 'U', 'G', 'C', 'a', 'u', 'g', 'c'}
ALPHABET_FOR_PROTEIN = set('FLIMVSPTAYHQNKDECWRG')
Comment on lines +5 to +7

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Если занудствовать, то можно было унифицировать и сделать всё через set.
Но это конечно "Откройте окно! Слишком душно"

Suggested change
ALPHABET_FOR_DNA = {'A', 'T', 'G', 'C', 'a', 't', 'g', 'c'}
ALPHABET_FOR_RNA = {'A', 'U', 'G', 'C', 'a', 'u', 'g', 'c'}
ALPHABET_FOR_PROTEIN = set('FLIMVSPTAYHQNKDECWRG')
ALPHABET_FOR_DNA = set('ATGCatgc')
ALPHABET_FOR_RNA = set('AUGCaugc')
ALPHABET_FOR_PROTEIN = set('FLIMVSPTAYHQNKDECWRG')

COMPLEMENT_ALPHABET_DNA = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
'a': 't', 't': 'a', 'g': 'c', 'c': 'g'}
COMPLEMENT_ALPHABET_RNA = {'U': 'A', 'A': 'U', 'G': 'C', 'C': 'G',
'u': 'a', 'a': 'u', 'g': 'c', 'c': 'g'}
Comment on lines +5 to +11

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Будет совсем замечательно, если все эти глобальные переменные превратить в полиморфные атрибуты классов DNASequence и RNASequence. Тогда бы они имели одинаковые названия (например, ALPHABET и COMPLEMENT_ALPHABET без всяких приставок), но разное содержание в зависимости от класса.
Благодаря этому, в функции check_seq_type можно было бы обойтись без проверки типа последовательности

DICT_RNA_TO_PROTEIN = {
'UUU': 'F', 'UUC': 'F', 'UUA': 'L', 'UUG': 'L', 'CUU': 'L',
'CUC': 'L', 'CUA': 'L', 'CUG': 'L', 'AUU': 'I', 'AUC': 'I',
'AUA': 'I', 'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S', 'AGU': 'S',
'AGC': 'S', 'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'GCU': 'A',
'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'UAU': 'Y', 'UAC': 'Y',
'UAA': 'stop', 'UAG': 'stop', 'UGA': 'stop', 'CAU': 'H',
'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'AAU': 'N', 'AAC': 'N',
'AAA': 'K', 'AAG': 'K', 'GAU': 'D', 'GAC': 'D', 'UGG': 'W',
'GAA': 'E', 'GAG': 'E', 'UGU': 'C', 'UGC': 'C', 'CGU': 'R',
'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R',
'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', 'AUG': 'M'
}

DICT_MOLECULAR_MASS = {
'G': 75, 'A': 89, 'V': 117, 'L': 131, 'I': 131, 'P': 115,
'F': 165, 'Y': 181, 'W': 204, 'S': 105, 'T': 119, 'C': 121,
'M': 149, 'N': 132, 'Q': 146, 'D': 133, 'E': 147, 'K': 146,
'R': 174, 'H': 155
}


class BiologicalSequence(str):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Абстрактные классы обычно наследуются от базового класса ABC из модуля abc. Но конкретно в этом случае, наследование от str выглядит вполне логичным :)

Suggested change
class BiologicalSequence(str):
from abc import ABC
class BiologicalSequence(ABC):

"""
Abstract class for working with biological sequences
Attributes of BiologicalSequence class:
- seq (str): biological sequence
- seq_type (str): which sequence DNA, RNA or protein the sequence is
"""

def __init__(self, seq):
self.seq = seq
self.seq_type = None

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В целом... создание объектов BiologicalSequence (как и NucleicAcidSequence), на мой взгляд, не предполагается, а для объектов DNASequence, RNASequence и AminoAcidSequence название класса говорит само за себя.

Suggested change
self.seq_type = None


def __len__(self):
return len(self.seq)

def __getitem__(self, idx):
return self.seq[idx]

def __getslice__(self, start, end):
return self.seq[start: end]

def __repr__(self):
return f'The sequence is: {self.seq}, type is {self.seq_type}'

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

класс!

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

су-пер!


def __str__(self):
return self.seq

def check_seq_type(self, check_type):
"""
Check if sequence is check_type of molecule DNA, RNA or protein
Input:
- check_type (str): type of molecule DNA, RNA, protein
Return:
- bool: True is type match with check_type, False if not
"""

seq_types = {
'DNA': ALPHABET_FOR_DNA,
'RNA': ALPHABET_FOR_RNA,
'Protein': ALPHABET_FOR_PROTEIN
}

if check_type not in seq_types:
raise ValueError(f'There is not such sequence type as {check_type}!')
return set(self.seq) <= seq_types[check_type]

def type_definition(self):
"""
Set sequence type (DNA, RNA, protein).
Input:
- seq (str): sequence that we need to get type:
Return:
- type (str): type that sequence are
"""
if self.check_seq_type('DNA'):
self.seq_type = 'DNA'
elif self.check_seq_type('RNA'):
self.seq_type = 'RNA'
elif self.check_seq_type('Protein'):
self.seq_type = 'Protein'
else:
raise ValueError(f'Sequence {self.seq} can not be analysed!')
Comment on lines +90 to +97

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Можно еще вот так, и когда сравнение больше чем из 2х случаев, работает чуть быстрее

Suggested change
if self.check_seq_type('DNA'):
self.seq_type = 'DNA'
elif self.check_seq_type('RNA'):
self.seq_type = 'RNA'
elif self.check_seq_type('Protein'):
self.seq_type = 'Protein'
else:
raise ValueError(f'Sequence {self.seq} can not be analysed!')
match True:
case self.check_seq_type('DNA'):
self.seq_type = 'DNA'
case self.check_seq_type('RNA'):
self.seq_type = 'RNA'
case self.check_seq_type('Protein'):
self.seq_type = 'Protein'
case _:
raise ValueError(f'Sequence {self.seq} can not be analysed!')



class NucleicAcidSequence(BiologicalSequence):
"""
Methods for nucleic acid sequences: DNA & RNA
Attributes:
- seq (str): nucleic acid sequence
- seq_type (str): type of nucleic acid sequence (DNA or RNA)
- complement_alphabet (dict): dictionary with complement pairs for DNA or RNA
"""

def __init__(self, seq):
super().__init__(seq)
super().type_definition()
if super().check_seq_type('DNA'):
self.complement_alphabet = COMPLEMENT_ALPHABET_DNA
elif super().check_seq_type('RNA'):
Comment on lines +112 to +114

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Возможно, можно было оставить проверку типов только в конкретных классах (отдельно ДНК, РНК), чтобы не было повторностей.


self.complement_alphabet = COMPLEMENT_ALPHABET_RNA
else:
raise ValueError(f'Sequence {self.seq} is not nucleic acid!')

def complement(self):
"""
Complement DNA or RNA sequences to RNA or DNA sequence
Return:
- complement sequence
"""
output_seq = []
for nucleotide in self.seq:
output_seq.append(self.complement_alphabet[nucleotide])
return NucleicAcidSequence(''.join(output_seq))
Comment on lines +127 to +129

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

функция map() должна работать чуть побыстрее

Suggested change
for nucleotide in self.seq:
output_seq.append(self.complement_alphabet[nucleotide])
return NucleicAcidSequence(''.join(output_seq))
output_seq = ''.join(map(lambda nucleotide: self.complement_alphabet[nucleotide], self.seq))
return NucleicAcidSequence(output_seq)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ну вот мы только сейчас прошли map и lambda :)
Но да, так более функционально

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Так мы сможем возвращать объект того же класса, который принимаем на вход:

Suggested change
return NucleicAcidSequence(''.join(output_seq))
return type(self)(''.join(output_seq))

Опять же, если считаем, что класс NucleicAcidSequence мы используем для наследования, а не для создания объектов


def gc_content(self):
"""
Calculates GC composition for DNA or RNA
Return:
- sequence gc content, %
"""
gc_result = (self.seq.count('C') + self.seq.count('G')) / len(self.seq) * 100

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Строго говоря, в алфавите есть и прописные буквы. И в данном случае маленькие c и g не посчитаются (но у меня, если честно, также было).

Suggested change
gc_result = (self.seq.count('C') + self.seq.count('G')) / len(self.seq) * 100
gc_result = (self.seq.upper().count('C') + self.upper().seq.count('G')) / len(self.seq) * 100

return round(gc_result, 3)
Comment on lines +137 to +138

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

классно!



class DNASequence(NucleicAcidSequence):
"""
DNA sequence.
Attributes:
- seq (str): nucleic acid sequence
- seq_type (str): type of nucleic acid sequence (DNA or RNA)
- complement_alphabet (dict): dictionary with complement pairs for DNA or RNA
"""

def __init__(self, seq):
super().__init__(seq)
super().type_definition()
if not super().check_seq_type('DNA'):
raise ValueError(f'Sequence {self.seq} is not DNA!')
else:
super().type_definition()

def transcribe(self):
"""
Transcribe DNA sequence to RNA
return:
- RNA sequence
"""
output_seq = self.seq.replace('T', 'U').replace('t', 'u')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Выглядит лаконично. «Краткость – сестра таланта».

return RNASequence(output_seq)
Comment on lines +164 to +165

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

огонь

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍



class RNASequence(NucleicAcidSequence):
"""
RNA sequence.
Attributes:
- seq (str): nucleic acid sequence
- seq_type (str): type of nucleic acid sequence (DNA or RNA)
- complement_alphabet (dict): dictionary with complement pairs for DNA or RNA
"""

def __init__(self, seq):
super().__init__(seq)
super().type_definition()
if not super().check_seq_type('RNA'):
raise ValueError(f'Sequence {self.seq} is not RNA!')

def transcribe_reverse(self):
"""
Transcribe and then reverse RNA sequence to DNA
return:
- DNA sequence
"""
output_seq = self.seq.replace('U', 'T').replace('u', 't')
return DNASequence(output_seq[::-1])


class AminoAcidSequence(BiologicalSequence):
"""
Protein (amino acid) sequence.
Attributes:
- seq (str): protein sequence
- seq_type (str): The type of molecule in the sequence
"""

def __init__(self, seq):
super().__init__(seq)
super().type_definition()
if not super().check_seq_type('Protein'):
raise ValueError(f'Sequence {self.seq} is not protein')
Comment on lines +204 to +205

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

как будто немного странно делать повторные проверки в дочерних классах, ведь в родительском классе уже была проверка. получается, если последовательность прошла первую проверку (что это ДНК, РНК или протеин), то дальше все должно работать, а тут вдруг она дальше неожиданно решит упасть, так как оказалось, что ДНК != РНК


def counting_molecular_weight(self):
"""
Counts the molecular mass of a protein sequence seq
Arguments:
- seq (str): sequence to count the molecular weight
Return:
- output (int): molecular weight value
"""
output = 0
for amino_acid in self.seq:
output += DICT_MOLECULAR_MASS[amino_acid]
return output - 18 * (len(self.seq) - 1)
Comment on lines +207 to +218

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Классный метод!


def filter_fastq(input_path: str, gc_bounds: tuple or int = (0, 100),
length_bounds: tuple or int = (0, 2 ** 32),
quality_threshold: int = 0,
output_filename=None,
output_data_dir: str = 'filter_fastq_results'):
Comment on lines +220 to +224

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Аннотация типов есть - класс! Понравилось, что тут аккуратно всё оформленно.

"""
Main function that is used to get fastq sequence(s) and command. It performs a given action
:param input_path: given path where input file is
:param gc_bounds: GC composition interval (in percent) for filtering (default is (0, 100)
:param length_bounds: length interval for filtering (default is (0, 2 ** 32)
:param quality_threshold: threshold value of average quality of reid for filtering, default value is 0
:param output_filename: name file with filtered sequences will be saved
:param output_data_dir: path where the file with filtered sequences will be saved
"""
if not os.path.isdir(output_data_dir):
os.mkdir(output_data_dir)
Comment on lines +234 to +235

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Круто, что отсутствие папки продуманно

if output_filename is None:
output_filename = 'filtered_fastq'

with open(input_path) as handle, open(os.path.join(output_data_dir, output_filename), mode='w') as file:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

о, не знал, что можно открывать через запятую в одном блоке with

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

тоже не знала про with с запятыми)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

присоединяюсь к комментариям, очень продуманно!

record = SeqIO.parse(handle, "fastq")
for lin in record:
Comment on lines +240 to +241

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Скорее придирка, но такие названия кажутся более логичными

Suggested change
record = SeqIO.parse(handle, "fastq")
for lin in record:
records = SeqIO.parse(handle, "fastq")
for record in records:

out = 0
seq = lin.seq

if type(length_bounds) != tuple:
length_bounds = tuple([0, length_bounds])
Comment on lines +245 to +246

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Проверку типов можно было вынести из цикла for в тело функции filter_fastq, чтобы сразу один раз проверить и подправить всё. На времени работы, наверно, не сильно сказывается.

length_result = len(lin.letter_annotations['phred_quality'])
if not length_bounds[0] <= length_result <= length_bounds[1]:
out = 1

if not isinstance(gc_bounds, tuple):
gc_bounds = (0, gc_bounds)
gc_result = gc_fraction(seq) * 100
if not gc_bounds[0] <= gc_result <= gc_bounds[1]:
out = 1

score = lin.letter_annotations['phred_quality']
score = sum(score) / len(seq)
if not score >= quality_threshold:
out = 1

if out == 0:
file.write(lin.format("fastq"))

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Оригинально сделано с переменной out. Даже, считай, дискретка использовалась, чтобы с логическими выражениями поработать)