-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbuild_database.py
72 lines (61 loc) · 2.52 KB
/
build_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def read_gene_list(filename = "data/uniprot_sprot.fasta"):
    """
    Return the records of a .fasta file as a list of raw text chunks.

    The file is split at every ``>`` that begins a line, i.e. at each
    FASTA header marker. A ``>`` that occurs elsewhere in a line (for
    example inside a protein name such as ``alpha-(1->3)``) does not
    start a new record.

    Element 0 of the result is whatever precedes the first header (the
    empty string when the file starts with ``>``). Every following
    element is one record: the header line without its leading ``>``,
    followed by the sequence lines, with newlines preserved.

    Defaults to the uniprot_sprot database.

    :param filename: The name of the .fasta file containing the amino
                     acid sequences of the genes to be studied
    :type filename: str
    :returns: The text before the first header followed by one string
              per record in the .fasta file
    :rtype: list[str]
    :raises OSError: if the file cannot be opened or read
    """
    # Imported locally so that this function stays self-contained.
    import re

    with open(filename) as f:
        contents = f.read()
    # Anchor on line starts: a plain split(">") would also cut a header
    # in two wherever its description happens to contain a ">".
    return re.split(r"^>", contents, flags=re.MULTILINE)
def extract_names_and_sequences(gene_list, organism="Homo sapiens"):
    """
    Split raw FASTA records into parallel lists of gene names and
    sequences for one organism.

    The first element of ``gene_list`` is always skipped. A record is
    kept only if its header line contains the text ``"OS=" + organism``
    (a substring test, so longer organism names that start with
    ``organism`` also match) and contains ``"GN="``. The gene name is
    the text following the first ``GN=`` up to the next space. Only the
    first record seen for each gene name is kept.

    :param gene_list: raw FASTA records, each a header line followed by
                      newline-separated sequence lines
    :type gene_list: list[str]
    :param organism: organism of which genes are to be selected,
                     defaulting to human
    :type organism: str
    :returns: the gene names and, at matching positions, their
              sequences with the line breaks removed
    :rtype: tuple[list[str], list[str]]
    """
    seen = set()
    names = []
    sequences = []
    for record in gene_list[1:]:
        header, *body = record.split("\n")
        if "OS=" + organism not in header or "GN=" not in header:
            continue
        gene = header.split("GN=")[1].split(" ")[0]
        if gene in seen:
            # Keep only the first record encountered for each gene name.
            continue
        seen.add(gene)
        names.append(gene)
        sequences.append("".join(body))
    return names, sequences
def save_names_and_sequences(names, sequences, *,
                             names_path='data/gene_names.txt',
                             sequences_path='data/raw_sequences.txt'):
    """
    Save the names and sequences of a list of genes to two text files.

    Line ``i`` of the names file holds ``names[i]`` and line ``i`` of
    the sequences file holds ``sequences[i]``, so the two files stay
    aligned. Both files are overwritten if they already exist.

    :param names: The names of the genes
    :type names: list[str]
    :param sequences: The amino acid sequences of the genes, in the
                      same order as ``names``
    :type sequences: list[str]
    :param names_path: file to write the gene names to, one per line
    :type names_path: str
    :param sequences_path: file to write the sequences to, one per line
    :type sequences_path: str
    :raises IndexError: if ``sequences`` has fewer items than ``names``
    :raises OSError: if either output file cannot be opened or written
    """
    with (open(sequences_path, 'w') as sequences_file,
          open(names_path, 'w') as names_file):
        for index, name in enumerate(names):
            names_file.write(name + "\n")
            # Index rather than zip so a too-short `sequences` raises
            # IndexError instead of silently dropping trailing names.
            sequences_file.write(sequences[index] + "\n")
def main():
    """
    Run the whole pipeline with default settings: read the FASTA file,
    select the gene names and sequences, and write them to text files.
    """
    gene_names, gene_sequences = extract_names_and_sequences(read_gene_list())
    save_names_and_sequences(gene_names, gene_sequences)
# Build the database only when executed as a script, not on import.
if __name__ == "__main__":
    main()