-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_samples.py
88 lines (73 loc) · 2.8 KB
/
generate_samples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
""" Methods for generating random test cases of given lengths. Useful for measuring performance of code """
import random
from numpy.random import normal
from Bio import pairwise2
import os.path
# Directory the script entry point passes to save_samples as the output location.
TEST_FOLDER = "testdata"
# Alphabet that generate_test_samples draws characters from; the 20 letters
# are the standard one-letter amino-acid codes.
PROTEIN_ALPHABET = "ACDEFGHIKLMNPQRSTVWY"
def random_char_list(alphabet, length):
    """ Returns list of characters from given alphabet of given length.

    alphabet -- non-empty sequence to draw from (each draw is independent
                and uniform, so repeats are expected)
    length   -- number of characters to produce; zero or a negative value
                gives an empty list
    """
    # range (rather than the Python-2-only xrange) keeps this working on
    # both Python 2 and Python 3.
    return [random.choice(alphabet) for _ in range(length)]
def generate_sample(alphabet, length):
    """ Generates pair of sequences with some changes between them.

    The first sequence is `length` random characters from `alphabet`. The
    second starts as a copy of the first and is then edited position by
    position (substitutions, insertions and deletions, where a deletion may
    grow into a longer gap), and finally has a random number of characters
    added to or trimmed from each end.

    alphabet -- non-empty sequence of characters to draw from
    length   -- length of the first (unedited) sequence

    Returns a tuple (seq1, seq2) of strings.
    """
    seq1 = random_char_list(alphabet, length)
    seq2 = seq1[:]
    # One edit probability per sample, so samples range from nearly
    # identical pairs to heavily edited ones.
    randomness = random.random()
    last_deleted = False
    i = 0
    while i < len(seq1) and i < len(seq2):
        # Directly after a deletion, keep deleting with probability 0.5 so
        # that gaps longer than one character can appear.
        if last_deleted and random.random() < 0.5:
            del seq2[i]
            continue
        last_deleted = False
        if random.random() >= randomness:
            # Position left untouched.
            i += 1
            continue
        # NOTE(review): randint is inclusive on both ends, so 3 is a possible
        # draw that matches no branch below. In that case nothing is edited
        # and `i` is not advanced, i.e. the same position is simply drawn
        # again on the next iteration. Kept as in the original.
        change = random.randint(0, 3)
        if change == 0:  # change protein
            seq2[i] = random.choice(alphabet)
            i += 1
        elif change == 1:  # add protein
            seq2.insert(i, random.choice(alphabet))
            # Step over both the inserted character and the original one.
            i += 2
        elif change == 2:  # remove protein
            del seq2[i]
            last_deleted = True
    # Signed number of characters to add (positive) or trim (negative) at
    # each end, drawn from a normal distribution with standard deviation
    # sqrt(length) and truncated towards zero.
    added_front = int(normal(0, length ** 0.5))
    added_back = int(normal(0, length ** 0.5))
    if added_front > 0:
        seq2 = random_char_list(alphabet, added_front) + seq2
    elif added_front < 0:
        seq2 = seq2[-added_front:]
    if added_back > 0:
        # Use added_back here: the original appended `added_front`
        # characters, so the back padding had the wrong length (or was
        # empty whenever added_front was not positive).
        seq2 += random_char_list(alphabet, added_back)
    elif added_back < 0:
        seq2 = seq2[:added_back]
    return ''.join(seq1), ''.join(seq2)
def generate_test_samples(count, max_length):
    """ Generates given number of samples with lengths linearly distributed from 1 to max length.

    Yields exactly `count` (seq1, seq2) pairs produced by generate_sample
    over PROTEIN_ALPHABET. The k-th pair (k = 1..count) has a first sequence
    of length k * max_length // count, but never shorter than 1, so the last
    pair always has length `max_length`. When count exceeds max_length some
    lengths repeat.

    count      -- number of samples to generate, must be at least 1
    max_length -- length of the longest sample, must be at least 1

    Raises ValueError for a non-positive count or max_length. Because this
    is a generator, the error surfaces when iteration starts.
    """
    if count < 1 or max_length < 1:
        raise ValueError(
            "count and max_length must both be at least 1 "
            "(got count={0}, max_length={1})".format(count, max_length))
    for k in range(1, count + 1):
        # Pure integer arithmetic: identical on Python 2 and 3, no
        # accumulated truncation error, and a zero step (which previously
        # produced an endless stream of samples) cannot occur.
        yield generate_sample(PROTEIN_ALPHABET, max(1, k * max_length // count))
def save_samples(directory, samples):
    """ Saves given samples pairs to a directory as .fasta files.

    directory -- target directory; created (including parents) if missing
    samples   -- iterable of (seq1, seq2) string pairs

    Each pair is written to "generated_<len(seq1)>.fasta" holding two
    records named "<name>_seq1" and "<name>_seq2". Pairs whose first
    sequence has the same length map to the same file name, so a later pair
    overwrites an earlier one.
    """
    # Without this, open() fails on a fresh checkout where the output
    # directory has not been created yet. An empty directory string means
    # the current directory and needs no creation.
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    for seq1, seq2 in samples:
        seq_name = "generated_" + str(len(seq1))
        filename = seq_name + ".fasta"
        with open(os.path.join(directory, filename), 'w') as f:
            f.write(">{0}_seq1\n{1}\n>{0}_seq2\n{2}\n".format(
                seq_name, seq1, seq2))
if __name__ == "__main__":
    # Interactive entry point: ask for the number of samples and the maximum
    # length, then write the generated pairs into TEST_FOLDER.
    # raw_input exists only on Python 2; fall back to input, its Python 3
    # equivalent. (Python 2's own input() evaluates what is typed, so it
    # must not be used there.)
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    # int() raises ValueError if the answer is not a whole number.
    count = int(read_line("Insert number of samples to generate: "))
    max_length = int(read_line("Insert maximum length of a sample: "))
    samples = generate_test_samples(count, max_length)
    save_samples(TEST_FOLDER, samples)