-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpythonparser.py
375 lines (317 loc) · 18.5 KB
/
pythonparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
import numpy as np
from numba import jit
import os
import bisect
import warnings
import pandas as pd
import re
class MetDenseFile:
""" Class to load data from a .metdense file.
The most relevant method is read_data_range, which can load the methylation data from within a range of positions on a chromosome.
filename: path to the file
verbose: verbosity of the initialization, from 0-2
"""
def __init__(self, filename, verbose=0):
self.filename = filename
self.size = os.path.getsize(filename)
self.size_in_blocks = self.size//4 # in 4-blocks
self.file = open(self.filename, "rb")
self.verbose = verbose
if self.verbose>1: print("Load file",self.filename)
self.endianess = "little" # Should also work with "big", but untested!
self.nptype = "<u4" if self.endianess == "little" else ">u4"
self.cells_masked = False
self.__read_intro()
self.__read_chromosomes_map()
###### Basic functions
def __len__(self):
""" Length, thinks in 4-blocks.
"""
return self.size_in_blocks
def __getitem__(self, i):
""" To run binary search directly on this object, thinks in 4-blocks.
i: address of the value
"""
if not i < self.size_in_blocks: raise IndexError("Index is beyond end of file.")
return self.get_int(4*i+self.nonconform_offset)
def get_int(self, position=None, size=4):
""" Read a single number from file.
position: defaults to current, otherwise byte position in file
size: number of bytes to read into int, by default 4 (uint32)
""" #
if position==None:
return int.from_bytes(self.file.read(size),self.endianess)
else:
self.file.seek(position)
return int.from_bytes(self.file.read(size),self.endianess)
def __get_lines(self, n):
""" Read n lines from current file position.
"""
lines = list()
for i in range(n):
lines.append(self.file.readline()[:-1].decode())
return lines
###### Read initial stuff
def __read_intro(self):
""" Read the intro part of the metdense file.
"""
self.file.seek(8)
self.vMajor, self.vMinor = [self.get_int() for i in range(2)] # Version number
self.positions_bytesize = 4 if (self.vMajor==0 and self.vMinor==0) else 8 # uint64 for positions from 0.1 onward
self.data_start, self.chromosome_start = [self.get_int(size=self.positions_bytesize) for i in range(2)]
if self.data_start % 4 != 0: warnings.warn("The file does not conform to the specification! Data block starts at "+str(self.data_start)+". Parser should still work.")
self.nonconform_offset = self.data_start % 4
self.n_cells = self.get_int()
self.cell_names = self.__get_lines(self.n_cells)
self.data_row_size = int(np.ceil(self.n_cells/16)) # Row size in uint32
self.data_row_size_bytes = 4*self.data_row_size # Row size in bytes
if self.verbose>0: print("The file follows version {}.{} and contains data for {} cells.".format(self.vMajor,self.vMinor,self.n_cells))
def __read_chromosomes_map(self):
""" Read the chromosome map.
"""
self.file.seek(self.chromosome_start)
self.n_chromosomes = self.get_int()
self.positions_start = [self.get_int(size=self.positions_bytesize) for i in range(self.n_chromosomes)]
self.positions_end = self.positions_start[1:]+[self.chromosome_start]
self.data_block_size = self.positions_start[0]-self.data_start
self.chromosome_names = self.__get_lines(self.n_chromosomes)
self.n_CpG = (self.positions_end[-1]-self.positions_start[0])//4
self.n_CpG_per_chromosome = [(self.positions_end[i] - self.positions_start[i])//4 for i in range(self.n_chromosomes)]
if self.data_block_size/self.data_row_size_bytes != self.n_CpG: warnings.warn("The file does not conform to the specification! Data block size, number of cells and number of CpGs does not match.")
if self.verbose>0: print("Contains Chromosomes:",self.chromosome_names)
if self.verbose>1: print("Chromosomes have lengths:",self.n_CpG_per_chromosome,"with total length",self.n_CpG)
###### Data access
#### Helpers
def get_position_range(self, chromosome, lower=0, upper=1e30):
""" Returns the position range in the positions block for a chromosome CpG range.
chromosome: name of the chromosome
lower: lower range on the chromosome (inclusive)
upper: upper range on the chromosome (inclusive)
lower_index: index of the first position in the range
upper_index: index of the first position above the range (or on the next chromosome)
"""
if not str(chromosome) in self.chromosome_names: raise ValueError("Requested chromosome "+str(chromosome)+" does not exist.")
if not lower <= upper: raise ValueError("Upper range is not larger than lower range.")
pos = self.chromosome_names.index(str(chromosome))
lower_bound, upper_bound = self.positions_start[pos]//4, self.positions_end[pos]//4
lower_bound_pos, upper_bound_pos = self[lower_bound], self[upper_bound-1]
if not lower_bound_pos <= upper: raise ValueError("Upper bound does not include anything. Chromosome "+str(chromosome)+", between "+str(lower)+" and "+str(upper)+".")
if not upper_bound_pos >= lower: raise ValueError("Lower bound does not include anything. Chromosome "+str(chromosome)+", between "+str(lower)+" and "+str(upper)+".")
lower_index = 4*bisect.bisect_left(self, lower, lo=lower_bound, hi=upper_bound) if lower > lower_bound_pos else 4*lower_bound
if lower == upper:
if not self.get_int(lower_index) == lower: raise ValueError("A single position was requested, but does not exist.")
return lower_index, lower_index+4
upper_index = 4*bisect.bisect_right(self, upper, lo=lower_bound, hi=upper_bound) if upper < upper_bound_pos else 4*upper_bound
return lower_index, upper_index # upper includes the index of the first CpG that is no longer included!
def __position_to_row(self, position):
""" Turns a position in the position block into the row number in the data block.
"""
row = (position - self.positions_start[0])//4
return row
def __position_to_row_on_chromosome(self, chromosome, position):
""" Turns a position in the position block into the row number in the data block of the chromosome.
"""
if not str(chromosome) in self.chromosome_names: raise ValueError("Requested chromosome "+str(chromosome)+" does not exist.")
pos = self.chromosome_names.index(str(chromosome))
row = (position - self.positions_start[pos])//4
return row
def __position_to_row_start(self, position):
""" Turns a position in the position block into the position in the file where the corresponding row starts.
"""
row = self.__position_to_row(position)
row_position = self.data_start + row*self.data_row_size_bytes
return row_position
def row_to_row_start(self, row, chromosome = ""):
""" Turns a row in the data block into the position in the file where the row starts.
"""
start = self.data_start + row*self.data_row_size_bytes
if chromosome:
if not str(chromosome) in self.chromosome_names: raise ValueError("Requested chromosome "+str(chromosome)+" does not exist.")
pos = self.chromosome_names.index(str(chromosome))
start += (self.positions_start[pos] - self.positions_start[0])//4*self.data_row_size_bytes
return start
def __transform_position_ranges(self, chromosome, ranges):
""" Transform a list of ranges on the chromosome, into a list of the corresponding row ranges in the data block of that chromosome.
"""
CpG_ranges = np.asarray([self.get_position_range(chromosome, lower=min(ran[0],ran[1]), upper=max(ran[0],ran[1])) for ran in ranges])
return self.__position_to_row_on_chromosome(chromosome, CpG_ranges), CpG_ranges.min(), CpG_ranges.max()
def __transform_position_ranges_to_indices(self, chromosome, ranges):
""" Transform a list of ranges on the chromosome, into a list of the corresponding row ranges in the data block of that chromosome.
"""
row_ranges, pos_min, pos_max = self.__transform_position_ranges(chromosome, ranges)
rmin, rmax = row_ranges.min(), row_ranges.max()
return [np.linspace(l, u-1, u-l, dtype=int) for l,u in row_ranges], rmin, rmax, pos_min, pos_max
def __build_load_mask(self, ranges, length):
""" Build mask from ranges.
"""
mask = np.full((length),False)
for ran in ranges:
mask[ran] = True
return mask
def __mask_ranges_transform(self, mask, ranges):
""" Transform indices in ranges, taking into account that some parts in between will get deleted.
"""
key = (np.linspace(0,mask.shape[0]-1,mask.shape[0])-np.invert(mask).cumsum()).astype(int)
ranges_tf = list()
for ran in ranges:
ranges_tf.append(key[ran])
return ranges_tf
def get_chromosome_length(self, chromosome):
""" Returns the CpG length of a chromosome.
"""
if not str(chromosome) in self.chromosome_names: raise ValueError("Requested chromosome "+str(chromosome)+" does not exist.")
pos = self.chromosome_names.index(str(chromosome))
return self.n_CpG_per_chromosome[pos]
def set_cell_mask(self, mask):
""" Set cell mask to be applied to all returned data.
"""
#if len(mask) != self.n_cells: raise ValueError("Cell mask has wrong length.")
self.cells_masked = True
self.cell_mask = mask
#### Read Data
def __read_data_rows(self, lower, upper, mask = None):
""" Read a range of data rows from the data block.
Loading is almost instantaneous, most of the time is spent on the shift and the concatenation.
(equal parts, no idea why concatenation is so slow; allocating the memory beforehand and setting parts of the new array is even slower)
lower: lower row number (inclusive)
upper: upper row number (exclusive)
"""
self.file.seek(0)
data = np.reshape(np.fromfile(self.file,self.nptype,offset=lower+self.nonconform_offset ,count=(upper-lower)//4),(-1,self.data_row_size))
if not mask is None:
data = data[mask].copy()
return self.__process_data_rows(data)
def __process_data_rows(self, data):
""" Read a range of data rows from the data block.
Loading is almost instantaneous, most of the time is spent on the shift and the concatenation.
(equal parts, no idea why concatenation is so slow; allocating the memory beforehand and setting parts of the new array is even slower)
lower: lower row number (inclusive)
upper: upper row number (exclusive)
"""
result = list()
for i in range(16):
result.append(np.bitwise_and(data,0x03).astype(np.uint8))
data = np.right_shift(data,2)
del data # No reason to keep it in the memory
result = [np.array_split(res,res.shape[1],axis=1) for res in result] # Reuse the variable, or just delete the old one afterwards
result = list(map(list, zip(*result)))
result = np.concatenate([el for ls in result for el in ls],axis=1)
result = result[:,:self.n_cells]
if self.cells_masked:
result = result[:,self.cell_mask]
return result
def read_data_range(self, chromosome, lower=0, upper=1e30, mask=None):
""" Read all data for a CpG range on a chromosome.
chromosome: which chromosome
lower: lower position on the chromosome (inclusive)
upper: upper position on the chromosome (inclusive)
"""
lower_index, upper_index = self.get_position_range(chromosome, lower, upper)
row_lower, row_upper = self.__position_to_row_start(lower_index), self.__position_to_row_start(upper_index)
return self.__read_data_rows(row_lower, row_upper, mask)
def read_data_positions_from_ranges(self, chromosome, ranges):
""" Read all data for multiple CpG ranges on a chromosome.
chromosome: which chromosome
ranges
"""
range_indices, rmin, rmax, pos_min, pos_max = self.__transform_position_ranges_to_indices(chromosome, ranges)
range_indices -= rmin
mask = self.__build_load_mask(range_indices, rmax-rmin)
data = self.__read_data_rows(self.row_to_row_start(rmin, chromosome), self.row_to_row_start(rmax, chromosome), mask)
range_indices = self.__mask_ranges_transform(mask, range_indices)
final = [data[ran] for ran in range_indices]
self.file.seek(0)
positions = np.fromfile(self.file,self.nptype,offset=pos_min+self.nonconform_offset ,count=(pos_max-pos_min)//4)[mask]
pos = [positions[ran] for ran in range_indices]
return final, pos
def read_data_chromosome(self, chromosome, mask=None):
""" Read all data on a chromosome.
"""
return self.read_data_range(chromosome, 0, 1e30, mask)
def read_data_full(self):
""" Read the full data.
"""
return self.__read_data_rows(self.data_start, self.positions_start[0])
#### Read Positions
def read_positions_range(self, chromosome, lower=0, upper=1e30):
""" Read all available CpG positions within a range on a chromosome.
- chromosome: which chromosome
- lower: lower position on the chromosome (inclusive)
- upper: upper position on the chromosome (inclusive)
"""
lower_index, upper_index = self.get_position_range(chromosome, lower, upper)
self.file.seek(0)
positions = np.fromfile(self.file,self.nptype,offset=lower_index+self.nonconform_offset ,count=(upper_index-lower_index)//4)
return positions
@jit
def tricube_mean_sample(xlist,ylist,zlist,xwidth,ywidth,xrange,yrange):
def tricube_mean(xlist,ylist,zlist,xcent,ycent,xwidth,ywidth):
def tricube(x,cent,width): return 70/81*(1-(2*abs(x-cent)/width)**3)**3 if 2*abs(x-cent)/width<1 else 0
weights = list()
for i in range(len(xlist)):
weights.append(tricube(xlist[i],xcent,xwidth)*tricube(ylist[i],ycent,ywidth))
result = 0
weightsum = 0
for i in range(len(zlist)):
#if weights[i] != 0:
weightsum += weights[i]
result += weights[i]*zlist[i]
if weightsum!=0:
result /= weightsum
else:
result = np.nan
#result /= weightsum
return result
xleft = np.searchsorted(xlist,xrange-xwidth/2,"left")
xright = np.searchsorted(xlist,xrange+xwidth/2,"right")
res = [[tricube_mean(xlist[xL:xR],ylist[xL:xR],zlist[xL:xR],x,y,xwidth,ywidth) for y in yrange] for xL, xR, x in zip(xleft,xright,xrange)]
return np.asarray(res)
class MetDenseFileSmoother(MetDenseFile):
"""
"""
def __init__(self, filename, cell_mask, pseudotime, verbose=0):
super().__init__(filename)
self.set_cell_mask(cell_mask)
self.pseudotime = pseudotime
def sample_smoothed(self, xlist,ylist,zlist,xwidth,ywidth,xrange,yrange, flatten = False):
vals = tricube_mean_sample(np.asarray(xlist),np.asarray(ylist),np.asarray(zlist),xwidth,ywidth,np.asarray(xrange),np.asarray(yrange))
x = np.repeat(np.expand_dims(xrange,1),len(yrange),axis=1)
y = np.repeat(np.expand_dims(yrange,0),len(xrange),axis=0)
if flatten:
result = {"x":x.flatten(), "y":y.flatten(), "vals":vals.flatten()}
else:
result = {"x":x, "y":y, "vals":vals}
return result
def __get_standard_data(self, chromosome,lower,upper, repeat = True):
dat = self.read_data_range(chromosome,lower,upper)
pos = self.read_positions_range(chromosome,lower,upper)
pt = self.pseudotime
if repeat:
pos = np.repeat(np.expand_dims(pos,1),dat.shape[1],axis=1)
pt = np.repeat(np.expand_dims(pt,0),dat.shape[0],axis=0)
return dat, pos, pt
def sample_smoothed_range(self, chromosome, lower, upper, xwidth, ywidth, xrange, yrange, flatten = False):
dat, pos, pt = self.__get_standard_data(chromosome,lower-xwidth,upper+xwidth)
x_positions = pos[np.logical_or(dat==2,dat==1)]
y_positions = pt[np.logical_or(dat==2,dat==1)]
z_values = dat[np.logical_or(dat==2,dat==1)].astype(float)-1
return self.sample_smoothed(x_positions,y_positions,z_values,xwidth,ywidth,xrange,yrange, flatten)
def get_data_points(self, chromosome, lower, upper):
dat, pos, pt = self.__get_standard_data(chromosome,lower,upper)
methylated = {"x":pos[dat==2], "y":pt[dat==2]}
unmethylated = {"x":pos[dat==1], "y":pt[dat==1]}
ambiguous = {"x":pos[dat==3], "y":pt[dat==3]}
return (methylated, unmethylated, ambiguous)
def get_coverage(self, chromosome, lower, upper):
dat, pos, pt = self.__get_standard_data(chromosome,lower,upper, False)
cov = np.logical_or(dat==1,dat==2).sum(axis=1)
return pos, cov
def get_coverage_densesum(self, chromosome, lower_, upper_, sumover_ = 1):
lower, upper, sumover = int(lower_), int(upper_), int(sumover_)
pos, cov = self.get_coverage(chromosome, lower, upper)
densecov = np.zeros(int(upper)-int(lower))
densecov[pos-lower] = cov
densecov = np.pad(densecov, (0,int((sumover-densecov.shape[0]%sumover)%sumover)), 'constant', constant_values=(0,0))
densecov = densecov.reshape(-1,sumover).sum(axis=1)
boundaries = np.asarray(list(np.arange(lower, upper,sumover))+[upper])
return boundaries, densecov