-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathread_metdense.jl
153 lines (129 loc) · 4.51 KB
/
read_metdense.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
@enum MethCall ::UInt32 begin
nocall = 0x00
unmeth = 0x01
meth = 0x02
ambig = 0x03
end
struct GenomicPosition
chrom :: String
pos :: UInt32
ind :: Int64
end
GenomicPosition(chrom::String, pos::Int) = GenomicPosition(chrom, UInt32(pos), -1)
struct GenomicInterval
chrom :: String
iv :: Tuple{UInt32, UInt32}
end
function extend(gi::GenomicInterval; left = 0, right = 0)
if Int64(gi.iv[2]) - Int64(gi.iv[1]) - left + right <= 0
@warn "The resulting interval length is smaller than zero"
return gi
end
if gi.iv[1] - left <= 0
left = gi.iv[1] - 1
end
return GenomicInterval(gi.chrom, (gi.iv[1] - left, gi.iv[2] + right))
end
struct ChromFileposInfo
pos ::StepRange{UInt64} # File positions in Positions Block
data ::StepRange{UInt64} # File positions in Data Block
end
struct MetDenseFile
f ::IOStream
cell_names ::Vector{String}
chroms_filepos ::Dict{ String, ChromFileposInfo }
offset_data_block ::UInt64
end
function MetDenseFile( filename ::String )
f = open( filename )
# Read and check magic string
s = Vector{UInt8}(undef,8)
readbytes!( f, s )
@assert s == b"MetDense"
# Read version
version = ( read( f, UInt32 ), read( f, UInt32 ) )
@assert version == ( 0, 1 )
# Get offsets
offset_data_block = read( f, UInt64 )
offset_chroms_block = read( f, UInt64 )
# Read Cells block
n_cells = read( f, UInt32 )
names_cells = [ readline( f ) for i in 1:n_cells ]
# Read Chromosomes block
seek( f, offset_chroms_block )
n_chroms = read( f, UInt32 )
offsets_chroms = [ read( f, UInt64 ) for i in 1:n_chroms ]
names_chroms = [ readline( f ) for i in 1:n_chroms ]
chroms_filepos = Dict{ String, ChromFileposInfo }()
start_in_data = offset_data_block
rowlength = ceil( UInt64, n_cells / 16 ) * 4 # length of a site in the Data block
for i in 1:n_chroms
filepos_in_positions_block =
range( offsets_chroms[i],
i < n_chroms ? offsets_chroms[i + 1] - 1 : offset_chroms_block - 1;
step = UInt64(4) )
stop_in_data = start_in_data + rowlength * ( length( filepos_in_positions_block ) - 1 )
filepos_in_data_block =
range( start_in_data, stop_in_data; step = rowlength )
start_in_data = stop_in_data + rowlength
chroms_filepos[ names_chroms[i] ] = ChromFileposInfo(
filepos_in_positions_block, filepos_in_data_block )
end
MetDenseFile( f, names_cells, chroms_filepos, offset_data_block )
end
function read_at_position( f, pos, type )
seek( f, pos )
read( f, type )
end
function get_interval( mdf::MetDenseFile, gi::GenomicInterval )
if !haskey(mdf.chroms_filepos, gi.chrom)
return 1:0, Vector{UInt32}(undef, 0)
end
start = searchsortedfirst(
mdf.chroms_filepos[ gi.chrom ].pos, gi.iv[1];
lt = (x, y) -> read_at_position( mdf.f, x, UInt32 ) < y )
stop = searchsortedlast(
mdf.chroms_filepos[ gi.chrom ].pos, gi.iv[2];
lt = (x, y) -> x < read_at_position( mdf.f, y, UInt32 ) )
if start > length(mdf.chroms_filepos[ gi.chrom ].pos)
return start:stop, Vector{UInt32}(undef, 0)
else
seek( mdf.f, mdf.chroms_filepos[ gi.chrom ].pos[ start ])
v = Vector{UInt32}( undef, (stop - start + 1) )
read!( mdf.f, v )
return start:stop, v
end
end
# ok, this should be somehow reorganized not to keep file-related info and more general stuff together
function get_position(mdf::MetDenseFile, gp::GenomicPosition)
if gp.ind !== -1
return gp
end
ind, p = get_interval(mdf, GenomicInterval(gp.chrom, (gp.pos, gp.pos)))
if length(p) == 0
return nothing
end
@assert gp.pos == p[1]
println(ind)
return GenomicPosition(gp.chrom, gp.pos, ind[1])
end
function main_simon()
mdf = MetDenseFile("data/gastrulation.metdense")
iv, bpps = get_interval( mdf, GenomicInterval( "2", (4200000, 4700000) ))
#println( iv )
mypos = findfirst( x -> x == 4220582, bpps )
println( "Now looking at position 2:", bpps[mypos] )
seek( mdf.f, mdf.chroms_filepos["2"].data[ iv[mypos] ] )
word = 99
for cell in 1:length( mdf.cell_names )
if (cell-1) % 16 == 0
word = read( mdf.f, UInt32 )
end
call = MethCall( word & 0x03 )
if call != nocall
println( " Cell $cell ($(mdf.cell_names[cell])): $call" )
end
word >>= 2
end
end
#main_simon()