Skip to content

Commit 671fd3e

Browse files
committed
simplify usage
1 parent 9034e35 commit 671fd3e

9 files changed

+57
-31
lines changed

README.md

+10-11
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,11 @@ is an attempt to provide an API so that users can write their own tools with lit
1919
Design
2020
------
2121

22-
+ Separate (sorted) data-sources are merged into a single stream (but each interval knows it's original source)
2322
+ data-sources must support the *Relatable* Interface.
2423
+ a user-defined function returns true if 2 *Relatable*s are related. (only a small number of interval-pairs
25-
are sent to be tested--this is handled automatically by *irelate*.).
26-
+ i.Related() gives access to all of the related intervals (after they are added internally by *irelated*)
27-
+ the API is a for loop
24+
are sent to be tested--this is handled automatically by `IRelate`.)
25+
+ i.Related() gives access to all of the related intervals (after they are added internally by `IRelate`)
26+
+ the "API" is a for loop
2827

2928
Example
3029
-------
@@ -44,21 +43,21 @@ func CheckRelatedByOverlap(a Relatable, b Relatable) bool {
4443
// a and b are channels that send Relatables.
4544
a := ScanToRelatable("intervals.bed", IntervalFromBedLine)
4645
b := BamToRelatable("some.bam")
47-
merged := Merge(a, b)
48-
for interval := range IRelate(merged, CheckRelatedByOverlap, false, 0) {
46+
for interval := range IRelate(CheckRelatedByOverlap, false, 0, a, b) {
4947
fmt.Printf("%s\t%d\t%d\t%d\n", interval.Chrom(), interval.Start(), interval.End(), len(interval.Related()))
5048
}
5149
```
5250

53-
The 3rd argument to *IRelate* determines if intervals from the same source (file) should be
54-
related (almost always false). The 4th argument determines the *query* set of intervals. So,
51+
The 2nd argument to *IRelate* determines if intervals from the same source (file) should be
52+
related (almost always false). The 3rd argument determines the *query* set of intervals. So,
5553
with 0 as in the example, only intervals from `a` (the 0th source) will be sent from IRelate. If this is set to -1, then
56-
all intervals from all sources will be sent.
54+
all intervals from all sources will be sent. After this, any number of interval streams
55+
can be passed to `IRelate`.
5756

5857
If we only want to count alignments with a given mapping quality, the loop becomes:
5958

6059
```go
61-
for interval := range IRelate(merged, CheckRelatedByOverlap, false, 0) {
60+
for interval := range IRelate(CheckRelatedByOverlap, false, 0, a, b) {
6261
n := 0
6362
for _, b := range interval.Related() {
6463
// cast to a bam to get the mapping quality.
@@ -82,7 +81,7 @@ CheckRelatedByOverlap) and a for loop, it is easy to create custom applications.
8281
For example, here is the function to relate all intervals within 2KB:
8382
```go
8483
// CheckRelatedBy2KB returns true if intervals are within 2KB.
85-
func CheckRelatedByOverlap(a Relatable, b Relatable) bool {
84+
func CheckRelatedBy2KB(a Relatable, b Relatable) bool {
8685
distance := uint32(2000)
8786
// note: with distance == 0 this is just overlap.
8887
return (b.Start()-distance < a.End()) && (b.Chrom() == a.Chrom())

bam_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77
func TestBam(t *testing.T) {
88
var g RelatableChannel
99
g = BamToRelatable("data/ex.bam")
10-
for i := range IRelate(g, CheckRelatedByOverlap, false, 0) {
10+
for i := range IRelate(CheckRelatedByOverlap, false, 0, g) {
1111
if len(i.Related()) != 0 {
1212
t.Errorf("should have another relation: %d", len(i.Related()))
1313

bench.sh

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
go test -cpu=1,2,4 -bench . -run NOTHING -benchtime 3s -cpuprofile cpu.prof

bench_test.go

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package irelate
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func benchmarkStreams(nStreams int, b *testing.B) {
8+
9+
for n := 0; n < b.N; n++ {
10+
streams := make([]RelatableChannel, 0)
11+
f := "data/test.bed.gz"
12+
13+
for i := 0; i < nStreams; i++ {
14+
streams = append(streams, Streamer(f))
15+
}
16+
17+
for a := range IRelate(CheckRelatedByOverlap, false, 0, streams...) {
18+
a.Start()
19+
}
20+
21+
}
22+
}
23+
24+
func Benchmark2Streams(b *testing.B) { benchmarkStreams(2, b) }
25+
func Benchmark3Streams(b *testing.B) { benchmarkStreams(3, b) }

data/test.bed.gz

65.2 KB
Binary file not shown.

gff_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77
func TestGff(t *testing.T) {
88
var g RelatableChannel
99
g = GFFToRelatable("data/ex.gff")
10-
for i := range IRelate(g, CheckRelatedByOverlap, true, 0) {
10+
for i := range IRelate(CheckRelatedByOverlap, true, 0, g) {
1111
if len(i.Related()) != 1 {
1212
t.Errorf("should have another relation: %d", len(i.Related()))
1313

irelate.go

+16-11
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,10 @@ func relate(a Relatable, b Relatable, includeSameSourceRelations bool, relativeT
4242

4343
// CheckRelatedByOverlap returns true if Relatables overlap.
4444
func CheckRelatedByOverlap(a Relatable, b Relatable) bool {
45-
distance := uint32(0)
45+
return (b.Start() < a.End()) && (b.Chrom() == a.Chrom())
4646
// note: with distance == 0 this is just overlap.
47-
return (b.Start()-distance < a.End()) && (b.Chrom() == a.Chrom())
47+
//distance := uint32(0)
48+
//return (b.Start()-distance < a.End()) && (b.Chrom() == a.Chrom())
4849
}
4950

5051
// CheckKNN relates an interval to its k-nearest neighbors.
@@ -97,21 +98,23 @@ func sendSortedRelatables(sendQ *relatableQueue, cache []Relatable, out chan Rel
9798
// it is assumed that no other `b` Relatables could possibly be related to `a`
9899
// and so `a` is sent to the returnQ. It is likely that includeSameSourceRelations
99100
// will only be set to true if one is doing something like a merge.
100-
func IRelate(stream RelatableChannel,
101-
checkRelated func(a Relatable, b Relatable) bool,
101+
// streams are a variable number of channels that send intervals.
102+
func IRelate(checkRelated func(a Relatable, b Relatable) bool,
102103
includeSameSourceRelations bool,
103-
relativeTo int) chan Relatable {
104+
relativeTo int,
105+
streams ...RelatableChannel) chan Relatable {
104106

107+
stream := Merge(streams...)
105108
out := make(chan Relatable, 64)
106109
go func() {
107110

108111
// use the cache to keep relatables to test against.
109-
cache := make([]Relatable, 1, 256)
112+
cache := make([]Relatable, 1, 1024)
110113
cache[0] = <-stream
111114

112115
// Use sendQ to make sure we output in sorted order.
113116
// We know we can print something when sendQ.minStart < cache.minStart
114-
sendQ := make(relatableQueue, 0, 256)
117+
sendQ := make(relatableQueue, 0, 1024)
115118
nils := 0
116119

117120
// TODO:if we know the ends are sorted (in addition to start) then we have some additional
@@ -122,6 +125,9 @@ func IRelate(stream RelatableChannel,
122125

123126
for i, c := range cache {
124127
// tried using futures for checkRelated to parallelize... got slower
128+
if c == nil {
129+
continue
130+
}
125131
if checkRelated(c, interval) {
126132
relate(c, interval, includeSameSourceRelations, relativeTo)
127133
} else {
@@ -134,20 +140,20 @@ func IRelate(stream RelatableChannel,
134140
}
135141

136142
// only do this when we have a lot of nils as it's expensive to create a new slice.
137-
if nils > 0 {
143+
if nils > 1 {
138144
// remove nils from the cache (must do this before sending)
139145
cache, nils = filter(cache, nils), 0
140146
// send the elements from cache in order.
141147
// use heuristic to minimize the sending.
142-
if len(sendQ) > 128 {
148+
if len(sendQ) > 12 {
143149
sendSortedRelatables(&sendQ, cache, out)
144150
}
145151
}
146152
cache = append(cache, interval)
147153

148154
}
149155
for _, c := range filter(cache, nils) {
150-
if relativeTo == -1 || c.Source() == uint32(relativeTo) {
156+
if c.Source() == uint32(relativeTo) || relativeTo == -1 {
151157
heap.Push(&sendQ, c)
152158
}
153159
}
@@ -179,7 +185,6 @@ func Merge(streams ...RelatableChannel) RelatableChannel {
179185
interval = heap.Pop(&q).(Relatable)
180186
source := interval.Source()
181187
ch <- interval
182-
// need the case/select stmt here to handle end of each stream
183188
// pull the next interval from the same source.
184189
next_interval, ok := <-streams[source]
185190
if ok {

main/main.go

+1-2
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,8 @@ func main() {
4747

4848
buf := bufio.NewWriter(os.Stdout)
4949

50-
merged := I.Merge(streams...)
5150
//for interval := range I.IRelate(merged, I.CheckRelatedByOverlap) {
52-
for interval := range I.IRelate(merged, I.CheckRelatedByOverlap, false, 0) {
51+
for interval := range I.IRelate(I.CheckRelatedByOverlap, false, 0, streams...) {
5352
// for bam output:
5453
// bam := *(interval).(*I.Bam)
5554
fmt.Fprintf(buf, "%s\t%d\t%d\t%d\n", interval.Chrom(), interval.Start(), interval.End(), len(interval.Related()))

utils.go

+2-5
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,11 @@ func ScanToRelatable(file string, fn func(line string) Relatable) RelatableChann
6060
scanner, fh := OpenScanFile(file)
6161
ch := make(chan Relatable, 32)
6262
go func() {
63-
var i Relatable
64-
defer fh.Close()
6563
for scanner.Scan() {
66-
line := scanner.Text()
67-
i = fn(line)
68-
ch <- i
64+
ch <- fn(scanner.Text())
6965
}
7066

67+
fh.Close()
7168
close(ch)
7269
}()
7370
return ch

0 commit comments

Comments
 (0)