Skip to content

Commit 89cf3bc

Browse files
author
arq5x
committedSep 18, 2014
update fisher to use RecordKeyVector
1 parent 3dff8ca commit 89cf3bc

File tree

5 files changed

+100
-19
lines changed

5 files changed

+100
-19
lines changed
 

‎RELEASE_HISTORY

+85-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,88 @@
1-
Version 2.18.2 (16-Dec-2013)
1+
Version 2.19.1 (6-Mar-2014)
2+
Bug fix to intersect causing BAM footers to be erroneously written when -b is BAM
3+
4+
Speedup for the map tool.
5+
http://bedtools.readthedocs.org/en/latest/_images/map-speed-comparo.png
6+
7+
Map tool now allows multiple columns and operations in a single run.
8+
http://bedtools.readthedocs.org/en/latest/content/tools/map.html#multiple-operations-and-columns-at-the-same-time
9+
10+
11+
Version 2.19.0 (8-Feb-2014)
12+
Bug Fixes
13+
=========
14+
15+
1. Fixed a long-standing bug in which the number of base pairs of overlap was incorrectly calculated when using the -wo option with the -split option. Thanks to many for reporting this.
16+
17+
2. Fixed a bug in which certain flavors of unmapped BAM alignments were incorrectly rejected in the latest 2.18.* series. Thanks very much to
18+
Gabriel Pratt.
19+
20+
21+
Enhancements
22+
============
23+
24+
1. Substantially reduced memory usage, especially when dealing with unsorted data. Memory usage ballooned in the 2.18.* series owing to default buffer sizes we were using in a custom string class. We have adjusted this and the memory usage has returned to 2.17.* levels while maintaining speed increases. Thanks so much to Ian Sudberry for rightfully complaining about this!
25+
26+
27+
New features
28+
============
29+
30+
1. The latest version of the "map" function is ~3X faster than the one available in versions 2.17 and 2.18.
31+
32+
# bedtools 2.17
33+
$ time bedtools map \
34+
-a hg19.gerp.elements.bed.gz \
35+
-b hg19.rmsk.bed.gz \
36+
-c 4 \
37+
-o collapse > /dev/null
38+
real 0m15.865s
39+
user 0m15.815s
40+
sys 0m0.040s
41+
42+
43+
# bedtools 2.19
44+
$ time bedtools map \
45+
-a hg19.gerp.elements.bed.gz \
46+
-b hg19.rmsk.bed.gz \
47+
-c 4 \
48+
-o collapse > /dev/null
49+
real 0m5.367s
50+
user 0m5.314s
51+
sys 0m0.050s
52+
53+
2. The map function now supports the "-split" option, as well as "absmin" and "absmax" operations.
54+
55+
3. In addition, it supports multiple chromosome sorting criteria by supplying a genome file that defines the expected chromosome order. Here is an example of how to run map with datasets having chromosomes sorted in "version" order, as opposed to the lexicographical chrom order that is the norm.
56+
57+
# version sort the BED files (e.g. chr1, chr2, etc., not chr1, chr10, chr11, etc.)
58+
$ zcat hg19.gerp.elements.bed.gz | sort -k1,1V -k2,2n > hg19.gerp.versionsorted.bed
59+
$ zcat hg19.rmsk.bed.gz | sort -k1,1V -k2,2n > hg19.rmsk.versionsorted.bed
60+
61+
# make a toy genome file
62+
$ cut -f 1 hg19.rmsk.versionsorted.bed | uniq | awk '{print $1"\t"1}' > hg19.versionsorted.genome
63+
64+
$ head hg19.versionsorted.genome
65+
chr1 1
66+
chr1_gl000191_random 1
67+
chr1_gl000192_random 1
68+
chr2 1
69+
chr3 1
70+
chr4 1
71+
chr4_ctg9_hap1 1
72+
chr4_gl000193_random 1
73+
chr4_gl000194_random 1
74+
chr5 1
75+
76+
# tell map to expect a different chrom order.
77+
$ bedtools map \
78+
-a hg19.gerp.versionsorted.bed \
79+
-b hg19.rmsk.versionsorted.bed \
80+
-c 4 \
81+
-o collapse \
82+
-g hg19.versionsorted.genome
83+
84+
85+
Version 2.18.2 (8-Jan-2014)
286

387
bedtools. The changes to bedtools reflect fixes to compilation errors, performance enhancements for smaller files, and a bug fix for BAM files that lack a formal header. Our current focus for the 2.19.* release is on addressing some standing bugs/enhancements and also on updating some of the other more widely used tools (e.g., coverage, map, and subtract) to use the new API. We will also continue to look into ways to improve performance while hopefully reducing memory usage for algorithms that work with unsorted data (thanks to Ian Sudberry for the ping!).
488

‎src/fisher/Fisher.cpp

+11-13
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
#include "Fisher.h"
32
#include "BlockMgr.h"
43
#include "NewChromsweep.h"
@@ -69,15 +68,15 @@ bool Fisher::getFisher() {
6968
if (!sweep.init()) {
7069
return false;
7170
}
72-
RecordKeyList hitSet;
71+
RecordKeyVector hitSet;
7372
while (sweep.next(hitSet)) {
7473
if (_context->getObeySplits()) {
75-
RecordKeyList keySet(hitSet.getKey());
76-
RecordKeyList resultSet(hitSet.getKey());
74+
RecordKeyVector keySet(hitSet.getKey());
75+
RecordKeyVector resultSet(hitSet.getKey());
7776
_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
78-
_intersectionVal += getTotalIntersection(&resultSet);
77+
_intersectionVal += getTotalIntersection(resultSet);
7978
} else {
80-
_intersectionVal += getTotalIntersection(&hitSet);
79+
_intersectionVal += getTotalIntersection(hitSet);
8180
}
8281
}
8382

@@ -89,26 +88,25 @@ bool Fisher::getFisher() {
8988
return true;
9089
}
9190

92-
unsigned long Fisher::getTotalIntersection(RecordKeyList *recList)
91+
unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
9392
{
9493
unsigned long intersection = 0;
95-
const Record *key = recList->getKey();
94+
const Record *key = recList.getKey();
9695
int keyStart = key->getStartPos();
9796
int keyEnd = key->getEndPos();
9897

9998
int hitIdx = 0;
100-
for (RecordKeyList::const_iterator_type iter = recList->begin(); iter != recList->end(); iter = recList->next()) {
101-
const Record *currRec = iter->value();
102-
int maxStart = max(currRec->getStartPos(), keyStart);
103-
int minEnd = min(currRec->getEndPos(), keyEnd);
99+
for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) {
100+
int maxStart = max((*iter)->getStartPos(), keyStart);
101+
int minEnd = min((*iter)->getEndPos(), keyEnd);
104102
if (_context->getObeySplits()) {
105103
intersection += _blockMgr->getOverlapBases(hitIdx);
106104
hitIdx++;
107105
} else {
108106
intersection += (unsigned long)(minEnd - maxStart);
109107
}
110108
}
111-
_numIntersections += (int)recList->size();
109+
_numIntersections += (int)recList.size();
112110
return intersection;
113111
}
114112

‎src/fisher/Fisher.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class Fisher {
2424
unsigned long _dbLen;
2525
bool getFisher();
2626

27-
unsigned long getTotalIntersection(RecordKeyList *hits);
27+
unsigned long getTotalIntersection(RecordKeyVector &hits);
2828
};
2929

3030
#endif /* FISHER_H */

‎src/intersectFile/intersectFile.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#ifndef INTERSECTFILE_H
1313
#define INTERSECTFILE_H
1414

15-
#include "RecordKeyList.h"
15+
#include "RecordKeyVector.h"
1616

1717
using namespace std;
1818

‎src/utils/FileRecordTools/Records/RecordKeyVector.h

+2-3
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,11 @@
99
#define KEYVECTOR_H_
1010

1111

12-
12+
#include "Record.h"
13+
#include <vector>
1314

1415
using namespace std;
1516

16-
#include "Record.h"
17-
#include <vector>
1817

1918
class RecordKeyVector {
2019
public:

0 commit comments

Comments
 (0)
Please sign in to comment.