-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathkhmer-counting.bib
760 lines (683 loc) · 37.2 KB
/
khmer-counting.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
%% This BibTeX bibliography file was created using BibDesk.
%% http://bibdesk.sourceforge.net/
%% Created for Qingpeng Zhang at 2014-06-18 15:54:26 -0400
%% Saved with string encoding Unicode (UTF-8)
% Malde and O'Sullivan, PADL 2009: a Bloom filter implementation in Haskell used to
% index and match sequence data. Proceedings-level fields are inherited through
% Crossref from DBLP:conf/padl/2009, which must stay below this entry for classic BibTeX.
@inproceedings{DBLP:conf/padl/MaldeO09,
  Abstract = {Analysis of biological data often involves large data sets and computationally expensive algorithms. Databases of biological data continue to grow, leading to an increasing demand for improved algorithms and data structures. Despite having many advantages over more traditional indexing structures, the Bloom filter is almost unused in bioinformatics. Here we present a robust and efficient Bloom filter implementation in Haskell, and implement a simple bioinformatics application for indexing and matching sequence data. We use this to index the chromosomes that make up the human genome, and map all available gene sequences to it. Our experiences with developing and tuning our application suggest that for bioinformatics applications, Haskell offers a compelling combination of rapid development, quality assurance, and high performance.},
  Author = {Ketil Malde and Bryan O'Sullivan},
  Bibsource = {DBLP, http://dblp.uni-trier.de},
  Booktitle = {PADL},
  Crossref = {DBLP:conf/padl/2009},
  Date-Added = {2014-05-06 22:27:05 +0000},
  Date-Modified = {2014-05-06 22:27:05 +0000},
  Pages = {183--194},
  Title = {Using {Bloom} Filters for Large Scale Gene Sequence Analysis in {Haskell}},
  Year = {2009},
  Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-540-92995-6_13}}
% Parent record for the PADL 2009 proceedings (LNCS volume 5418). It is the Crossref
% target of DBLP:conf/padl/MaldeO09 and therefore has to follow that entry in the file.
@proceedings{DBLP:conf/padl/2009,
  title         = {Practical Aspects of Declarative Languages, 11th International Symposium, PADL 2009, Savannah, GA, USA, January 19-20, 2009. Proceedings},
  booktitle     = {PADL},
  editor        = {Andy Gill and Terrance Swift},
  series        = {Lecture Notes in Computer Science},
  volume        = {5418},
  publisher     = {Springer},
  year          = {2009},
  isbn          = {978-3-540-92994-9},
  bibsource     = {DBLP, http://dblp.uni-trier.de},
  date-added    = {2014-05-06 22:27:05 +0000},
  date-modified = {2014-05-06 22:27:05 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1007/978-3-540-92995-6},
}
% Cohen and Matias, SIGMOD 2003: the Spectral Bloom Filter, a multi-set extension of
% the Bloom filter. Proceedings-level fields are inherited through Crossref from
% DBLP:conf/sigmod/2003, which must stay below this entry for classic BibTeX.
@inproceedings{DBLP:conf/sigmod/CohenM03,
  Abstract = {A Bloom Filter is a space-efficient randomized data structure allowing membership queries over sets with certain allowable errors. It is widely used in many applications which take advantage of its ability to compactly represent a set, and filter out effectively any element that does not belong to the set, with small error probability. This paper introduces the Spectral Bloom Filter (SBF), an extension of the original Bloom Filter to multi-sets, allowing the filtering of elements whose multiplicities are below a threshold given at query time. Using memory only slightly larger than that of the original Bloom Filter, the SBF supports queries on the multiplicities of individual keys with a guaranteed, small error probability. The SBF also supports insertions and deletions over the data set. We present novel methods for reducing the probability and magnitude of errors. We also present an efficient data structure and algorithms to build it incrementally and maintain it over streaming data, as well as over materialized data with arbitrary insertions and deletions. The SBF does not assume any a priori filtering threshold and effectively and efficiently maintains information over the entire data-set, allowing for ad-hoc queries with arbitrary parameters and enabling a range of new applications.},
  Author = {Saar Cohen and Yossi Matias},
  Bibsource = {DBLP, http://dblp.uni-trier.de},
  Booktitle = {SIGMOD Conference},
  Crossref = {DBLP:conf/sigmod/2003},
  Date-Added = {2014-05-06 22:23:00 +0000},
  Date-Modified = {2014-05-06 22:23:00 +0000},
  Pages = {241--252},
  Title = {Spectral {Bloom} Filters},
  Year = {2003},
  Bdsk-Url-1 = {http://doi.acm.org/10.1145/872757.872787}}
% Parent record for the ACM SIGMOD 2003 proceedings. It is the Crossref target of
% DBLP:conf/sigmod/CohenM03 and therefore has to follow that entry in the file.
@proceedings{DBLP:conf/sigmod/2003,
  title         = {Proceedings of the 2003 ACM SIGMOD International Conference on Management of Data, San Diego, California, USA, June 9-12, 2003},
  booktitle     = {SIGMOD Conference},
  editor        = {Alon Y. Halevy and Zachary G. Ives and AnHai Doan},
  publisher     = {ACM},
  year          = {2003},
  isbn          = {1-58113-634-X},
  bibsource     = {DBLP, http://dblp.uni-trier.de},
  date-added    = {2014-05-06 22:23:00 +0000},
  date-modified = {2014-05-06 22:23:00 +0000},
}
% Estan and Varghese, SIGCOMM 2002: sample-and-hold and multistage filters for
% identifying large flows. Proceedings-level fields are inherited through Crossref
% from DBLP:conf/sigcomm/2002, which must stay below this entry for classic BibTeX.
% The percent sign in the abstract is escaped so that tools which copy the abstract
% into LaTeX output do not treat the rest of the line as a comment.
@inproceedings{DBLP:conf/sigcomm/EstanV02,
  Abstract = {Accurate network traffic measurement is required for accounting, bandwidth provisioning and detecting DoS attacks. These applications see the traffic as a collection of flows they need to measure. As link speeds and the number of flows increase, keeping a counter for each flow is too expensive (using SRAM) or slow (using DRAM). The current state-of-the-art methods (Cisco's sampled NetFlow) which log periodically sampled packets are slow, inaccurate and resource-intensive. Previous work showed that at different granularities a small number of heavy hitters accounts for a large share of traffic. Our paper introduces a paradigm shift for measurement by concentrating only on large flows --- those above some threshold such as 0.1\% of the link capacity. We propose two novel and scalable algorithms for identifying the large flows: sample and hold and multistage filters, which take a constant number of memory references per packet and use a small amount of memory. If $M$ is the available memory, we show analytically that the errors of our new algorithms are proportional to $1/M$; by contrast, the error of an algorithm based on classical sampling is proportional to $1/\sqrt{M}$, thus providing much less accuracy for the same amount of memory. We also describe further optimizations such as early removal and conservative update that further improve the accuracy of our algorithms, as measured on real traffic traces, by an order of magnitude. Our schemes allow a new form of accounting called threshold accounting in which only flows above a threshold are charged by usage while the rest are charged a fixed fee. Threshold accounting generalizes usage-based and duration based pricing.},
  Author = {Cristian Estan and George Varghese},
  Bibsource = {DBLP, http://dblp.uni-trier.de},
  Booktitle = {SIGCOMM},
  Crossref = {DBLP:conf/sigcomm/2002},
  Date-Added = {2014-05-06 22:21:14 +0000},
  Date-Modified = {2014-05-06 22:21:14 +0000},
  Pages = {323--336},
  Title = {New directions in traffic measurement and accounting},
  Year = {2002},
  Bdsk-Url-1 = {http://doi.acm.org/10.1145/633025.633056}}
% Parent record for the ACM SIGCOMM 2002 proceedings. It is the Crossref target of
% DBLP:conf/sigcomm/EstanV02 and therefore has to follow that entry in the file.
@proceedings{DBLP:conf/sigcomm/2002,
  title         = {Proceedings of the ACM SIGCOMM 2002 Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication, August 19-23, 2002, Pittsburgh, PA, USA},
  booktitle     = {SIGCOMM},
  publisher     = {ACM},
  year          = {2002},
  isbn          = {1-58113-570-X},
  bibsource     = {DBLP, http://dblp.uni-trier.de},
  date-added    = {2014-05-06 22:21:14 +0000},
  date-modified = {2014-05-06 22:21:14 +0000},
}
% Flajolet, Fusy, Gandouet and Meunier: the HyperLogLog cardinality estimator.
% The algorithm name is brace-protected so sentence-casing styles keep its capitals.
% NOTE(review): no volume or page range is recorded here; confirm against the publisher record.
@article{flajolet2008hyperloglog,
  Author = {Flajolet, Philippe and Fusy, {\'E}ric and Gandouet, Olivier and Meunier, Fr{\'e}d{\'e}ric},
  Date-Added = {2014-05-03 02:17:23 +0000},
  Date-Modified = {2014-05-03 02:17:23 +0000},
  Journal = {DMTCS Proceedings},
  Number = {1},
  Title = {{HyperLogLog}: the analysis of a near-optimal cardinality estimation algorithm},
  Year = {2008}}
% Fan, Cao, Almeida and Broder (2000): the Summary Cache web-cache sharing protocol,
% IEEE/ACM Transactions on Networking 8(3). Record exported from the ACM Digital Library.
@article{Fan:2000:SCS:343571.343572,
  author        = {Fan, Li and Cao, Pei and Almeida, Jussara and Broder, Andrei Z.},
  title         = {Summary Cache: A Scalable Wide-area Web Cache Sharing Protocol},
  journal       = {IEEE/ACM Trans. Netw.},
  volume        = {8},
  number        = {3},
  pages         = {281--293},
  month         = jun,
  year          = {2000},
  issue_date    = {June 2000},
  numpages      = {13},
  publisher     = {IEEE Press},
  address       = {Piscataway, NJ, USA},
  issn          = {1063-6692},
  doi           = {10.1109/90.851975},
  url           = {http://dx.doi.org/10.1109/90.851975},
  acmid         = {343572},
  keywords      = {ICP, Web cache, Web proxy, bloom filter, cache sharing},
  date-added    = {2014-05-03 02:09:38 +0000},
  date-modified = {2014-05-03 02:09:38 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1109/90.851975},
}
% Broder and Mitzenmacher: survey of network applications of Bloom filters,
% Internet Mathematics 1(4).
% NOTE(review): the entry keyed BroderM03 further down this file carries the same
% journal, volume, number and pages; it looks like a duplicate of this work under a
% different key and year. Confirm which key the manuscript cites before merging.
@article{broder2004network,
  Author = {Broder, Andrei and Mitzenmacher, Michael},
  Date-Added = {2014-04-20 17:14:01 +0000},
  Date-Modified = {2014-04-20 17:14:01 +0000},
  Journal = {Internet Mathematics},
  Number = {4},
  Pages = {485--509},
  Publisher = {Taylor \& Francis},
  Title = {Network applications of {Bloom} filters: A survey},
  Volume = {1},
  Year = {2004}}
% The khmer software package (figshare release, 2014). Url holds the DOI resolver
% link; Bdsk-Url-1 is the BibDesk link to the figshare landing page. The stray
% "-%20" prefix that made the landing-page link unusable has been dropped.
@manual{khmer,
  Author = {Crusoe, Michael and Edvenson, Greg and Fish, Jordan and Howe, Adina and McDonald, Eric and Nahum, Joshua and Nanlohy, Kaben and Ortiz-Zuazaga, Humberto and Pell, Jason and Simpson, Jared and Scott, Camille and Srinivasan, Ramakrishnan and Zhang, Qingpeng and Brown, C. Titus},
  Date-Added = {2014-04-18 23:29:55 +0000},
  Date-Modified = {2014-06-10 21:44:59 +0000},
  Title = {The khmer software package: enabling efficient sequence analysis},
  Url = {http://dx.doi.org/10.6084/m9.figshare.979190},
  Year = {2014},
  Bdsk-Url-1 = {http://figshare.com/articles/The_khmer_software_package_enabling_efficient_sequence_analysis/979190}}
% Audano and Vannberg (2014): KAnalyze k-mer toolkit, recorded at its Advance Access stage.
% Journal now holds only the journal name; the publication-status text and the DOI
% that were packed into Journal and Pages are kept in Note, so styles still print them.
% NOTE(review): the CONTACT address in the abstract looks like an e-mail obfuscation
% placeholder left by a web export; restore it from the publisher record if needed.
% NOTE(review): volume, issue and final page numbers are not recorded; add them once confirmed.
@article{Audano2014,
  Abstract = {MOTIVATION: Converting nucleotide sequences into short overlapping fragments of uniform length, k-mers, is a common step in many bioinformatics applications. While existing software packages count k-mers, few are optimized for speed, offer an API (Application Programming Interface), a graphical interface, or contain features that make it extensible and maintainable. We designed KAnalyze to compete with the fastest k-mer counters, to produce reliable output, and to support future development efforts through well architected, documented, and testable code. Currently, KAnalyze can output k-mer counts in a sorted tab-delimited file or stream k-mers as they are read. KAnalyze can process large data sets with 2GB of memory. This project is implemented in Java 7, and the CLI (Command Line Interface) is designed to integrate into pipelines written in any language.
RESULTS: As a k-mer counter, KAnalyze outperforms Jellyfish (Mar{\c c}ais and Kingsford, 2011), DSK (Rizk et al., 2013), and a pipeline built on a Perl and Linux utilities. Through extensive unit and system testing, we have verified that KAnalyze produces the correct k-mer counts over multiple data sets and k-mer sizes.
AVAILABILITY: KAnalyze is available on SourceForge: https://sourceforge.net/projects/kanalyze/ SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.
CONTACT: [email protected].},
  Author = {Audano, Peter and Vannberg, Fredrik},
  Date-Added = {2014-04-18 20:50:07 +0000},
  Date-Modified = {2014-06-10 21:36:18 +0000},
  Doi = {10.1093/bioinformatics/btu152},
  Journal = {Bioinformatics},
  Journal-Full = {Bioinformatics (Oxford, England)},
  Month = mar,
  Note = {Advance Access published March 18, 2014. doi:10.1093/bioinformatics/btu152},
  Pmid = {24642064},
  Pst = {aheadofprint},
  Title = {{KAnalyze}: A Fast Versatile Pipelined {K-mer} Toolkit},
  Year = {2014},
  Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btu152}}
% Roy, Bhattacharya and Schliep (2014): Turtle frequent k-mer counter, recorded at its
% Advance Access stage. Journal now holds only the journal name; the publication-status
% text and the DOI that were packed into Journal and Pages are kept in Note.
% NOTE(review): Month says April while the Advance Access date is March 10; confirm which is intended.
% NOTE(review): the CONTACT addresses in the abstract look like e-mail obfuscation
% placeholders left by a web export; restore them from the publisher record if needed.
@article{Roy2014,
  Abstract = {MOTIVATION: Counting the frequencies of k-mers in read libraries is often a first step in the analysis of high-throughput sequencing data. Infrequent k-mers are assumed to be a result of sequencing errors. The frequent k-mers constitute a reduced but error-free representation of the experiment, which can inform read error correction or serve as the input to de novo assembly methods. Ideally, the memory requirement for counting should be linear in the number of frequent k-mers and not in the, typically much larger, total number of k-mers in the read library.
RESULTS: We present a novel method that balances time, space and accuracy requirements to efficiently extract frequent k-mers even for high-coverage libraries and large genomes such as human. Our method is designed to minimize cache misses in a cache-efficient manner by using a pattern-blocked Bloom filter to remove infrequent k-mers from consideration in combination with a novel sort-and-compact scheme, instead of a hash, for the actual counting. Although this increases theoretical complexity, the savings in cache misses reduce the empirical running times. A variant of method can resort to a counting Bloom filter for even larger savings in memory at the expense of false-negative rates in addition to the false-positive rates common to all Bloom filter-based approaches. A comparison with the state-of-the-art shows reduced memory requirements and running times.
AVAILABILITY AND IMPLEMENTATION: The tools are freely available for download at http://bioinformatics.rutgers.edu/Software/Turtle and http://figshare.com/articles/Turtle/791582.
CONTACT: [email protected] or [email protected] SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  Author = {Roy, Rajat Shuvro and Bhattacharya, Debashish and Schliep, Alexander},
  Date-Added = {2014-04-18 20:48:21 +0000},
  Date-Modified = {2014-06-10 21:35:20 +0000},
  Doi = {10.1093/bioinformatics/btu132},
  Journal = {Bioinformatics},
  Journal-Full = {Bioinformatics (Oxford, England)},
  Month = apr,
  Note = {Advance Access published March 10, 2014. doi:10.1093/bioinformatics/btu132},
  Pmid = {24618471},
  Pst = {aheadofprint},
  Title = {{Turtle}: Identifying frequent k-mers with cache-efficient algorithms},
  Year = {2014},
  Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btu132}}
% Cormode and Muthukrishnan, SDM 2005: summarizing and mining skewed data streams.
% Proceedings-level fields are inherited through Crossref from DBLP:conf/sdm/2005,
% which must stay below this entry for classic BibTeX.
@inproceedings{CormodeM05,
  Author = {Graham Cormode and S. Muthukrishnan},
  Bibsource = {DBLP, http://dblp.uni-trier.de},
  Booktitle = {SDM},
  Crossref = {DBLP:conf/sdm/2005},
  Date-Added = {2014-04-18 20:43:06 +0000},
  Date-Modified = {2014-04-18 20:43:35 +0000},
  Pages = {44--55},
  Title = {Summarizing and Mining Skewed Data Streams},
  Year = {2005},
  Bdsk-Url-1 = {http://dx.doi.org/10.1137/1.9781611972757.5}}
% Parent record for the SIAM SDM 2005 proceedings. It is the Crossref target of
% CormodeM05 and therefore has to follow that entry in the file.
@proceedings{DBLP:conf/sdm/2005,
  title         = {Proceedings of the 2005 SIAM International Conference on Data Mining, SDM 2005, Newport Beach, CA, USA, April 21-23, 2005},
  booktitle     = {SDM},
  editor        = {Hillol Kargupta and Jaideep Srivastava and Chandrika Kamath and Arnold Goodman},
  publisher     = {SIAM},
  year          = {2005},
  isbn          = {978-0-89871-593-4, 978-1-61197-275-7},
  bibsource     = {DBLP, http://dblp.uni-trier.de},
  date-added    = {2014-04-18 20:43:06 +0000},
  date-modified = {2014-04-18 20:43:06 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1137/1.9781611972757},
}
% Li and Waterman (2003): estimating repeat structure and genome length from the
% l-tuple content of shotgun reads. PubMed-derived record; Journal keeps the
% abbreviated title and Journal-Full the long form.
@article{Li2003,
  Abstract = {In shotgun sequencing projects, the genome or BAC length is not always known. We approach estimating genome length by first estimating the repeat structure of the genome or BAC, sometimes of interest in its own right, on the basis of a set of random reads from a genome project. Moreover, we can find the consensus for repeat families before assembly. Our methods are based on the l-tuple content of the reads.},
  Author = {Li, Xiaoman and Waterman, Michael S},
  Date-Added = {2014-04-18 19:55:32 +0000},
  Date-Modified = {2014-04-18 19:55:32 +0000},
  Doi = {10.1101/gr.1251803},
  Journal = {Genome Res},
  Journal-Full = {Genome research},
  Mesh = {Algorithms; Base Composition; Chromosome Mapping; Chromosomes, Artificial, Bacterial; Computational Biology; Computer Simulation; Consensus Sequence; DNA, Bacterial; Mathematical Computing; Poisson Distribution; Repetitive Sequences, Nucleic Acid; Software},
  Month = aug,
  Number = {8},
  Pages = {1916--1922},
  Pmc = {PMC403783},
  Pmid = {12902383},
  Pst = {ppublish},
  Title = {Estimating the repeat structure and length of {DNA} sequences using L-tuples},
  Volume = {13},
  Year = {2003},
  Bdsk-Url-1 = {http://dx.doi.org/10.1101/gr.1251803}}
% Howe et al., PNAS 111(13), 2014: assembly of large soil metagenomes using digital
% normalization and partitioning. The key says 2012 but the Year field is 2014; the
% key is kept unchanged so existing citations still resolve.
% Percent signs in the abstract are escaped so that tools which copy the abstract
% into LaTeX output do not treat the rest of the line as a comment.
@article{Howe2012,
  Abstract = {The large volumes of sequencing data required to sample deeply the microbial communities of complex environments pose new challenges to sequence analysis. De novo metagenomic assembly effectively reduces the total amount of data to be analyzed but requires substantial computational resources. We combine two preassembly filtering approaches---digital normalization and partitioning---to generate previously intractable large metagenome assemblies. Using a human-gut mock community dataset, we demonstrate that these methods result in assemblies nearly identical to assemblies from unprocessed data. We then assemble two large soil metagenomes totaling 398 billion bp (equivalent to 88,000 Escherichia coli genomes) from matched Iowa corn and native prairie soils. The resulting assembled contigs could be used to identify molecular interactions and reaction networks of known metabolic pathways using the Kyoto Encyclopedia of Genes and Genomes Orthology database. Nonetheless, more than 60\% of predicted proteins in assemblies could not be annotated against known databases. Many of these unknown proteins were abundant in both corn and prairie soils, highlighting the benefits of assembly for the discovery and characterization of novelty in soil biodiversity. Moreover, 80\% of the sequencing data could not be assembled because of low coverage, suggesting that considerably more sequencing data are needed to characterize the functional content of soil.},
  Author = {Howe, Adina Chuang and Jansson, Janet K and Malfatti, Stephanie A and Tringe, Susannah G and Tiedje, James M and Brown, C Titus},
  Date-Added = {2014-04-18 18:35:19 +0000},
  Date-Modified = {2014-04-18 18:35:55 +0000},
  Doi = {10.1073/pnas.1402564111},
  Journal = {Proc Natl Acad Sci U S A},
  Journal-Full = {Proceedings of the National Academy of Sciences of the United States of America},
  Month = apr,
  Number = {13},
  Pages = {4904--4909},
  Pmid = {24632729},
  Pst = {ppublish},
  Title = {Tackling soil diversity with the assembly of large, complex metagenomes},
  Volume = {111},
  Year = {2014},
  Bdsk-Url-1 = {http://dx.doi.org/10.1073/pnas.1402564111}}
% Chikhi and Medvedev, Bioinformatics 30(1), 2014: KmerGenie, sampling-based k-mer
% abundance histograms and automatic selection of k for assembly.
% NOTE(review): the CONTACT address in the abstract looks like an e-mail obfuscation
% placeholder left by a web export; restore it from the publisher record if needed.
@article{Chikhi:2014aa,
  Abstract = {MOTIVATION: Genome assembly tools based on the de Bruijn graph framework rely on a parameter k, which represents a trade-off between several competing effects that are difficult to quantify. There is currently a lack of tools that would automatically estimate the best k to use and/or quickly generate histograms of k-mer abundances that would allow the user to make an informed decision.
RESULTS: We develop a fast and accurate sampling method that constructs approximate abundance histograms with several orders of magnitude performance improvement over traditional methods. We then present a fast heuristic that uses the generated abundance histograms for putative k values to estimate the best possible value of k. We test the effectiveness of our tool using diverse sequencing datasets and find that its choice of k leads to some of the best assemblies.
AVAILABILITY: Our tool KmerGenie is freely available at: http://kmergenie.bx.psu.edu/.
CONTACT: [email protected].},
  Author = {Chikhi, Rayan and Medvedev, Paul},
  Date-Added = {2014-01-21 16:11:48 +0000},
  Date-Modified = {2014-01-21 16:11:48 +0000},
  Doi = {10.1093/bioinformatics/btt310},
  Journal = {Bioinformatics},
  Journal-Full = {Bioinformatics (Oxford, England)},
  Month = jan,
  Number = {1},
  Pages = {31--37},
  Pmid = {23732276},
  Pst = {ppublish},
  Title = {Informed and automated k-mer size selection for genome assembly},
  Volume = {30},
  Year = {2014},
  Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btt310}}
% Jones, Ruzzo, Peng and Katze, Nucleic Acids Research 40(22), 2012: Quip, lossless
% compression of sequencing reads aided by de novo assembly. Pages holds the
% article number (e171) rather than a range.
% The percent sign in the abstract is escaped so that tools which copy the abstract
% into LaTeX output do not treat the rest of the line as a comment.
@article{Jones:2012aa,
  Abstract = {We present Quip, a lossless compression algorithm for next-generation sequencing data in the FASTQ and SAM/BAM formats. In addition to implementing reference-based compression, we have developed, to our knowledge, the first assembly-based compressor, using a novel de novo assembly algorithm. A probabilistic data structure is used to dramatically reduce the memory required by traditional de Bruijn graph assemblers, allowing millions of reads to be assembled very efficiently. Read sequences are then stored as positions within the assembled contigs. This is combined with statistical compression of read identifiers, quality scores, alignment information and sequences, effectively collapsing very large data sets to <15\% of their original size with no loss of information. Availability: Quip is freely available under the 3-clause BSD license from http://cs.washington.edu/homes/dcjones/quip.},
  Author = {Jones, Daniel C and Ruzzo, Walter L and Peng, Xinxia and Katze, Michael G},
  Date-Added = {2014-01-21 16:11:26 +0000},
  Date-Modified = {2014-01-21 16:11:26 +0000},
  Doi = {10.1093/nar/gks754},
  Journal = {Nucleic Acids Res},
  Journal-Full = {Nucleic acids research},
  Mesh = {Algorithms; Data Compression; High-Throughput Nucleotide Sequencing; Probability; Software},
  Month = dec,
  Number = {22},
  Pages = {e171},
  Pmc = {PMC3526293},
  Pmid = {22904078},
  Pst = {ppublish},
  Title = {Compression of next-generation sequencing reads aided by highly efficient de novo assembly},
  Volume = {40},
  Year = {2012},
  Bdsk-Url-1 = {http://dx.doi.org/10.1093/nar/gks754}}
% Perez and Granger (2007): the IPython paper. The key is the numeric identifier
% from the original export and is kept so existing citations still resolve.
% The journal name regains the ampersand that the export dropped, and the
% initials are space-separated so BibTeX parses them as two name tokens.
@article{4160251,
  Author = {P{\'e}rez, F. and Granger, B. E.},
  Journal = {Computing in Science \& Engineering},
  Number = {3},
  Pages = {21--29},
  Title = {{IPython}: A System for Interactive Scientific Computing},
  Volume = {9},
  Year = {2007}}
% Luo et al., BMC Bioinformatics 10, 2009: the GAGE gene-set enrichment method.
% Pages holds the article number. The acronym is brace-protected so
% sentence-casing styles keep its capitals.
@article{Luo2009,
  Author = {Luo, Weijun and Friedman, Michael S and Shedden, Kerby and Hankenson, Kurt D and Woolf, Peter J},
  Journal = {BMC Bioinformatics},
  Pages = {161},
  Title = {{GAGE}: generally applicable gene set enrichment for pathway analysis},
  Volume = {10},
  Year = {2009}}
% McDonald and Brown (2013): chapter 12 of "The Performance of Open Source
% Applications", edited by Tavish Armstrong. Pages records a single page number.
@incollection{McDonald2013,
  author        = {Eric McDonald and C. Titus Brown},
  title         = {Working with Big Data in Bioinformatics},
  booktitle     = {The Performance of Open Source Applications},
  editor        = {Tavish Armstrong},
  chapter       = {12},
  pages         = {151},
  publisher     = {lulu.com},
  year          = {2013},
  date-modified = {2014-06-10 21:39:36 +0000},
}
% C. Titus Brown (2012): blog post on Trinity's in silico normalization. Url is a
% figshare DOI link and Bdsk-Url-1 is the blog page itself. The unpublished entry
% type requires a note field, which was missing and made BibTeX warn; one is supplied.
@unpublished{Brown2012blog,
  Author = {C. Titus Brown},
  Date-Modified = {2014-06-18 19:54:14 +0000},
  Note = {Blog post},
  Title = {What does {Trinity's} In Silico normalization do?},
  Url = {http://dx.doi.org/10.6084/m9.figshare.98198},
  Year = {2012},
  Bdsk-Url-1 = {http://ivory.idyll.org/blog/trinity-in-silico-normalize.html}}
% Haas et al., Nature Protocols 8(8), 2013: the Trinity protocol for de novo
% transcript reconstruction from RNA-seq. The abbreviated page range is expanded
% and the technique and tool names are protected against sentence-casing.
@article{Haas2013,
  Author = {Haas, Brian J and Papanicolaou, Alexie and Yassour, Moran and Grabherr, Manfred and Blood, Philip D and Bowden, Joshua and Couger, Matthew Brian and Eccles, David and Li, Bo and Lieber, Matthias and Macmanes, Matthew D and Ott, Michael and Orvis, Joshua and Pochet, Nathalie and Strozzi, Francesco and Weeks, Nathan and Westerman, Rick and William, Thomas and Dewey, Colin N and Henschel, Robert and Leduc, Richard D and Friedman, Nir and Regev, Aviv},
  Journal = {Nat Protoc},
  Month = aug,
  Number = {8},
  Pages = {1494--1512},
  Title = {De novo transcript sequence reconstruction from {RNA-seq} using the {Trinity} platform for reference generation and analysis},
  Volume = {8},
  Year = {2013}}
% Metzker, Nature Reviews Genetics 11(1), 2010: review of next-generation
% sequencing technologies. The glued initials are split so BibTeX treats them as two
% given-name tokens, and the title's trailing period is dropped because styles add
% their own punctuation.
@article{pubmed19997069,
  Author = {Metzker, M. L.},
  Journal = {Nat Rev Genet},
  Number = {1},
  Pages = {31--46},
  Title = {Sequencing technologies --- the next generation},
  Volume = {11},
  Year = {2010}}
% Chitsaz et al., Nature Biotechnology 29(10), 2011: de novo assembly of single-cell
% bacterial genomes. Names are rewritten in "Last, Initials" form with the glued
% initials split, so abbreviating styles no longer drop the second initial. The
% title's trailing period is dropped because styles add their own punctuation.
@article{pubmed21926975,
  Author = {Chitsaz, H. and Yee-Greenbaum, J. L. and Tesler, G. and Lombardo, M. J. and Dupont, C. L. and Badger, J. H. and Novotny, M. and Rusch, D. B. and Fraser, L. J. and Gormley, N. A. and Schulz-Trieglaff, O. and Smith, G. P. and Evers, D. J. and Pevzner, P. A. and Lasken, R. S.},
  Journal = {Nat Biotechnol},
  Number = {10},
  Pages = {915--921},
  Title = {Efficient de novo assembly of single-cell bacterial genomes from short-read data sets},
  Volume = {29},
  Year = {2011}}
% Mackelprang et al., Nature 480(7377), 2011: metagenomic analysis of a permafrost
% microbial community. Month uses the standard macro and the abbreviated page
% range is expanded.
@article{Mackelprang2011,
  Author = {Mackelprang, Rachel and Waldrop, Mark P and DeAngelis, Kristen M and David, Maude M and Chavarria, Krystle L and Blazewicz, Steven J and Rubin, Edward M and Jansson, Janet K},
  Journal = {Nature},
  Month = dec,
  Number = {7377},
  Pages = {368--371},
  Title = {Metagenomic analysis of a permafrost microbial community reveals a rapid response to thaw},
  Volume = {480},
  Year = {2011}}
% Kelley, Schatz and Salzberg, Genome Biology 11(11), 2010: the Quake error
% corrector. Pages holds the article identifier (R116). The tool name is
% brace-protected so sentence-casing styles keep its capital.
@article{Kelley2010,
  Author = {Kelley, David R and Schatz, Michael C and Salzberg, Steven L},
  Journal = {Genome Biol},
  Number = {11},
  Pages = {R116},
  Title = {{Quake}: quality-aware detection and correction of sequencing errors},
  Volume = {11},
  Year = {2010}}
% Medvedev, Scott, Kakaradov and Pevzner, Bioinformatics 27(13), 2011: error
% correction for sequencing data with non-uniform coverage. The page numbers carry
% an "i" prefix; the abbreviated range is expanded with the prefix repeated.
@article{Medvedev2011,
  Author = {Medvedev, Paul and Scott, Eric and Kakaradov, Boyko and Pevzner, Pavel},
  Journal = {Bioinformatics},
  Month = jul,
  Number = {13},
  Pages = {i137--i141},
  Title = {Error correction of high-throughput sequencing datasets with non-uniform coverage},
  Volume = {27},
  Year = {2011}}
% Brown et al. (2012): the digital normalization preprint, arXiv 1203.4802.
% Journal and Pages are left as they were so output under classic styles does not
% change; Eprint and Archiveprefix add the identifier in machine-readable form for
% eprint-aware styles. Month uses the standard macro instead of a quoted number.
@article{Brown2012,
  Archiveprefix = {arXiv},
  Author = {C. Titus Brown and Adina Howe and Qingpeng Zhang and Alexis B. Pyrkosz and Timothy H. Brom},
  Date-Modified = {2014-06-10 21:23:40 +0000},
  Eprint = {1203.4802},
  Journal = {arXiv},
  Month = mar,
  Pages = {1203.4802},
  Title = {A Reference-Free Algorithm for Computational Normalization of Shotgun Sequencing Data},
  Year = {2012}}
% Howe et al. (2012): preprint on Illumina sequencing artifacts, arXiv 1212.0159.
% The key says 2013 but the Year field is 2012; the key is kept so existing
% citations still resolve. Journal and Pages are left as they were so output under
% classic styles does not change; Eprint and Archiveprefix add the identifier in
% machine-readable form for eprint-aware styles.
@article{adina2013,
  Archiveprefix = {arXiv},
  Author = {Adina Chuang Howe and Jason Pell and Rosangela Canino-Koning and Rachel Mackelprang and Susannah Tringe and Janet Jansson and James M. Tiedje and C. Titus Brown},
  Date-Modified = {2014-06-10 21:26:01 +0000},
  Eprint = {1212.0159},
  Journal = {arXiv},
  Pages = {1212.0159},
  Title = {{Illumina} Sequencing Artifacts Revealed by Connectivity Analysis of Metagenomic Datasets},
  Year = {2012}}
% Deorowicz, Debudaj-Grabysz and Grabowski, BMC Bioinformatics 14(1), 2013:
% disk-based k-mer counting. Pages holds the article number. "PC" is
% brace-protected so sentence-casing styles keep its capitals.
@article{Deorowicz2013,
  Author = {Deorowicz, Sebastian and Debudaj-Grabysz, Agnieszka and Grabowski, Szymon},
  Journal = {BMC Bioinformatics},
  Month = may,
  Number = {1},
  Pages = {160},
  Title = {Disk-based k-mer counting on a {PC}},
  Volume = {14},
  Year = {2013}}
% Minoche, Dohm and Himmelbauer, Genome Biology 12(11), 2011: evaluation of Illumina
% sequencing data. Pages holds the article identifier (R112). Vendor and instrument
% names are capitalised and brace-protected so sentence-casing styles keep them.
@article{Minoche2011,
  Author = {Minoche, Andr{\'e} E and Dohm, Juliane C and Himmelbauer, Heinz},
  Journal = {Genome Biol},
  Number = {11},
  Pages = {R112},
  Title = {Evaluation of genomic high-throughput sequencing data generated on {Illumina} {HiSeq} and {Genome} {Analyzer} systems},
  Volume = {12},
  Year = {2011}}
% Rizk, Lavenier and Chikhi, Bioinformatics 29(5), 2013: DSK, low-memory k-mer
% counting. The abbreviated page range is expanded and the tool name is
% brace-protected so sentence-casing styles keep its capitals.
@article{Rizk2013,
  Author = {Rizk, Guillaume and Lavenier, Dominique and Chikhi, Rayan},
  Journal = {Bioinformatics},
  Month = mar,
  Number = {5},
  Pages = {652--653},
  Title = {{DSK}: k-mer counting with very low memory usage},
  Volume = {29},
  Year = {2013}}
% Pell et al., PNAS 109(33), 2012: scaling metagenome assembly with probabilistic
% de Bruijn graphs. The abbreviated page range is expanded and the proper noun is
% brace-protected so sentence-casing styles keep its capital.
@article{Pell2012,
  Author = {Pell, Jason and Hintze, Arend and Canino-Koning, Rosangela and Howe, Adina and Tiedje, James M and Brown, C Titus},
  Journal = {Proc Natl Acad Sci U S A},
  Month = aug,
  Number = {33},
  Pages = {13272--13277},
  Title = {Scaling metagenome sequence assembly with probabilistic de {Bruijn} graphs},
  Volume = {109},
  Year = {2012}}
% Broder and Mitzenmacher: survey of network applications of Bloom filters,
% Internet Mathematics 1(4). The leading "Survey:" label duplicated the closing
% words of the title and has been dropped.
% NOTE(review): the entry keyed broder2004network earlier in this file has the same
% journal, volume, number and pages but gives the year as 2004; it looks like a
% duplicate of this work. Confirm the year and which key the manuscript cites.
@article{BroderM03,
  Author = {Andrei Z. Broder and Michael Mitzenmacher},
  Journal = {Internet Mathematics},
  Number = {4},
  Pages = {485--509},
  Title = {Network Applications of {Bloom} Filters: A Survey},
  Volume = {1},
  Year = {2003}}
% Bloom, Communications of the ACM 13(7), 1970: the paper that introduced what is
% now called the Bloom filter. The page range uses the double hyphen BibTeX
% expects for an en dash.
@article{Bloom70,
  Author = {Burton H. Bloom},
  Journal = {Commun. ACM},
  Number = {7},
  Pages = {422--426},
  Title = {Space/Time Trade-offs in Hash Coding with Allowable Errors},
  Volume = {13},
  Year = {1970}}
% Li et al., Genome Research 20(2), 2010: de novo assembly of human genomes from
% short reads. Month uses the standard macro and the abbreviated page range is expanded.
@article{Li2010,
  Author = {Li, Ruiqiang and Zhu, Hongmei and Ruan, Jue and Qian, Wubin and Fang, Xiaodong and Shi, Zhongbin and Li, Yingrui and Li, Shengting and Shan, Gao and Kristiansen, Karsten and Li, Songgang and Yang, Huanming and Wang, Jian and Wang, Jun},
  Journal = {Genome Res},
  Month = feb,
  Number = {2},
  Pages = {265--272},
  Title = {De novo assembly of human genomes with massively parallel short read sequencing},
  Volume = {20},
  Year = {2010}}
% Simpson et al., Genome Research 19(6), 2009: the ABySS parallel assembler.
% The abbreviated page range is expanded and the tool name is brace-protected so
% sentence-casing styles keep its mixed capitals.
@article{Simpson2009,
  Author = {Simpson, Jared T and Wong, Kim and Jackman, Shaun D and Schein, Jacqueline E and Jones, Steven J M and Birol, Inan{\c c}},
  Journal = {Genome Res},
  Month = jun,
  Number = {6},
  Pages = {1117--1123},
  Title = {{ABySS}: a parallel assembler for short read sequence data},
  Volume = {19},
  Year = {2009}}
% Butler et al., Genome Research 18(5), 2008: the ALLPATHS assembler.
% The abbreviated page range is expanded and the tool name is brace-protected so
% sentence-casing styles keep its capitals.
@article{Butler2008,
  Author = {Butler, Jonathan and MacCallum, Iain and Kleber, Michael and Shlyakhter, Ilya A and Belmonte, Matthew K and Lander, Eric S and Nusbaum, Chad and Jaffe, David B},
  Journal = {Genome Res},
  Month = may,
  Number = {5},
  Pages = {810--820},
  Title = {{ALLPATHS}: de novo assembly of whole-genome shotgun microreads},
  Volume = {18},
  Year = {2008}}
% Zerbino and Birney, Genome Research 18(5), 2008: the Velvet assembler.
% The abbreviated page range is expanded; the tool name and the proper noun in
% "de Bruijn" are brace-protected so sentence-casing styles keep their capitals.
@article{Zerbino2008,
  Author = {Zerbino, Daniel R and Birney, Ewan},
  Journal = {Genome Res},
  Month = may,
  Number = {5},
  Pages = {821--829},
  Title = {{Velvet}: algorithms for de novo short read assembly using de {Bruijn} graphs},
  Volume = {18},
  Year = {2008}}
% Pevzner, Tang and Waterman, PNAS 98(17), 2001: the Eulerian path approach to
% fragment assembly. The abbreviated page range is expanded; "Eulerian" and "DNA"
% are brace-protected so sentence-casing styles keep their capitals.
@article{Pevzner2001,
  Author = {Pevzner, P A and Tang, H and Waterman, M S},
  Journal = {Proc Natl Acad Sci U S A},
  Month = aug,
  Number = {17},
  Pages = {9748--9753},
  Title = {An {Eulerian} path approach to {DNA} fragment assembly},
  Volume = {98},
  Year = {2001}}
% Marcais and Kingsford, Bioinformatics 27(6), 2011: the Jellyfish paper on
% lock-free parallel k-mer counting. The cedilla is wrapped as a BibTeX special
% character so sorting and label generation treat it as one letter. The title loses
% its whole-title double braces and trailing period so styles control casing and
% punctuation.
@article{Marcais2011,
  Author = {Mar{\c{c}}ais, Guillaume and Kingsford, Carl},
  Journal = {Bioinformatics},
  Number = {6},
  Pages = {764--770},
  Title = {A fast, lock-free approach for efficient parallel counting of occurrences of k-mers},
  Volume = {27},
  Year = {2011}}
% NOTE(review): the export had merged same-surname authors into "Li, Songgang Shaochuan Shengting"
% and "Wang, Jun Jian". They are split back into individual authors here, positioned per the
% published author list -- confirm the order against the publisher record. The consortium
% author is still omitted, as in the original entry.
@article{Qin2010,
Author = {Qin, Junjie and Li, Ruiqiang and Raes, Jeroen and Arumugam, Manimozhiyan and Burgdorf, Kristoffer Solvsten and Manichanh, Chaysavanh and Nielsen, Trine and Pons, Nicolas and Levenez, Florence and Yamada, Takuji and Mende, Daniel R and Li, Junhua and Xu, Junming and Li, Shaochuan and Li, Dongfang and Cao, Jianjun and Wang, Bo and Liang, Huiqing and Zheng, Huisong and Xie, Yinlong and Tap, Julien and Lepage, Patricia and Bertalan, Marcelo and Batto, Jean-Michel and Hansen, Torben and {Le Paslier}, Denis and Linneberg, Allan and Nielsen, H Bj{\o}rn and Pelletier, Eric and Renault, Pierre and Sicheritz-Ponten, Thomas and Turner, Keith and Zhu, Hongmei and Yu, Chang and Li, Shengting and Jian, Min and Zhou, Yan and Li, Yingrui and Zhang, Xiuqing and Li, Songgang and Qin, Nan and Yang, Huanming and Wang, Jian and Brunak, S{\o}ren and Dor{\'e}, Joel and Guarner, Francisco and Kristiansen, Karsten and Pedersen, Oluf and Parkhill, Julian and Weissenbach, Jean and Bork, Peer and Ehrlich, S Dusko and Wang, Jun},
Journal = {Nature},
Number = {7285},
Pages = {59--65},
Title = {A human gut microbial gene catalogue established by metagenomic sequencing},
Volume = {464},
Year = {2010}}
@article{Kurtz2008,
Author = {Kurtz, Stefan and Narechania, Apurva and Stein, Joshua C and Ware, Doreen},
Journal = {BMC Genomics},
Number = {1},
Pages = {517},
Title = {A new method to compute {K-mer} frequencies and its application to annotate large repetitive plant genomes},
Volume = {9},
Year = {2008}}
@article{Shi2010,
Author = {Shi, Haixiang and Schmidt, Bertil and Liu, Weiguo and M{\"u}ller-Wittig, Wolfgang},
Journal = {Journal of Computational Biology},
Number = {4},
Pages = {603--615},
Title = {A parallel algorithm for error correction in high-throughput short-read data on {CUDA}-enabled graphics hardware},
Volume = {17},
Year = {2010}}
% NOTE(review): the export had Pages = {8}, which is a page count. PLoS ONE uses article
% numbers; e9841 is believed to be the one for this paper -- confirm against the DOI record.
@article{Davenport2010,
Author = {Davenport, Colin F and T{\"u}mmler, Burkhard},
Journal = {PLoS ONE},
Number = {3},
Pages = {e9841},
Title = {Abundant Oligonucleotides Common to Most Bacteria},
Volume = {5},
Year = {2010}}
@article{Cormode2005,
Author = {Cormode, Graham and Muthukrishnan, S},
Journal = {Journal of Algorithms},
Month = apr,
Number = {1},
Pages = {58--75},
Title = {An improved data stream summary: the count-min sketch and its applications},
Volume = {55},
Year = {2005}}
@article{Healy2003,
Author = {Healy, John and Thomas, Elizabeth E and Schwartz, Jacob T and Wigler, Michael},
Journal = {Genome Research},
Number = {10},
Pages = {2306--2315},
Title = {Annotating large genomes with exact word matches},
Volume = {13},
Year = {2003}}
@article{Miller2010,
Author = {Miller, Jason R and Koren, Sergey and Sutton, Granger},
Journal = {Genomics},
Month = jun,
Number = {6},
Pages = {315--327},
Title = {Assembly algorithms for next-generation sequencing data},
Volume = {95},
Year = {2010}}
% NOTE(review): the export had Journal = {Memory} and no year. This is the RANDOM 2002 workshop
% paper, so it is recorded as a proceedings contribution; the venue details and the fifth
% author were filled in from the published record -- confirm. The citation key is left
% unchanged so existing cite commands keep working.
@inproceedings{Bar-yossef,
Author = {Bar-Yossef, Ziv and Jayram, T S and Kumar, Ravi and Sivakumar, D and Trevisan, Luca},
Booktitle = {Randomization and Approximation Techniques in Computer Science ({RANDOM} 2002)},
Pages = {1--10},
Publisher = {Springer},
Series = {Lecture Notes in Computer Science},
Title = {Counting distinct elements in a data stream},
Volume = {2483},
Year = {2002}}
@article{Hampson2002,
Author = {Hampson, Steven and Kibler, Dennis and Baldi, Pierre},
Journal = {Bioinformatics},
Number = {4},
Pages = {513--528},
Title = {Distribution patterns of over-represented k-mers in non-coding yeast {DNA}},
Volume = {18},
Year = {2002}}
@article{Sindi2008,
Author = {Sindi, Suzanne S and Hunt, Brian R and Yorke, James A},
Journal = {Physical Review E},
Number = {6},
Pages = {061912},
Title = {Duplication count distributions in {DNA} sequences},
Volume = {78},
Year = {2008}}
@article{Melsted2011,
Author = {Melsted, P{\'a}ll and Pritchard, Jonathan K},
Journal = {BMC Bioinformatics},
Month = jan,
Pages = {333},
Title = {Efficient counting of k-mers in {DNA} sequences using a {Bloom} filter},
Volume = {12},
Year = {2011}}
@article{Hooper2010,
Author = {Hooper, Sean D and Dalevi, Daniel and Pati, Amrita and Mavromatis, Konstantinos and Ivanova, Natalia N and Kyrpides, Nikos C},
Journal = {Bioinformatics},
Number = {3},
Pages = {295--301},
Title = {Estimating {DNA} coverage and abundance in metagenomes using a gamma approximation},
Volume = {26},
Year = {2010}}
% NOTE(review): the export listed only the first author; the two co-authors were restored
% from the published record -- confirm.
@article{Charikar2004,
Author = {Charikar, Moses and Chen, Kevin and Farach-Colton, Martin},
Journal = {Theoretical Computer Science},
Month = jan,
Number = {1},
Pages = {3--15},
Title = {Finding frequent items in data streams},
Volume = {312},
Year = {2004}}
% NOTE(review): Pages = {1--8} looks like a PDF page range rather than the journal's
% article identifier -- confirm against the publisher record.
@article{Trifonov2010,
Author = {Trifonov, Vladimir and Rabadan, Raul},
Journal = {mBio},
Number = {3},
Pages = {1--8},
Title = {Frequency Analysis Techniques for Identification of Viral Genetic Data},
Volume = {1},
Year = {2010}}
@article{Chor2009,
Author = {Chor, Benny and Horn, David and Goldman, Nick and Levy, Yaron and Massingham, Tim},
Journal = {Genome Biology},
Number = {10},
Pages = {R108},
Title = {Genomic {DNA} k-mer spectra: models and modalities},
Volume = {10},
Year = {2009}}
% NOTE(review): the export had Journal = {Building}, a fragment of the title. This is the
% ESA 2006 conference paper, so it is recorded as a proceedings contribution; the venue
% details were filled in from the published record -- confirm.
@inproceedings{Kirsch2006,
Author = {Kirsch, Adam and Mitzenmacher, Michael},
Booktitle = {Algorithms -- {ESA} 2006},
Pages = {456--467},
Publisher = {Springer},
Series = {Lecture Notes in Computer Science},
Title = {Less Hashing, Same Performance: Building a Better {Bloom} Filter},
Volume = {4168},
Year = {2006}}
@article{Hess2011,
Author = {Hess, M and Sczyrba, A and Egan, R and Kim, T W and Chokhawala, H and Schroth, G and Luo, S and Clark, D S and Chen, F and Zhang, T and Mackie, R I and Pennacchio, L A and Tringe, S G and Visel, A and Woyke, T and Wang, Z and Rubin, E M},
Journal = {Science},
Number = {6016},
Pages = {463--467},
Title = {Metagenomic Discovery of Biomass-Degrading Genes and Genomes from Cow Rumen},
Volume = {331},
Year = {2011}}
% NOTE(review): the export had Pages = {12}, which is a page count. It is replaced with the
% PLoS ONE article number -- confirm against the DOI record.
@article{Richter2008,
Author = {Richter, Daniel C and Ott, Felix and Auch, Alexander F and Schmid, Ramona and Huson, Daniel H},
Journal = {PLoS ONE},
Number = {10},
Pages = {e3373},
Title = {{MetaSim}---A Sequencing Simulator for Genomics and Metagenomics},
Volume = {3},
Year = {2008}}
% NOTE(review): Pages = {1--7} looks like a PDF page range; Physical Review E normally
% cites a six-digit article number -- confirm against the publisher record.
@article{Chen2005,
Author = {Chen, Yaw-Hwang and Nyeo, Su-Long and Yeh, Chiung-Yuh},
Journal = {Physical Review E},
Number = {1},
Pages = {1--7},
Title = {Model for the distributions of k-mers in {DNA} sequences},
Volume = {72},
Year = {2005}}
% NOTE(review): the export had Pages = {8}, which is a page count. It is replaced with the
% PLoS ONE article number -- confirm against the DOI record.
@article{Woyke2010,
Author = {Woyke, Tanja and Tighe, Damon and Mavromatis, Konstantinos and Clum, Alicia and Copeland, Alex and Schackwitz, Wendy and Lapidus, Alla and Wu, Dongying and McCutcheon, John P and McDonald, Bradon R and Moran, Nancy A and Bristow, James and Cheng, Jan-Fang},
Journal = {PLoS ONE},
Number = {4},
Pages = {e10314},
Title = {One Bacterial Cell, One Complete Genome},
Volume = {5},
Year = {2010}}
% The export had parsed an affiliation address as two extra authors ("Road, Harry" and
% "Jose, San") and put the truncated venue "PODS '10 P" in the journal field. The bogus
% authors are removed and the entry is recorded as a conference paper.
% NOTE(review): the full proceedings title was filled in from the published record -- confirm.
@inproceedings{Kane2010,
Author = {Kane, Daniel M and Nelson, Jelani and Woodruff, David P},
Booktitle = {Proceedings of the Twenty-Ninth {ACM} {SIGMOD-SIGACT-SIGART} Symposium on Principles of Database Systems ({PODS} '10)},
Pages = {41--52},
Publisher = {ACM},
Title = {An Optimal Algorithm for the Distinct Elements Problem},
Year = {2010}}
@article{Campagna2005,
Author = {Campagna, Davide and Romualdi, Chiara and Vitulo, Nicola and {Del Favero}, Micky and Lexa, Matej and Cannata, Nicola and Valle, Giorgio},
Journal = {Bioinformatics},
Number = {5},
Pages = {582--588},
Title = {{RAP}: a new computer program for de novo identification of repeated sequences in whole genomes},
Volume = {21},
Year = {2005}}
@article{Yang2011,
Author = {Yang, Xiao and Aluru, Srinivas and Dorman, Karin S},
Journal = {BMC Bioinformatics},
Number = {Suppl 1},
Pages = {S52},
Title = {Repeat-aware modeling and correction of short read errors},
Volume = {12},
Year = {2011}}
@article{Rusu2008,
Author = {Rusu, Florin and Dobra, Alin},
Journal = {ACM Transactions on Database Systems},
Month = aug,
Number = {3},
Pages = {1--46},
Title = {Sketches for size of join estimation},
Volume = {33},
Year = {2008}}
@article{Do2008,
Author = {Do, Huy Hoang and Choi, Kwok Pui and Preparata, Franco P and Sung, Wing Kin and Zhang, Louxin},
Journal = {Journal of Computational Biology},
Number = {5},
Pages = {469--487},
Title = {Spectrum-based de novo repeat detection in genomic sequences},
Volume = {15},
Year = {2008}}
@article{Conway2011,
Author = {Conway, Thomas C and Bromage, Andrew J},
Journal = {Bioinformatics},
Month = feb,
Number = {4},
Pages = {479--486},
Title = {Succinct data structures for assembling large genomes},
Volume = {27},
Year = {2011}}
@article{Stein2010,
Author = {Stein, Lincoln D},
Journal = {Genome Biology},
Number = {5},
Pages = {207},
Title = {The case for cloud computing in genome informatics},
Volume = {11},
Year = {2010}}
@article{Sboner2011,
Author = {Sboner, Andrea and Mu, Xinmeng Jasmine and Greenbaum, Dov and Auerbach, Raymond K and Gerstein, Mark B},
Journal = {Genome Biology},
Number = {8},
Pages = {125},
Title = {The real cost of sequencing: higher than you think!},
Volume = {12},
Year = {2011}}
@article{McElroy2012,
Author = {McElroy, Kerensa E and Luciani, Fabio and Thomas, Torsten},
Journal = {BMC Genomics},
Pages = {74},
Title = {{GemSIM}: general, error-model based simulator of next-generation sequencing data},
Volume = {13},
Year = {2012}}