Skip to content

Commit e5957cf

Browse files
committedApr 4, 2016
Merge pull request #106 from pachterlab/devel
Update to v0.42.5
2 parents f0678a2 + f48933d commit e5957cf

13 files changed

+578
-45
lines changed
 

‎gulpfile.js

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
var gulp = require('gulp');
2+
var exec = require('child_process').exec;
3+
4+
var cMakeCommand = 'cd build; cmake ..;';
5+
var buildCommand = 'cd build; make;';
6+
7+
var indexCommand = 'build/src/kallisto' +
8+
' index -i test/transcripts.kidx' +
9+
' test/transcripts.fasta.gz';
10+
11+
var pairedEndCommand = 'build/src/kallisto' +
12+
' quant -i test/transcripts.kidx' +
13+
' -b 10' +
14+
' -t 2' +
15+
' -o test/paired_end' +
16+
' test/reads_1.fastq.gz test/reads_2.fastq.gz';
17+
18+
var singleEndCommand = 'build/src/kallisto' +
19+
' quant -i test/transcripts.kidx' +
20+
' -b 10' +
21+
' -t 2' +
22+
' -l 200 -s 3' +
23+
' -o test/single_end' +
24+
' --single' +
25+
' test/reads_1.fastq.gz';
26+
27+
console.log('build command: ' + buildCommand);
28+
29+
gulp.task('watch', function() {
30+
gulp.watch('src/*.cpp', ['build']);
31+
gulp.watch('src/*.h', ['build']);
32+
gulp.watch('src/*.hpp', ['build']);
33+
});
34+
35+
gulp.task('build', ['watch'], function() {
36+
exec(buildCommand, function(error, standardOutput, standardError) {
37+
if (error) {
38+
console.error('There was an error: ' + error);
39+
}
40+
console.log(standardOutput);
41+
console.log(standardError);
42+
});
43+
});
44+
45+
gulp.task('pairedEnd', ['build'], function() {
46+
exec(pairedEndCommand, function(error, standardOut, standardError) {
47+
if (error) {
48+
console.error('There was a pairedEnd error');
49+
}
50+
console.log(standardOut);
51+
console.log(standardError);
52+
});
53+
});
54+
55+
gulp.task('singleEnd', ['build'], function() {
56+
exec(singleEndCommand, function(error, standardOut, standardError) {
57+
if (error) {
58+
console.error('There was a singleEnd error');
59+
}
60+
console.log(standardOut);
61+
console.log(standardError);
62+
});
63+
});
64+
65+
// gulp.task('default', ['install', 'watch'], function() {});
66+
// gulp.task('compile', ['build', 'watch'], function() {});
67+
// gulp.task('pairedEnd', ['compile'], function() {});
68+
// gulp.task('singleEnd', ['compile'], function() {});
69+
gulp.task('default', ['pairedEnd', 'singleEnd'], function() {});

‎src/EMAlgorithm.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ struct EMAlgorithm {
152152
}
153153

154154
//std::cout << chcount << std::endl;
155-
if (chcount == 0) {
155+
if (chcount == 0 && i > min_rounds) {
156156

157157
stopEM=true;
158158
}
@@ -275,7 +275,7 @@ struct EMAlgorithm {
275275
}
276276
}
277277

278-
std::cout << sum_big << " " << count_big << " " << n << std::endl;
278+
//std::cout << sum_big << " " << count_big << " " << n << std::endl;
279279

280280
std::copy(em_start.alpha_before_zeroes_.begin(), em_start.alpha_before_zeroes_.end(),
281281
alpha_.begin());

‎src/MinCollector.cpp

+31
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,37 @@ void MinCollector::loadCounts(ProgramOptions& opt) {
241241

242242
}
243243

244+
void MinCollector::write(const std::string& pseudoprefix) const {
245+
std::string ecfilename = pseudoprefix + ".ec";
246+
std::string countsfilename = pseudoprefix + ".tsv";
247+
248+
std::ofstream ecof, countsof;
249+
ecof.open(ecfilename.c_str(), std::ios::out);
250+
// output equivalence classes in the form "EC TXLIST";
251+
for (int i = 0; i < index.ecmap.size(); i++) {
252+
ecof << i << "\t";
253+
// output the rest of the class
254+
const auto &v = index.ecmap[i];
255+
bool first = true;
256+
for (auto x : v) {
257+
if (!first) {
258+
ecof << ",";
259+
} else {
260+
first = false;
261+
}
262+
ecof << x;
263+
}
264+
ecof << "\n";
265+
}
266+
ecof.close();
267+
268+
countsof.open(countsfilename.c_str(), std::ios::out);
269+
for (int i = 0; i < counts.size(); i++) {
270+
countsof << i << "\t" << counts[i] << "\n";
271+
}
272+
countsof.close();
273+
}
274+
244275
double MinCollector::get_mean_frag_len() const {
245276
if (has_mean_fl) {
246277
return mean_fl;

‎src/MinCollector.h

+4
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,17 @@ struct MinCollector {
5454
int findEC(const std::vector<int>& u) const;
5555

5656

57+
// deprecated
5758
void write(std::ostream& o) {
5859
for (int id = 0; id < counts.size(); id++) {
5960
o << id << "\t" << counts[id] << "\n";
6061
}
6162
}
63+
void write(const std::string& index_out) const;
64+
6265
void loadCounts(ProgramOptions& opt);
6366

67+
6468
bool countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired);
6569
bool countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired, std::vector<int>& biasOut) const;
6670

‎src/PlaintextWriter.cpp

+61
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,64 @@ void plaintext_aux(
111111

112112
of.close();
113113
}
114+
115+
116+
void writeBatchMatrix(
117+
const std::string &prefix,
118+
const KmerIndex &index,
119+
const std::vector<std::string> &ids,
120+
std::vector<std::vector<int>> &counts) {
121+
122+
std::string ecfilename = prefix + ".ec";
123+
std::string countsfilename = prefix + ".tsv";
124+
125+
std::ofstream ecof, countsof;
126+
ecof.open(ecfilename.c_str(), std::ios::out);
127+
// output equivalence classes in the form "EC TXLIST";
128+
for (int i = 0; i < index.ecmap.size(); i++) {
129+
ecof << i << "\t";
130+
// output the rest of the class
131+
const auto &v = index.ecmap[i];
132+
bool first = true;
133+
for (auto x : v) {
134+
if (!first) {
135+
ecof << ",";
136+
} else {
137+
first = false;
138+
}
139+
ecof << x;
140+
}
141+
ecof << "\n";
142+
}
143+
ecof.close();
144+
145+
countsof.open(countsfilename.c_str(), std::ios::out);
146+
for (int j = 0; j < ids.size(); j++) {
147+
countsof << "\t" << ids[j];
148+
}
149+
countsof << "\n";
150+
if (!counts.empty()) {
151+
// write out the NxM matrix, N is # of ecs, M is number of samples
152+
int M = counts.size();
153+
int N = 0;
154+
for (int j = 0; j < M; j++) {
155+
if (N < counts[j].size()) {
156+
N = counts[j].size();
157+
}
158+
}
159+
160+
for (int i = 0; i < N; i++) {
161+
countsof << i;
162+
for (int j = 0; j < M; j++) {
163+
if (counts[j].size() <= i) {
164+
countsof << "\t0";
165+
} else {
166+
countsof << "\t" << counts[j][i];
167+
}
168+
}
169+
countsof << "\n";
170+
}
171+
}
172+
countsof.close();
173+
174+
}

‎src/PlaintextWriter.h

+8
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#include <string>
1010
#include <vector>
1111

12+
#include "KmerIndex.h"
13+
1214
void plaintext_writer(
1315
const std::string& out_name,
1416
const std::vector<std::string>& targ_ids,
@@ -30,4 +32,10 @@ void plaintext_aux(
3032
const std::string& start_time,
3133
const std::string& call);
3234

35+
void writeBatchMatrix(
36+
const std::string &prefix,
37+
const KmerIndex &index,
38+
const std::vector<std::string> &ids,
39+
std::vector<std::vector<int>> &counts);
40+
3341
#endif

‎src/ProcessReads.cpp

+50-34
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ bool isSubset(const std::vector<int>& x, const std::vector<int>& y) {
5353
int ProcessReads(KmerIndex& index, const ProgramOptions& opt, MinCollector& tc) {
5454

5555
int limit = 1048576;
56-
char *buf = new char[limit];
5756
std::vector<std::pair<const char*, int>> seqs;
5857
seqs.reserve(limit/50);
5958

@@ -208,10 +207,29 @@ ReadProcessor::ReadProcessor(const KmerIndex& index, const ProgramOptions& opt,
208207
clear();
209208
}
210209

210+
ReadProcessor::ReadProcessor(ReadProcessor && o) :
211+
paired(o.paired),
212+
tc(o.tc),
213+
index(o.index),
214+
mp(o.mp),
215+
bufsize(o.bufsize),
216+
numreads(o.numreads),
217+
seqs(std::move(o.seqs)),
218+
names(std::move(o.names)),
219+
quals(std::move(o.quals)),
220+
newEcs(std::move(o.newEcs)),
221+
flens(std::move(o.flens)),
222+
bias5(std::move(o.bias5)),
223+
counts(std::move(o.counts)) {
224+
buffer = o.buffer;
225+
o.buffer = nullptr;
226+
o.bufsize = 0;
227+
}
228+
211229
ReadProcessor::~ReadProcessor() {
212-
if (buffer) {
213-
/*delete[] buffer;
214-
buffer = nullptr;*/
230+
if (buffer != nullptr) {
231+
delete[] buffer;
232+
buffer = nullptr;
215233
}
216234
}
217235

@@ -307,17 +325,12 @@ void ReadProcessor::processBuffer() {
307325
// collect the target information
308326
int ec = -1;
309327
int r = tc.intersectKmers(v1, v2, !paired,u);
310-
if (u.empty()) {
311-
continue;
312-
} else {
313-
ec = tc.findEC(u);
314-
}
315328

316329
/* -- possibly modify the pseudoalignment -- */
317330

318-
// If we have paired end reads where one end maps, check if some transcsripts
331+
// If we have paired end reads where one end maps or single end reads, check if some transcsripts
319332
// are not compatible with the mean fragment length
320-
if (paired && !u.empty() && (v1.empty() || v2.empty()) && tc.has_mean_fl) {
333+
if (!u.empty() && (!paired || v1.empty() || v2.empty()) && tc.has_mean_fl) {
321334
vtmp.clear();
322335
// inspect the positions
323336
int fl = (int) tc.get_mean_frag_len();
@@ -365,33 +378,36 @@ void ReadProcessor::processBuffer() {
365378
}
366379
}
367380

368-
// count the pseudoalignment
369-
if (ec == -1 || ec >= counts.size()) {
370-
// something we haven't seen before
371-
newEcs.push_back(u);
372-
} else {
373-
// add to count vector
374-
++counts[ec];
375-
}
376-
381+
// find the ec
382+
if (!u.empty()) {
383+
ec = tc.findEC(u);
377384

385+
// count the pseudoalignment
386+
if (ec == -1 || ec >= counts.size()) {
387+
// something we haven't seen before
388+
newEcs.push_back(u);
389+
} else {
390+
// add to count vector
391+
++counts[ec];
392+
}
378393

379-
/* -- collect extra information -- */
380-
// collect bias info
381-
if (findBias && !u.empty() && biasgoal > 0) {
382-
// collect sequence specific bias info
383-
if (tc.countBias(s1, (paired) ? s2 : nullptr, v1, v2, paired, bias5)) {
384-
biasgoal--;
394+
/* -- collect extra information -- */
395+
// collect bias info
396+
if (findBias && !u.empty() && biasgoal > 0) {
397+
// collect sequence specific bias info
398+
if (tc.countBias(s1, (paired) ? s2 : nullptr, v1, v2, paired, bias5)) {
399+
biasgoal--;
400+
}
385401
}
386-
}
387402

388-
// collect fragment length info
389-
if (findFragmentLength && flengoal > 0 && paired && 0 <= ec && ec < index.num_trans && !v1.empty() && !v2.empty()) {
390-
// try to map the reads
391-
int tl = index.mapPair(s1, l1, s2, l2, ec);
392-
if (0 < tl && tl < flens.size()) {
393-
flens[tl]++;
394-
flengoal--;
403+
// collect fragment length info
404+
if (findFragmentLength && flengoal > 0 && paired && 0 <= ec && ec < index.num_trans && !v1.empty() && !v2.empty()) {
405+
// try to map the reads
406+
int tl = index.mapPair(s1, l1, s2, l2, ec);
407+
if (0 < tl && tl < flens.size()) {
408+
flens[tl]++;
409+
flengoal--;
410+
}
395411
}
396412
}
397413

‎src/ProcessReads.h

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class MasterProcessor {
7878
class ReadProcessor {
7979
public:
8080
ReadProcessor(const KmerIndex& index, const ProgramOptions& opt, const MinCollector& tc, MasterProcessor& mp);
81+
ReadProcessor(ReadProcessor && o);
8182
~ReadProcessor();
8283
char *buffer;
8384
size_t bufsize;

‎src/PseudoBam.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ void outputPseudoBam(const KmerIndex &index, const std::vector<int> &u,
2929
//o << seq1->name.s << "" << seq1->seq.s << "\t" << seq1->qual.s << "\n";
3030
//o << seq2->name.s << "\t141\t*\t0\t0\t*\t*\t0\t0\t" << seq2->seq.s << "\t" << seq2->qual.s << "\n";
3131
} else {
32-
printf("%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", n1,s2,q1);
32+
printf("%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", n1,s1,q1);
3333
}
3434
} else {
3535
if (paired) {

‎src/common.h

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#ifndef KALLISTO_COMMON_H
22
#define KALLISTO_COMMON_H
33

4-
#define KALLISTO_VERSION "0.42.4"
4+
#define KALLISTO_VERSION "0.42.5"
55

66
#include <string>
77
#include <vector>
@@ -20,6 +20,10 @@ struct ProgramOptions {
2020
int min_range;
2121
int bootstrap;
2222
std::vector<std::string> transfasta;
23+
bool batch_mode;
24+
std::string batch_file_name;
25+
std::vector<std::vector<std::string>> batch_files;
26+
std::vector<std::string> batch_ids;
2327
std::vector<std::string> files;
2428
bool plaintext;
2529
bool write_index;
@@ -42,6 +46,7 @@ ProgramOptions() :
4246
sd(0.0),
4347
min_range(1),
4448
bootstrap(0),
49+
batch_mode(false),
4550
plaintext(false),
4651
write_index(false),
4752
single_end(false),

‎src/h5utils.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ char* vec_to_ptr(const std::vector<std::string>& v) {
1212
max_len += 1;
1313
// allocate a contiguous block of memory
1414
char *pool = new char[max_len * v.size()];
15+
memset(pool,0,max_len * v.size());
1516
char *ptr = pool;
1617

1718
for (size_t i = 0; i < v.size(); ++i, ptr += max_len) {

‎src/main.cpp

+340-3
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,92 @@ void ParseOptionsEMOnly(int argc, char **argv, ProgramOptions& opt) {
305305
}
306306
}
307307

308+
void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
309+
int verbose_flag = 0;
310+
int single_flag = 0;
311+
int strand_flag = 0;
312+
int pbam_flag = 0;
313+
314+
const char *opt_string = "t:i:l:s:o:b:";
315+
static struct option long_options[] = {
316+
// long args
317+
{"verbose", no_argument, &verbose_flag, 1},
318+
{"single", no_argument, &single_flag, 1},
319+
//{"strand-specific", no_argument, &strand_flag, 1},
320+
{"pseudobam", no_argument, &pbam_flag, 1},
321+
{"batch", required_argument, 0, 'b'},
322+
// short args
323+
{"threads", required_argument, 0, 't'},
324+
{"index", required_argument, 0, 'i'},
325+
{"fragment-length", required_argument, 0, 'l'},
326+
{"sd", required_argument, 0, 's'},
327+
{"output-dir", required_argument, 0, 'o'},
328+
{0,0,0,0}
329+
};
330+
int c;
331+
int option_index = 0;
332+
while (true) {
333+
c = getopt_long(argc,argv,opt_string, long_options, &option_index);
334+
335+
if (c == -1) {
336+
break;
337+
}
338+
339+
switch (c) {
340+
case 0:
341+
break;
342+
case 't': {
343+
stringstream(optarg) >> opt.threads;
344+
break;
345+
}
346+
case 'i': {
347+
opt.index = optarg;
348+
break;
349+
}
350+
case 'l': {
351+
stringstream(optarg) >> opt.fld;
352+
break;
353+
}
354+
case 's': {
355+
stringstream(optarg) >> opt.sd;
356+
break;
357+
}
358+
case 'o': {
359+
opt.output = optarg;
360+
break;
361+
}
362+
case 'b': {
363+
opt.batch_mode = true;
364+
opt.batch_file_name = optarg;
365+
break;
366+
}
367+
default: break;
368+
}
369+
}
370+
371+
// all other arguments are fast[a/q] files to be read
372+
for (int i = optind; i < argc; i++) {
373+
opt.files.push_back(argv[i]);
374+
}
375+
376+
if (verbose_flag) {
377+
opt.verbose = true;
378+
}
379+
380+
if (single_flag) {
381+
opt.single_end = true;
382+
}
383+
384+
if (strand_flag) {
385+
opt.strand_specific = true;
386+
}
387+
388+
if (pbam_flag) {
389+
opt.pseudobam = true;
390+
}
391+
}
392+
393+
308394
void ParseOptionsH5Dump(int argc, char **argv, ProgramOptions& opt) {
309395
int peek_flag = 0;
310396
const char *opt_string = "o:";
@@ -544,6 +630,181 @@ bool CheckOptionsEM(ProgramOptions& opt, bool emonly = false) {
544630
}
545631

546632

633+
634+
bool CheckOptionsPseudo(ProgramOptions& opt) {
635+
636+
bool ret = true;
637+
638+
cerr << endl;
639+
// check for index
640+
if (opt.index.empty()) {
641+
cerr << ERROR_STR << " kallisto index file missing" << endl;
642+
ret = false;
643+
} else {
644+
struct stat stFileInfo;
645+
auto intStat = stat(opt.index.c_str(), &stFileInfo);
646+
if (intStat != 0) {
647+
cerr << ERROR_STR << " kallisto index file not found " << opt.index << endl;
648+
ret = false;
649+
}
650+
}
651+
652+
// check for read files
653+
if (!opt.batch_mode) {
654+
if (opt.files.size() == 0) {
655+
cerr << ERROR_STR << " Missing read files" << endl;
656+
ret = false;
657+
} else {
658+
struct stat stFileInfo;
659+
for (auto& fn : opt.files) {
660+
auto intStat = stat(fn.c_str(), &stFileInfo);
661+
if (intStat != 0) {
662+
cerr << ERROR_STR << " file not found " << fn << endl;
663+
ret = false;
664+
}
665+
}
666+
}
667+
} else {
668+
if (opt.files.size() != 0) {
669+
cerr << ERROR_STR << " cannot specify batch mode and supply read files" << endl;
670+
ret = false;
671+
} else {
672+
// check for batch files
673+
if (opt.batch_mode) {
674+
struct stat stFileInfo;
675+
auto intstat = stat(opt.batch_file_name.c_str(), &stFileInfo);
676+
if (intstat != 0) {
677+
cerr << ERROR_STR << " file not found " << opt.batch_file_name << endl;
678+
ret = false;
679+
}
680+
// open the file, parse and fill the batch_files values
681+
std::ifstream bfile(opt.batch_file_name);
682+
std::string line;
683+
std::string id,f1,f2;
684+
while (std::getline(bfile,line)) {
685+
if (line.size() == 0) {
686+
continue;
687+
}
688+
std::stringstream ss(line);
689+
ss >> id;
690+
if (id[0] == '#') {
691+
continue;
692+
}
693+
opt.batch_ids.push_back(id);
694+
if (opt.single_end) {
695+
ss >> f1;
696+
opt.batch_files.push_back({f1});
697+
intstat = stat(f1.c_str(), &stFileInfo);
698+
if (intstat != 0) {
699+
cerr << ERROR_STR << " file not found " << f1 << endl;
700+
ret = false;
701+
}
702+
} else {
703+
ss >> f1 >> f2;
704+
opt.batch_files.push_back({f1,f2});
705+
intstat = stat(f1.c_str(), &stFileInfo);
706+
if (intstat != 0) {
707+
cerr << ERROR_STR << " file not found " << f1 << endl;
708+
ret = false;
709+
}
710+
intstat = stat(f2.c_str(), &stFileInfo);
711+
if (intstat != 0) {
712+
cerr << ERROR_STR << " file not found " << f2 << endl;
713+
ret = false;
714+
}
715+
}
716+
}
717+
}
718+
}
719+
}
720+
721+
722+
/*
723+
if (opt.strand_specific && !opt.single_end) {
724+
cerr << "Error: strand-specific mode requires single end mode" << endl;
725+
ret = false;
726+
}*/
727+
728+
if (!opt.single_end) {
729+
if (opt.files.size() % 2 != 0) {
730+
cerr << "Error: paired-end mode requires an even number of input files" << endl
731+
<< " (use --single for processing single-end reads)" << endl;
732+
ret = false;
733+
}
734+
}
735+
736+
if ((opt.fld != 0.0 && opt.sd == 0.0) || (opt.sd != 0.0 && opt.fld == 0.0)) {
737+
cerr << "Error: cannot supply mean/sd without supplying both -l and -s" << endl;
738+
ret = false;
739+
}
740+
741+
if (opt.single_end && (opt.fld == 0.0 || opt.sd == 0.0)) {
742+
cerr << "Error: fragment length mean and sd must be supplied for single-end reads using -l and -s" << endl;
743+
ret = false;
744+
} else if (opt.fld == 0.0 && ret) {
745+
// In the future, if we have single-end data we should require this
746+
// argument
747+
cerr << "[quant] fragment length distribution will be estimated from the data" << endl;
748+
} else if (ret && opt.fld > 0.0 && opt.sd > 0.0) {
749+
cerr << "[quant] fragment length distribution is truncated gaussian with mean = " <<
750+
opt.fld << ", sd = " << opt.sd << endl;
751+
}
752+
753+
if (!opt.single_end && (opt.fld > 0.0 && opt.sd > 0.0)) {
754+
cerr << "[~warn] you specified using a gaussian but have paired end data" << endl;
755+
cerr << "[~warn] we suggest omitting these parameters and let us estimate the distribution from data" << endl;
756+
}
757+
758+
if (opt.fld < 0.0) {
759+
cerr << "Error: invalid value for mean fragment length " << opt.fld << endl;
760+
ret = false;
761+
}
762+
763+
if (opt.sd < 0.0) {
764+
cerr << "Error: invalid value for fragment length standard deviation " << opt.sd << endl;
765+
ret = false;
766+
}
767+
768+
if (opt.output.empty()) {
769+
cerr << "Error: need to specify output directory " << opt.output << endl;
770+
ret = false;
771+
} else {
772+
struct stat stFileInfo;
773+
auto intStat = stat(opt.output.c_str(), &stFileInfo);
774+
if (intStat == 0) {
775+
// file/dir exits
776+
if (!S_ISDIR(stFileInfo.st_mode)) {
777+
cerr << "Error: file " << opt.output << " exists and is not a directory" << endl;
778+
ret = false;
779+
}
780+
} else {
781+
// create directory
782+
if (mkdir(opt.output.c_str(), 0777) == -1) {
783+
cerr << "Error: could not create directory " << opt.output << endl;
784+
ret = false;
785+
}
786+
}
787+
}
788+
789+
if (opt.threads <= 0) {
790+
cerr << "Error: invalid number of threads " << opt.threads << endl;
791+
ret = false;
792+
} else {
793+
unsigned int n = std::thread::hardware_concurrency();
794+
if (n != 0 && n < opt.threads) {
795+
cerr << "Warning: you asked for " << opt.threads
796+
<< ", but only " << n << " cores on the machine" << endl;
797+
}
798+
if (opt.threads > 1 && opt.pseudobam) {
799+
cerr << "Error: pseudobam is not compatible with running on many threads."<< endl;
800+
ret = false;
801+
}
802+
}
803+
804+
return ret;
805+
}
806+
807+
547808
bool CheckOptionsInspect(ProgramOptions& opt) {
548809

549810
bool ret = true;
@@ -611,8 +872,11 @@ bool CheckOptionsH5Dump(ProgramOptions& opt) {
611872
}
612873

613874
void PrintCite() {
614-
cout << "The paper describing this software has not been published." << endl;
615-
// cerr << "When using this program in your research, please cite" << endl << endl;
875+
cout << "When using this program in your research, please cite" << endl << endl
876+
<< " Bray, N. L., Pimentel, H., Melsted, P. & Pachter, L." << endl
877+
<< " Near-optimal probabilistic RNA-seq quantification, "<< endl
878+
<< " Nature Biotechnology (2016), doi:10.1038/nbt.3519" << endl
879+
<< endl;
616880
}
617881

618882
void PrintVersion() {
@@ -625,8 +889,10 @@ void usage() {
625889
<< "Where <CMD> can be one of:" << endl << endl
626890
<< " index Builds a kallisto index "<< endl
627891
<< " quant Runs the quantification algorithm " << endl
892+
<< " pseudo Runs the pseudoalignment step " << endl
628893
<< " h5dump Converts HDF5-formatted results to plaintext" << endl
629-
<< " version Prints version information"<< endl << endl
894+
<< " version Prints version information"<< endl
895+
<< " cite Prints citation information" << endl << endl
630896
<< "Running kallisto <CMD> without arguments prints usage information for <CMD>"<< endl << endl;
631897
}
632898

@@ -684,6 +950,28 @@ void usageEM(bool valid_input = true) {
684950

685951
}
686952

953+
void usagePseudo(bool valid_input = true) {
954+
if (valid_input) {
955+
cout << "kallisto " << KALLISTO_VERSION << endl
956+
<< "Computes equivalence classes for reads and quantifies abundances" << endl << endl;
957+
}
958+
959+
cout << "Usage: kallisto pseudo [arguments] FASTQ-files" << endl << endl
960+
<< "Required arguments:" << endl
961+
<< "-i, --index=STRING Filename for the kallisto index to be used for" << endl
962+
<< " pseudoalignment" << endl
963+
<< "-o, --output-dir=STRING Directory to write output to" << endl << endl
964+
<< "Optional arguments:" << endl
965+
<< "-b --batch=FILE Process files listed in FILE" << endl
966+
<< " --single Quantify single-end reads" << endl
967+
<< "-l, --fragment-length=DOUBLE Estimated average fragment length" << endl
968+
<< "-s, --sd=DOUBLE Estimated standard deviation of fragment length" << endl
969+
<< " (default: value is estimated from the input data)" << endl
970+
<< "-t, --threads=INT Number of threads to use (default: 1)" << endl
971+
<< " --pseudobam Output pseudoalignments in SAM format to stdout" << endl;
972+
973+
}
974+
687975
void usageEMOnly() {
688976
cout << "kallisto " << KALLISTO_VERSION << endl
689977
<< "Computes equivalence classes for reads and quantifies abundance" << endl << endl
@@ -997,6 +1285,55 @@ int main(int argc, char *argv[]) {
9971285
}
9981286
cerr << endl;
9991287
}
1288+
} else if (cmd == "pseudo") {
1289+
if (argc==2) {
1290+
usagePseudo();
1291+
return 0;
1292+
}
1293+
ParseOptionsPseudo(argc-1,argv+1,opt);
1294+
if (!CheckOptionsPseudo(opt)) {
1295+
cerr << endl;
1296+
usagePseudo(false);
1297+
exit(1);
1298+
} else {
1299+
// pseudoalign the reads
1300+
KmerIndex index(opt);
1301+
index.load(opt);
1302+
1303+
MinCollector collection(index, opt);
1304+
int num_processed = 0;
1305+
1306+
if (!opt.batch_mode) {
1307+
num_processed = ProcessReads(index, opt, collection);
1308+
collection.write((opt.output + "/pseudoalignments"));
1309+
} else {
1310+
1311+
std::vector<std::vector<int>> batchCounts;
1312+
for (int i = 0; i < opt.batch_ids.size(); i++) {
1313+
std::fill(collection.counts.begin(), collection.counts.end(),0);
1314+
opt.files = opt.batch_files[i];
1315+
num_processed += ProcessReads(index, opt, collection);
1316+
batchCounts.push_back(collection.counts);
1317+
}
1318+
1319+
writeBatchMatrix((opt.output + "/matrix"),index, opt.batch_ids,batchCounts);
1320+
1321+
}
1322+
1323+
std::string call = argv_to_string(argc, argv);
1324+
1325+
plaintext_aux(
1326+
opt.output + "/run_info.json",
1327+
std::string(std::to_string(index.num_trans)),
1328+
std::string(std::to_string(0)), // no bootstraps in pseudo
1329+
std::string(std::to_string(num_processed)),
1330+
KALLISTO_VERSION,
1331+
std::string(std::to_string(index.INDEX_VERSION)),
1332+
start_time,
1333+
call);
1334+
1335+
cerr << endl;
1336+
}
10001337
} else if (cmd == "h5dump") {
10011338

10021339
if (argc == 2) {

‎src/weights.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -134,13 +134,13 @@ std::vector<double> update_eff_lens(
134134
const char* cs = index.target_seqs_[i].c_str();
135135

136136
int hex = hexamerToInt(cs,false);
137-
int fwlimit = (int) (seqlen - means[i] - 6);
137+
int fwlimit = (int) std::max(seqlen - means[i] - 6, 0.0);
138138
for (int j = 0; j < fwlimit; j++) {
139139
dbias5[hex] += contrib;
140140
hex = update_hexamer(hex,*(cs+j+6),false);
141141
}
142142

143-
int bwlimit = (int) (means[i] - 6);
143+
int bwlimit = (int) std::max(means[i] - 6, 0.0);
144144
hex = hexamerToInt(cs+bwlimit,true);
145145
for (int j = bwlimit; j < seqlen - 6; j++) {
146146
dbias5[hex] += contrib;
@@ -165,14 +165,14 @@ std::vector<double> update_eff_lens(
165165

166166
// forward direction
167167
int hex = hexamerToInt(cs,false);
168-
int fwlimit = (int) seqlen - means[i] -6;
168+
int fwlimit = (int) std::max(seqlen - means[i] - 6, 0.0);
169169
for (int j = 0; j < fwlimit; j++) {
170170
//int hex = hexamerToInt(cs+j,false);
171171
//efflen += 0.5*(tc.bias5[hex]/biasDataNorm) / (dbias5[hex]/biasAlphaNorm );
172172
efflen += tc.bias5[hex] / dbias5[hex];
173173
hex = update_hexamer(hex,*(cs+j+6),false);
174174
}
175-
int bwlimit = (int) std::max(means[i]-6,0.0);
175+
int bwlimit = (int) std::max(means[i] - 6 , 0.0);
176176
hex = hexamerToInt(cs+bwlimit,true);
177177
for (int j = bwlimit; j < seqlen - 6; j++) {
178178
efflen += tc.bias5[hex] / dbias5[hex];

0 commit comments

Comments
 (0)
Please sign in to comment.