@@ -305,6 +305,92 @@ void ParseOptionsEMOnly(int argc, char **argv, ProgramOptions& opt) {
305
305
}
306
306
}
307
307
308
+ void ParseOptionsPseudo (int argc, char **argv, ProgramOptions& opt) {
309
+ int verbose_flag = 0 ;
310
+ int single_flag = 0 ;
311
+ int strand_flag = 0 ;
312
+ int pbam_flag = 0 ;
313
+
314
+ const char *opt_string = " t:i:l:s:o:b:" ;
315
+ static struct option long_options[] = {
316
+ // long args
317
+ {" verbose" , no_argument, &verbose_flag, 1 },
318
+ {" single" , no_argument, &single_flag, 1 },
319
+ // {"strand-specific", no_argument, &strand_flag, 1},
320
+ {" pseudobam" , no_argument, &pbam_flag, 1 },
321
+ {" batch" , required_argument, 0 , ' b' },
322
+ // short args
323
+ {" threads" , required_argument, 0 , ' t' },
324
+ {" index" , required_argument, 0 , ' i' },
325
+ {" fragment-length" , required_argument, 0 , ' l' },
326
+ {" sd" , required_argument, 0 , ' s' },
327
+ {" output-dir" , required_argument, 0 , ' o' },
328
+ {0 ,0 ,0 ,0 }
329
+ };
330
+ int c;
331
+ int option_index = 0 ;
332
+ while (true ) {
333
+ c = getopt_long (argc,argv,opt_string, long_options, &option_index);
334
+
335
+ if (c == -1 ) {
336
+ break ;
337
+ }
338
+
339
+ switch (c) {
340
+ case 0 :
341
+ break ;
342
+ case ' t' : {
343
+ stringstream (optarg ) >> opt.threads ;
344
+ break ;
345
+ }
346
+ case ' i' : {
347
+ opt.index = optarg ;
348
+ break ;
349
+ }
350
+ case ' l' : {
351
+ stringstream (optarg ) >> opt.fld ;
352
+ break ;
353
+ }
354
+ case ' s' : {
355
+ stringstream (optarg ) >> opt.sd ;
356
+ break ;
357
+ }
358
+ case ' o' : {
359
+ opt.output = optarg ;
360
+ break ;
361
+ }
362
+ case ' b' : {
363
+ opt.batch_mode = true ;
364
+ opt.batch_file_name = optarg ;
365
+ break ;
366
+ }
367
+ default : break ;
368
+ }
369
+ }
370
+
371
+ // all other arguments are fast[a/q] files to be read
372
+ for (int i = optind ; i < argc; i++) {
373
+ opt.files .push_back (argv[i]);
374
+ }
375
+
376
+ if (verbose_flag) {
377
+ opt.verbose = true ;
378
+ }
379
+
380
+ if (single_flag) {
381
+ opt.single_end = true ;
382
+ }
383
+
384
+ if (strand_flag) {
385
+ opt.strand_specific = true ;
386
+ }
387
+
388
+ if (pbam_flag) {
389
+ opt.pseudobam = true ;
390
+ }
391
+ }
392
+
393
+
308
394
void ParseOptionsH5Dump (int argc, char **argv, ProgramOptions& opt) {
309
395
int peek_flag = 0 ;
310
396
const char *opt_string = " o:" ;
@@ -544,6 +630,181 @@ bool CheckOptionsEM(ProgramOptions& opt, bool emonly = false) {
544
630
}
545
631
546
632
633
+
634
+ bool CheckOptionsPseudo (ProgramOptions& opt) {
635
+
636
+ bool ret = true ;
637
+
638
+ cerr << endl;
639
+ // check for index
640
+ if (opt.index .empty ()) {
641
+ cerr << ERROR_STR << " kallisto index file missing" << endl;
642
+ ret = false ;
643
+ } else {
644
+ struct stat stFileInfo;
645
+ auto intStat = stat (opt.index .c_str (), &stFileInfo);
646
+ if (intStat != 0 ) {
647
+ cerr << ERROR_STR << " kallisto index file not found " << opt.index << endl;
648
+ ret = false ;
649
+ }
650
+ }
651
+
652
+ // check for read files
653
+ if (!opt.batch_mode ) {
654
+ if (opt.files .size () == 0 ) {
655
+ cerr << ERROR_STR << " Missing read files" << endl;
656
+ ret = false ;
657
+ } else {
658
+ struct stat stFileInfo;
659
+ for (auto & fn : opt.files ) {
660
+ auto intStat = stat (fn.c_str (), &stFileInfo);
661
+ if (intStat != 0 ) {
662
+ cerr << ERROR_STR << " file not found " << fn << endl;
663
+ ret = false ;
664
+ }
665
+ }
666
+ }
667
+ } else {
668
+ if (opt.files .size () != 0 ) {
669
+ cerr << ERROR_STR << " cannot specify batch mode and supply read files" << endl;
670
+ ret = false ;
671
+ } else {
672
+ // check for batch files
673
+ if (opt.batch_mode ) {
674
+ struct stat stFileInfo;
675
+ auto intstat = stat (opt.batch_file_name .c_str (), &stFileInfo);
676
+ if (intstat != 0 ) {
677
+ cerr << ERROR_STR << " file not found " << opt.batch_file_name << endl;
678
+ ret = false ;
679
+ }
680
+ // open the file, parse and fill the batch_files values
681
+ std::ifstream bfile (opt.batch_file_name );
682
+ std::string line;
683
+ std::string id,f1,f2;
684
+ while (std::getline (bfile,line)) {
685
+ if (line.size () == 0 ) {
686
+ continue ;
687
+ }
688
+ std::stringstream ss (line);
689
+ ss >> id;
690
+ if (id[0 ] == ' #' ) {
691
+ continue ;
692
+ }
693
+ opt.batch_ids .push_back (id);
694
+ if (opt.single_end ) {
695
+ ss >> f1;
696
+ opt.batch_files .push_back ({f1});
697
+ intstat = stat (f1.c_str (), &stFileInfo);
698
+ if (intstat != 0 ) {
699
+ cerr << ERROR_STR << " file not found " << f1 << endl;
700
+ ret = false ;
701
+ }
702
+ } else {
703
+ ss >> f1 >> f2;
704
+ opt.batch_files .push_back ({f1,f2});
705
+ intstat = stat (f1.c_str (), &stFileInfo);
706
+ if (intstat != 0 ) {
707
+ cerr << ERROR_STR << " file not found " << f1 << endl;
708
+ ret = false ;
709
+ }
710
+ intstat = stat (f2.c_str (), &stFileInfo);
711
+ if (intstat != 0 ) {
712
+ cerr << ERROR_STR << " file not found " << f2 << endl;
713
+ ret = false ;
714
+ }
715
+ }
716
+ }
717
+ }
718
+ }
719
+ }
720
+
721
+
722
+ /*
723
+ if (opt.strand_specific && !opt.single_end) {
724
+ cerr << "Error: strand-specific mode requires single end mode" << endl;
725
+ ret = false;
726
+ }*/
727
+
728
+ if (!opt.single_end ) {
729
+ if (opt.files .size () % 2 != 0 ) {
730
+ cerr << " Error: paired-end mode requires an even number of input files" << endl
731
+ << " (use --single for processing single-end reads)" << endl;
732
+ ret = false ;
733
+ }
734
+ }
735
+
736
+ if ((opt.fld != 0.0 && opt.sd == 0.0 ) || (opt.sd != 0.0 && opt.fld == 0.0 )) {
737
+ cerr << " Error: cannot supply mean/sd without supplying both -l and -s" << endl;
738
+ ret = false ;
739
+ }
740
+
741
+ if (opt.single_end && (opt.fld == 0.0 || opt.sd == 0.0 )) {
742
+ cerr << " Error: fragment length mean and sd must be supplied for single-end reads using -l and -s" << endl;
743
+ ret = false ;
744
+ } else if (opt.fld == 0.0 && ret) {
745
+ // In the future, if we have single-end data we should require this
746
+ // argument
747
+ cerr << " [quant] fragment length distribution will be estimated from the data" << endl;
748
+ } else if (ret && opt.fld > 0.0 && opt.sd > 0.0 ) {
749
+ cerr << " [quant] fragment length distribution is truncated gaussian with mean = " <<
750
+ opt.fld << " , sd = " << opt.sd << endl;
751
+ }
752
+
753
+ if (!opt.single_end && (opt.fld > 0.0 && opt.sd > 0.0 )) {
754
+ cerr << " [~warn] you specified using a gaussian but have paired end data" << endl;
755
+ cerr << " [~warn] we suggest omitting these parameters and let us estimate the distribution from data" << endl;
756
+ }
757
+
758
+ if (opt.fld < 0.0 ) {
759
+ cerr << " Error: invalid value for mean fragment length " << opt.fld << endl;
760
+ ret = false ;
761
+ }
762
+
763
+ if (opt.sd < 0.0 ) {
764
+ cerr << " Error: invalid value for fragment length standard deviation " << opt.sd << endl;
765
+ ret = false ;
766
+ }
767
+
768
+ if (opt.output .empty ()) {
769
+ cerr << " Error: need to specify output directory " << opt.output << endl;
770
+ ret = false ;
771
+ } else {
772
+ struct stat stFileInfo;
773
+ auto intStat = stat (opt.output .c_str (), &stFileInfo);
774
+ if (intStat == 0 ) {
775
+ // file/dir exits
776
+ if (!S_ISDIR (stFileInfo.st_mode )) {
777
+ cerr << " Error: file " << opt.output << " exists and is not a directory" << endl;
778
+ ret = false ;
779
+ }
780
+ } else {
781
+ // create directory
782
+ if (mkdir (opt.output .c_str (), 0777 ) == -1 ) {
783
+ cerr << " Error: could not create directory " << opt.output << endl;
784
+ ret = false ;
785
+ }
786
+ }
787
+ }
788
+
789
+ if (opt.threads <= 0 ) {
790
+ cerr << " Error: invalid number of threads " << opt.threads << endl;
791
+ ret = false ;
792
+ } else {
793
+ unsigned int n = std::thread::hardware_concurrency ();
794
+ if (n != 0 && n < opt.threads ) {
795
+ cerr << " Warning: you asked for " << opt.threads
796
+ << " , but only " << n << " cores on the machine" << endl;
797
+ }
798
+ if (opt.threads > 1 && opt.pseudobam ) {
799
+ cerr << " Error: pseudobam is not compatible with running on many threads." << endl;
800
+ ret = false ;
801
+ }
802
+ }
803
+
804
+ return ret;
805
+ }
806
+
807
+
547
808
bool CheckOptionsInspect (ProgramOptions& opt) {
548
809
549
810
bool ret = true ;
@@ -611,8 +872,11 @@ bool CheckOptionsH5Dump(ProgramOptions& opt) {
611
872
}
612
873
613
874
// Writes the citation for the kallisto paper to standard output.
void PrintCite() {
  static const char kCitation[] =
      "When using this program in your research, please cite\n"
      "\n"
      "  Bray, N. L., Pimentel, H., Melsted, P. & Pachter, L.\n"
      "  Near-optimal probabilistic RNA-seq quantification, \n"
      "  Nature Biotechnology (2016), doi:10.1038/nbt.3519\n";
  // std::endl supplies the closing blank line and flushes the stream.
  std::cout << kCitation << std::endl;
}
617
881
618
882
void PrintVersion () {
@@ -625,8 +889,10 @@ void usage() {
625
889
<< " Where <CMD> can be one of:" << endl << endl
626
890
<< " index Builds a kallisto index " << endl
627
891
<< " quant Runs the quantification algorithm " << endl
892
+ << " pseudo Runs the pseudoalignment step " << endl
628
893
<< " h5dump Converts HDF5-formatted results to plaintext" << endl
629
- << " version Prints version information" << endl << endl
894
+ << " version Prints version information" << endl
895
+ << " cite Prints citation information" << endl << endl
630
896
<< " Running kallisto <CMD> without arguments prints usage information for <CMD>" << endl << endl;
631
897
}
632
898
@@ -684,6 +950,28 @@ void usageEM(bool valid_input = true) {
684
950
685
951
}
686
952
953
+ void usagePseudo (bool valid_input = true ) {
954
+ if (valid_input) {
955
+ cout << " kallisto " << KALLISTO_VERSION << endl
956
+ << " Computes equivalence classes for reads and quantifies abundances" << endl << endl;
957
+ }
958
+
959
+ cout << " Usage: kallisto pseudo [arguments] FASTQ-files" << endl << endl
960
+ << " Required arguments:" << endl
961
+ << " -i, --index=STRING Filename for the kallisto index to be used for" << endl
962
+ << " pseudoalignment" << endl
963
+ << " -o, --output-dir=STRING Directory to write output to" << endl << endl
964
+ << " Optional arguments:" << endl
965
+ << " -b --batch=FILE Process files listed in FILE" << endl
966
+ << " --single Quantify single-end reads" << endl
967
+ << " -l, --fragment-length=DOUBLE Estimated average fragment length" << endl
968
+ << " -s, --sd=DOUBLE Estimated standard deviation of fragment length" << endl
969
+ << " (default: value is estimated from the input data)" << endl
970
+ << " -t, --threads=INT Number of threads to use (default: 1)" << endl
971
+ << " --pseudobam Output pseudoalignments in SAM format to stdout" << endl;
972
+
973
+ }
974
+
687
975
void usageEMOnly () {
688
976
cout << " kallisto " << KALLISTO_VERSION << endl
689
977
<< " Computes equivalence classes for reads and quantifies abundance" << endl << endl
@@ -997,6 +1285,55 @@ int main(int argc, char *argv[]) {
997
1285
}
998
1286
cerr << endl;
999
1287
}
1288
+ } else if (cmd == " pseudo" ) {
1289
+ if (argc==2 ) {
1290
+ usagePseudo ();
1291
+ return 0 ;
1292
+ }
1293
+ ParseOptionsPseudo (argc-1 ,argv+1 ,opt);
1294
+ if (!CheckOptionsPseudo (opt)) {
1295
+ cerr << endl;
1296
+ usagePseudo (false );
1297
+ exit (1 );
1298
+ } else {
1299
+ // pseudoalign the reads
1300
+ KmerIndex index (opt);
1301
+ index .load (opt);
1302
+
1303
+ MinCollector collection (index , opt);
1304
+ int num_processed = 0 ;
1305
+
1306
+ if (!opt.batch_mode ) {
1307
+ num_processed = ProcessReads (index , opt, collection);
1308
+ collection.write ((opt.output + " /pseudoalignments" ));
1309
+ } else {
1310
+
1311
+ std::vector<std::vector<int >> batchCounts;
1312
+ for (int i = 0 ; i < opt.batch_ids .size (); i++) {
1313
+ std::fill (collection.counts .begin (), collection.counts .end (),0 );
1314
+ opt.files = opt.batch_files [i];
1315
+ num_processed += ProcessReads (index , opt, collection);
1316
+ batchCounts.push_back (collection.counts );
1317
+ }
1318
+
1319
+ writeBatchMatrix ((opt.output + " /matrix" ),index , opt.batch_ids ,batchCounts);
1320
+
1321
+ }
1322
+
1323
+ std::string call = argv_to_string (argc, argv);
1324
+
1325
+ plaintext_aux (
1326
+ opt.output + " /run_info.json" ,
1327
+ std::string (std::to_string (index .num_trans )),
1328
+ std::string (std::to_string (0 )), // no bootstraps in pseudo
1329
+ std::string (std::to_string (num_processed)),
1330
+ KALLISTO_VERSION,
1331
+ std::string (std::to_string (index .INDEX_VERSION )),
1332
+ start_time,
1333
+ call);
1334
+
1335
+ cerr << endl;
1336
+ }
1000
1337
} else if (cmd == " h5dump" ) {
1001
1338
1002
1339
if (argc == 2 ) {
0 commit comments