Merge pull request #106 from pachterlab/devel

pmelsted · pmelsted · commit e5957cf96f02 · 2016-04-04T21:43:15.000Z
Update to v0.42.5
diff --git a/gulpfile.js b/gulpfile.js
@@ -0,0 +1,69 @@
+var gulp = require('gulp');
+var exec = require('child_process').exec;
+
+var cMakeCommand = 'cd build; cmake ..;';
+var buildCommand = 'cd build; make;';
+
+var indexCommand = 'build/src/kallisto' +
+  ' index -i test/transcripts.kidx' +
+  ' test/transcripts.fasta.gz';
+
+var pairedEndCommand = 'build/src/kallisto' +
+  ' quant -i test/transcripts.kidx' +
+  ' -b 10' +
+  ' -t 2' +
+  ' -o test/paired_end' +
+  ' test/reads_1.fastq.gz test/reads_2.fastq.gz';
+
+var singleEndCommand = 'build/src/kallisto' +
+  ' quant -i test/transcripts.kidx' +
+  ' -b 10' +
+  ' -t 2' +
+  ' -l 200 -s 3' +
+  ' -o test/single_end' +
+  ' --single' +
+  ' test/reads_1.fastq.gz';
+
+console.log('build command: ' + buildCommand);
+
+gulp.task('watch', function() {
+  gulp.watch('src/*.cpp', ['build']);
+  gulp.watch('src/*.h', ['build']);
+  gulp.watch('src/*.hpp', ['build']);
+});
+
+gulp.task('build', ['watch'], function() {
+  exec(buildCommand, function(error, standardOutput, standardError) {
+    if (error) {
+      console.error('There was an error: ' + error);
+    }
+    console.log(standardOutput);
+    console.log(standardError);
+  });
+});
+
+gulp.task('pairedEnd', ['build'], function() {
+  exec(pairedEndCommand, function(error, standardOut, standardError) {
+    if (error) {
+      console.error('There was a pairedEnd error');
+    }
+    console.log(standardOut);
+    console.log(standardError);
+  });
+});
+
+gulp.task('singleEnd', ['build'], function() {
+  exec(singleEndCommand, function(error, standardOut, standardError) {
+    if (error) {
+      console.error('There was a singleEnd error');
+    }
+    console.log(standardOut);
+    console.log(standardError);
+  });
+});
+
+// gulp.task('default', ['install', 'watch'], function() {});
+// gulp.task('compile', ['build', 'watch'], function() {});
+// gulp.task('pairedEnd', ['compile'], function() {});
+// gulp.task('singleEnd', ['compile'], function() {});
+gulp.task('default', ['pairedEnd', 'singleEnd'], function() {});
diff --git a/src/EMAlgorithm.h b/src/EMAlgorithm.h
@@ -152,7 +152,7 @@ struct EMAlgorithm {
       }
 
       //std::cout << chcount << std::endl;
-      if (chcount == 0) {
+      if (chcount == 0 && i > min_rounds) {
 
         stopEM=true;
       }
@@ -275,7 +275,7 @@ struct EMAlgorithm {
       }
     }
 
-    std::cout << sum_big << " " << count_big << " " << n << std::endl;
+    //std::cout << sum_big << " " << count_big << " " << n << std::endl;
 
     std::copy(em_start.alpha_before_zeroes_.begin(), em_start.alpha_before_zeroes_.end(),
         alpha_.begin());
diff --git a/src/MinCollector.cpp b/src/MinCollector.cpp
@@ -241,6 +241,37 @@ void MinCollector::loadCounts(ProgramOptions& opt) {
 
 }
 
+// Writes the pseudoalignment results as two tab-separated text files:
+//   <pseudoprefix>.ec  - one line per equivalence class: "EC<TAB>TXLIST",
+//                        TXLIST being the comma-separated entries of the class
+//   <pseudoprefix>.tsv - one line per entry of counts:   "EC<TAB>count"
+void MinCollector::write(const std::string& pseudoprefix) const {
+  std::ofstream ecStream((pseudoprefix + ".ec").c_str(), std::ios::out);
+  for (size_t ec = 0; ec < index.ecmap.size(); ++ec) {
+    ecStream << ec << "\t";
+    // the separator is empty before the first entry and "," afterwards
+    const char *separator = "";
+    for (auto tx : index.ecmap[ec]) {
+      ecStream << separator << tx;
+      separator = ",";
+    }
+    ecStream << "\n";
+  }
+  ecStream.close();
+
+  std::ofstream countStream((pseudoprefix + ".tsv").c_str(), std::ios::out);
+  for (size_t ec = 0; ec < counts.size(); ++ec) {
+    countStream << ec << "\t" << counts[ec] << "\n";
+  }
+  countStream.close();
+}
+
 double MinCollector::get_mean_frag_len() const {
   if (has_mean_fl) {
     return mean_fl;
diff --git a/src/MinCollector.h b/src/MinCollector.h
@@ -54,13 +54,17 @@ struct MinCollector {
   int findEC(const std::vector<int>& u) const;
 
 
+  // deprecated
+  // Writes one "id<TAB>count" line to o for every entry of counts.
   void write(std::ostream& o) {
     for (int id = 0; id < counts.size(); id++) {
       o << id << "\t" << counts[id] << "\n";
     }
   }
+  // Writes the equivalence classes to <index_out>.ec and the counts to
+  // <index_out>.tsv (index_out is used as a file name prefix).
+  void write(const std::string& index_out) const;
+
   void loadCounts(ProgramOptions& opt);
 
+
   bool countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired);
   bool countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired, std::vector<int>& biasOut) const;
 
diff --git a/src/PlaintextWriter.cpp b/src/PlaintextWriter.cpp
@@ -111,3 +111,64 @@ void plaintext_aux(
 
   of.close();
 }
+
+
+// Writes the result of a batch run as two text files:
+//   <prefix>.ec  - "EC<TAB>TXLIST" for every equivalence class in index.ecmap
+//   <prefix>.tsv - a header row with the tab-prefixed sample ids, followed by
+//                  one row per equivalence class holding one count per sample
+// counts[j] is the count vector of sample j; samples whose vector is shorter
+// than the longest one are padded with 0.
+void writeBatchMatrix(
+  const std::string &prefix,
+  const KmerIndex &index,
+  const std::vector<std::string> &ids,
+  std::vector<std::vector<int>> &counts) {
+
+    std::ofstream ecStream((prefix + ".ec").c_str(), std::ios::out);
+    for (size_t ec = 0; ec < index.ecmap.size(); ++ec) {
+      ecStream << ec << "\t";
+      // the separator is empty before the first entry and "," afterwards
+      const char *separator = "";
+      for (auto tx : index.ecmap[ec]) {
+        ecStream << separator << tx;
+        separator = ",";
+      }
+      ecStream << "\n";
+    }
+    ecStream.close();
+
+    std::ofstream matrixStream((prefix + ".tsv").c_str(), std::ios::out);
+    for (const auto &id : ids) {
+      matrixStream << "\t" << id;
+    }
+    matrixStream << "\n";
+
+    // the matrix has as many rows as the longest count vector (none if counts is empty)
+    size_t numRows = 0;
+    for (const auto &sample : counts) {
+      if (numRows < sample.size()) {
+        numRows = sample.size();
+      }
+    }
+
+    for (size_t row = 0; row < numRows; ++row) {
+      matrixStream << row;
+      for (const auto &sample : counts) {
+        if (row < sample.size()) {
+          matrixStream << "\t" << sample[row];
+        } else {
+          matrixStream << "\t0";
+        }
+      }
+      matrixStream << "\n";
+    }
+    matrixStream.close();
+
+}
diff --git a/src/PlaintextWriter.h b/src/PlaintextWriter.h
@@ -9,6 +9,8 @@
 #include <string>
 #include <vector>
 
+#include "KmerIndex.h"
+
 void plaintext_writer(
     const std::string& out_name,
     const std::vector<std::string>& targ_ids,
@@ -30,4 +32,10 @@ void plaintext_aux(
     const std::string& start_time,
     const std::string& call);
 
+void writeBatchMatrix(
+  const std::string &prefix,
+  const KmerIndex &index,
+  const std::vector<std::string> &ids,
+  std::vector<std::vector<int>> &counts);
+
 #endif
diff --git a/src/ProcessReads.cpp b/src/ProcessReads.cpp
@@ -53,7 +53,6 @@ bool isSubset(const std::vector<int>& x, const std::vector<int>& y) {
 int ProcessReads(KmerIndex& index, const ProgramOptions& opt, MinCollector& tc) {
 
   int limit = 1048576;
-  char *buf = new char[limit];
   std::vector<std::pair<const char*, int>> seqs;
   seqs.reserve(limit/50);
 
@@ -208,10 +207,29 @@ ReadProcessor::ReadProcessor(const KmerIndex& index, const ProgramOptions& opt,
    clear();
 }
 
+// Move constructor: initializes paired, tc, index, mp, bufsize and numreads
+// from o, moves o's containers, and takes over o's raw buffer pointer.
+ReadProcessor::ReadProcessor(ReadProcessor && o) :
+  paired(o.paired),
+  tc(o.tc),
+  index(o.index),
+  mp(o.mp),
+  bufsize(o.bufsize),
+  numreads(o.numreads),
+  seqs(std::move(o.seqs)),
+  names(std::move(o.names)),
+  quals(std::move(o.quals)),
+  newEcs(std::move(o.newEcs)),
+  flens(std::move(o.flens)),
+  bias5(std::move(o.bias5)),
+  counts(std::move(o.counts)) {
+    buffer = o.buffer;
+    // o no longer owns the buffer: null its pointer so that ~ReadProcessor()
+    // on o does not delete[] the memory now held by this object
+    o.buffer = nullptr;
+    o.bufsize = 0;
+}
+
 ReadProcessor::~ReadProcessor() {
-  if (buffer) {
-      /*delete[] buffer;
-    buffer = nullptr;*/
+  if (buffer != nullptr) {
+      delete[] buffer;
+      buffer = nullptr;
   }
 }
 
@@ -307,17 +325,12 @@ void ReadProcessor::processBuffer() {
     // collect the target information
     int ec = -1;
     int r = tc.intersectKmers(v1, v2, !paired,u);
-    if (u.empty()) {
-      continue;
-    } else {
-      ec = tc.findEC(u);
-    }
 
     /* --  possibly modify the pseudoalignment  -- */
 
-    // If we have paired end reads where one end maps, check if some transcsripts
+    // If we have paired end reads where one end maps or single end reads, check if some transcsripts
     // are not compatible with the mean fragment length
-    if (paired && !u.empty() && (v1.empty() || v2.empty()) && tc.has_mean_fl) {
+    if (!u.empty() && (!paired || v1.empty() || v2.empty()) && tc.has_mean_fl) {
       vtmp.clear();
       // inspect the positions
       int fl = (int) tc.get_mean_frag_len();
@@ -365,33 +378,36 @@ void ReadProcessor::processBuffer() {
       }
     }
 
-    // count the pseudoalignment
-    if (ec == -1 || ec >= counts.size()) {
-      // something we haven't seen before
-      newEcs.push_back(u);
-    } else {
-      // add to count vector
-      ++counts[ec];
-    }
-
+    // find the ec
+    if (!u.empty()) {
+      ec = tc.findEC(u);
 
+      // count the pseudoalignment
+      if (ec == -1 || ec >= counts.size()) {
+        // something we haven't seen before
+        newEcs.push_back(u);
+      } else {
+        // add to count vector
+        ++counts[ec];
+      }
 
-    /* -- collect extra information -- */
-    // collect bias info
-    if (findBias && !u.empty() && biasgoal > 0) {
-      // collect sequence specific bias info
-      if (tc.countBias(s1, (paired) ? s2 : nullptr, v1, v2, paired, bias5)) {
-        biasgoal--;
+      /* -- collect extra information -- */
+      // collect bias info
+      if (findBias && !u.empty() && biasgoal > 0) {
+        // collect sequence specific bias info
+        if (tc.countBias(s1, (paired) ? s2 : nullptr, v1, v2, paired, bias5)) {
+          biasgoal--;
+        }
       }
-    }
 
-    // collect fragment length info
-    if (findFragmentLength && flengoal > 0 && paired && 0 <= ec &&  ec < index.num_trans && !v1.empty() && !v2.empty()) {
-      // try to map the reads
-      int tl = index.mapPair(s1, l1, s2, l2, ec);
-      if (0 < tl && tl < flens.size()) {
-        flens[tl]++;
-        flengoal--;
+      // collect fragment length info
+      if (findFragmentLength && flengoal > 0 && paired && 0 <= ec &&  ec < index.num_trans && !v1.empty() && !v2.empty()) {
+        // try to map the reads
+        int tl = index.mapPair(s1, l1, s2, l2, ec);
+        if (0 < tl && tl < flens.size()) {
+          flens[tl]++;
+          flengoal--;
+        }
       }
     }
 
diff --git a/src/ProcessReads.h b/src/ProcessReads.h
@@ -78,6 +78,7 @@ class MasterProcessor {
 class ReadProcessor {
 public:
   ReadProcessor(const KmerIndex& index, const ProgramOptions& opt, const MinCollector& tc, MasterProcessor& mp);
+  ReadProcessor(ReadProcessor && o);
   ~ReadProcessor();
   char *buffer;
   size_t bufsize;
diff --git a/src/PseudoBam.cpp b/src/PseudoBam.cpp
@@ -29,7 +29,7 @@ void outputPseudoBam(const KmerIndex &index, const std::vector<int> &u,
       //o << seq1->name.s << "" << seq1->seq.s << "\t" << seq1->qual.s << "\n";
       //o << seq2->name.s << "\t141\t*\t0\t0\t*\t*\t0\t0\t" << seq2->seq.s << "\t" << seq2->qual.s << "\n";
     } else {
-      printf("%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", n1,s2,q1);
+      printf("%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", n1,s1,q1);
     }
   } else {
     if (paired) {
diff --git a/src/common.h b/src/common.h
@@ -1,7 +1,7 @@
 #ifndef KALLISTO_COMMON_H
 #define KALLISTO_COMMON_H
 
-#define KALLISTO_VERSION "0.42.4"
+#define KALLISTO_VERSION "0.42.5"
 
 #include <string>
 #include <vector>
@@ -20,6 +20,10 @@ struct ProgramOptions {
   int min_range;
   int bootstrap;
   std::vector<std::string> transfasta;
+  bool batch_mode;
+  std::string batch_file_name;
+  std::vector<std::vector<std::string>> batch_files;
+  std::vector<std::string> batch_ids;
   std::vector<std::string> files;
   bool plaintext;
   bool write_index;
@@ -42,6 +46,7 @@ ProgramOptions() :
   sd(0.0),
   min_range(1),
   bootstrap(0),
+  batch_mode(false),
   plaintext(false),
   write_index(false),
   single_end(false),
diff --git a/src/h5utils.cpp b/src/h5utils.cpp
diff --git a/src/main.cpp b/src/main.cpp
diff --git a/src/weights.cpp b/src/weights.cpp

Original file line number	Diff line number	Diff line change
`@@ -152,7 +152,7 @@ struct EMAlgorithm {`
`152`	`152`	`}`
`153`	`153`
`154`	`154`	`//std::cout << chcount << std::endl;`
`155`		`- if (chcount == 0) {`
	`155`	`+ if (chcount == 0 && i > min_rounds) {`
`156`	`156`
`157`	`157`	`stopEM=true;`
`158`	`158`	`}`
`@@ -275,7 +275,7 @@ struct EMAlgorithm {`
`275`	`275`	`}`
`276`	`276`	`}`
`277`	`277`
`278`		`- std::cout << sum_big << " " << count_big << " " << n << std::endl;`
	`278`	`+ //std::cout << sum_big << " " << count_big << " " << n << std::endl;`
`279`	`279`
`280`	`280`	`std::copy(em_start.alpha_before_zeroes_.begin(), em_start.alpha_before_zeroes_.end(),`
`281`	`281`	`alpha_.begin());`
Original file line number	Diff line number	Diff line change
`@@ -54,13 +54,17 @@ struct MinCollector {`
`54`	`54`	`int findEC(const std::vector<int>& u) const;`
`55`	`55`
`56`	`56`
	`57`	`+ // deprecated`
`57`	`58`	`void write(std::ostream& o) {`
`58`	`59`	`for (int id = 0; id < counts.size(); id++) {`
`59`	`60`	`o << id << "\t" << counts[id] << "\n";`
`60`	`61`	`}`
`61`	`62`	`}`
	`63`	`+ void write(const std::string& index_out) const;`
	`64`	`+`
`62`	`65`	`void loadCounts(ProgramOptions& opt);`
`63`	`66`
	`67`	`+`
`64`	`68`	`bool countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired);`
`65`	`69`	`bool countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired, std::vector<int>& biasOut) const;`
`66`	`70`
Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ void outputPseudoBam(const KmerIndex &index, const std::vector<int> &u,`
`29`	`29`	`//o << seq1->name.s << "" << seq1->seq.s << "\t" << seq1->qual.s << "\n";`
`30`	`30`	`//o << seq2->name.s << "\t141\t*\t0\t0\t*\t*\t0\t0\t" << seq2->seq.s << "\t" << seq2->qual.s << "\n";`
`31`	`31`	`} else {`
`32`		`- printf("%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", n1,s2,q1);`
	`32`	`+ printf("%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", n1,s1,q1);`
`33`	`33`	`}`
`34`	`34`	`} else {`
`35`	`35`	`if (paired) {`
-Original file line number
+Diff line change
   max_len += 1;
   // allocate a contiguous block of memory
   char *pool = new char[max_len * v.size()];
 +  memset(pool,0,max_len * v.size());
   char *ptr = pool;
   for (size_t i = 0; i < v.size(); ++i, ptr += max_len) {
-Original file line number
+Diff line change
+  }
+}
 +// Parses the command line of the `pseudo` command into opt.
 +// Options taking a value (-t, -i, -l, -s, -o, -b) are stored directly in opt;
 +// the flag-style long options are collected in local ints and copied into
 +// opt after the loop. Every remaining argument is appended to opt.files.
 +void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
 +  int verbose_flag = 0;
 +  int single_flag = 0;
 +  // never set at the moment: the --strand-specific entry below is commented out
 +  int strand_flag = 0;
 +  int pbam_flag = 0;
++
 +  const char *opt_string = "t:i:l:s:o:b:";
 +  static struct option long_options[] = {
 +    // long args
 +    {"verbose", no_argument, &verbose_flag, 1},
 +    {"single", no_argument, &single_flag, 1},
 +    //{"strand-specific", no_argument, &strand_flag, 1},
 +    {"pseudobam", no_argument, &pbam_flag, 1},
 +    {"batch", required_argument, 0, 'b'},
 +    // short args
 +    {"threads", required_argument, 0, 't'},
 +    {"index", required_argument, 0, 'i'},
 +    {"fragment-length", required_argument, 0, 'l'},
 +    {"sd", required_argument, 0, 's'},
 +    {"output-dir", required_argument, 0, 'o'},
 +    {0,0,0,0}
 +  };
 +  int c;
 +  int option_index = 0;
 +  while (true) {
 +    c = getopt_long(argc,argv,opt_string, long_options, &option_index);
++
 +    // -1 signals that all options have been consumed
 +    if (c == -1) {
 +      break;
 +    }
++
 +    switch (c) {
 +    // a long option that only stored 1 into its flag variable
 +    case 0:
 +      break;
 +    case 't': {
 +      stringstream(optarg) >> opt.threads;
 +      break;
 +    }
 +    case 'i': {
 +      opt.index = optarg;
 +      break;
 +    }
 +    case 'l': {
 +      stringstream(optarg) >> opt.fld;
 +      break;
 +    }
 +    case 's': {
 +      stringstream(optarg) >> opt.sd;
 +      break;
 +    }
 +    case 'o': {
 +      opt.output = optarg;
 +      break;
 +    }
 +    case 'b': {
 +      opt.batch_mode = true;
 +      opt.batch_file_name = optarg;
 +      break;
 +    }
 +    // unrecognized options are ignored here
 +    default: break;
 +    }
 +  }
++
 +  // all other arguments are fast[a/q] files to be read
 +  for (int i = optind; i < argc; i++) {
 +    opt.files.push_back(argv[i]);
 +  }
++
 +  if (verbose_flag) {
 +    opt.verbose = true;
 +  }
++
 +  if (single_flag) {
 +    opt.single_end = true;
 +  }
++
 +  if (strand_flag) {
 +    opt.strand_specific = true;
 +  }
++
 +  if (pbam_flag) {
 +    opt.pseudobam = true;
 +  }
 +}
++
++
 void ParseOptionsH5Dump(int argc, char **argv, ProgramOptions& opt) {
   int peek_flag = 0;
   const char *opt_string = "o:";
+}
++
 +// Validates the options of the `pseudo` command. Every problem found is
 +// reported on stderr; the function returns true only if all checks passed.
 +// Side effects: in batch mode the batch file is parsed into opt.batch_ids and
 +// opt.batch_files, and the output directory is created if it does not exist.
 +bool CheckOptionsPseudo(ProgramOptions& opt) {
++
 +  bool ret = true;
++
 +  cerr << endl;
 +  // check for index
 +  if (opt.index.empty()) {
 +    cerr << ERROR_STR << " kallisto index file missing" << endl;
 +    ret = false;
 +  } else {
 +    struct stat stFileInfo;
 +    auto intStat = stat(opt.index.c_str(), &stFileInfo);
 +    if (intStat != 0) {
 +      cerr << ERROR_STR << " kallisto index file not found " << opt.index << endl;
 +      ret = false;
 +    }
 +  }
++
 +  // check for read files
 +  if (!opt.batch_mode) {
 +    if (opt.files.size() == 0) {
 +      cerr << ERROR_STR << " Missing read files" << endl;
 +      ret = false;
 +    } else {
 +      struct stat stFileInfo;
 +      for (auto& fn : opt.files) {
 +        auto intStat = stat(fn.c_str(), &stFileInfo);
 +        if (intStat != 0) {
 +          cerr << ERROR_STR << " file not found " << fn << endl;
 +          ret = false;
 +        }
 +      }
 +    }
 +  } else {
 +    if (opt.files.size() != 0) {
 +      cerr << ERROR_STR << " cannot specify batch mode and supply read files" << endl;
 +      ret = false;
 +    } else {
 +      // check for batch files
 +      struct stat stFileInfo;
 +      auto intstat = stat(opt.batch_file_name.c_str(), &stFileInfo);
 +      if (intstat != 0) {
 +        cerr << ERROR_STR << " file not found " << opt.batch_file_name << endl;
 +        ret = false;
 +      }
 +      // open the file, parse and fill the batch_files values;
 +      // each line is "ID FILE" (single-end) or "ID FILE1 FILE2" (paired-end)
 +      std::ifstream bfile(opt.batch_file_name);
 +      std::string line;
 +      while (std::getline(bfile,line)) {
 +        // fresh strings per line, so a short line can never inherit a field
 +        // from the previous one
 +        std::string id,f1,f2;
 +        std::stringstream ss(line);
 +        if (!(ss >> id) || id[0] == '#') {
 +          // empty or whitespace-only line, or a comment
 +          continue;
 +        }
 +        if (opt.single_end) {
 +          ss >> f1;
 +        } else {
 +          ss >> f1 >> f2;
 +        }
 +        if (ss.fail()) {
 +          // a file name is missing; skip the line so that batch_ids and
 +          // batch_files stay aligned
 +          cerr << ERROR_STR << " malformed line in batch file "
 +               << opt.batch_file_name << ": " << line << endl;
 +          ret = false;
 +          continue;
 +        }
++
 +        std::vector<std::string> sample_files;
 +        sample_files.push_back(f1);
 +        if (!opt.single_end) {
 +          sample_files.push_back(f2);
 +        }
 +        for (const auto& fn : sample_files) {
 +          intstat = stat(fn.c_str(), &stFileInfo);
 +          if (intstat != 0) {
 +            cerr << ERROR_STR << " file not found " << fn << endl;
 +            ret = false;
 +          }
 +        }
 +        opt.batch_ids.push_back(id);
 +        opt.batch_files.push_back(sample_files);
 +      }
 +    }
 +  }
++
++
 +  /*
 +  if (opt.strand_specific && !opt.single_end) {
 +    cerr << "Error: strand-specific mode requires single end mode" << endl;
 +    ret = false;
 +  }*/
++
 +  if (!opt.single_end) {
 +    if (opt.files.size() % 2 != 0) {
 +      cerr << "Error: paired-end mode requires an even number of input files" << endl
 +           << "       (use --single for processing single-end reads)" << endl;
 +      ret = false;
 +    }
 +  }
++
 +  // -l and -s only make sense together
 +  if ((opt.fld != 0.0 && opt.sd == 0.0) || (opt.sd != 0.0 && opt.fld == 0.0)) {
 +    cerr << "Error: cannot supply mean/sd without supplying both -l and -s" << endl;
 +    ret = false;
 +  }
++
 +  if (opt.single_end && (opt.fld == 0.0 || opt.sd == 0.0)) {
 +    cerr << "Error: fragment length mean and sd must be supplied for single-end reads using -l and -s" << endl;
 +    ret = false;
 +  } else if (opt.fld == 0.0 && ret) {
 +    // In the future, if we have single-end data we should require this
 +    // argument
 +    cerr << "[quant] fragment length distribution will be estimated from the data" << endl;
 +  } else if (ret && opt.fld > 0.0 && opt.sd > 0.0) {
 +    cerr << "[quant] fragment length distribution is truncated gaussian with mean = " <<
 +      opt.fld << ", sd = " << opt.sd << endl;
 +  }
++
 +  if (!opt.single_end && (opt.fld > 0.0 && opt.sd > 0.0)) {
 +    cerr << "[~warn] you specified using a gaussian but have paired end data" << endl;
 +    cerr << "[~warn] we suggest omitting these parameters and let us estimate the distribution from data" << endl;
 +  }
++
 +  if (opt.fld < 0.0) {
 +    cerr << "Error: invalid value for mean fragment length " << opt.fld << endl;
 +    ret = false;
 +  }
++
 +  if (opt.sd < 0.0) {
 +    cerr << "Error: invalid value for fragment length standard deviation " << opt.sd << endl;
 +    ret = false;
 +  }
++
 +  if (opt.output.empty()) {
 +    cerr << "Error: need to specify output directory " << opt.output << endl;
 +    ret = false;
 +  } else {
 +    struct stat stFileInfo;
 +    auto intStat = stat(opt.output.c_str(), &stFileInfo);
 +    if (intStat == 0) {
 +      // file/dir exists
 +      if (!S_ISDIR(stFileInfo.st_mode)) {
 +        cerr << "Error: file " << opt.output << " exists and is not a directory" << endl;
 +        ret = false;
 +      }
 +    } else {
 +      // create directory
 +      if (mkdir(opt.output.c_str(), 0777) == -1) {
 +        cerr << "Error: could not create directory " << opt.output << endl;
 +        ret = false;
 +      }
 +    }
 +  }
++
 +  if (opt.threads <= 0) {
 +    cerr << "Error: invalid number of threads " << opt.threads << endl;
 +    ret = false;
 +  } else {
 +    // hardware_concurrency() returns 0 when the core count is unknown;
 +    // opt.threads is positive here, so the cast cannot wrap
 +    unsigned int n = std::thread::hardware_concurrency();
 +    if (n != 0 && n < static_cast<unsigned int>(opt.threads)) {
 +      cerr << "Warning: you asked for " << opt.threads
 +           << ", but only " << n << " cores on the machine" << endl;
 +    }
 +    if (opt.threads > 1 && opt.pseudobam) {
 +      cerr << "Error: pseudobam is not compatible with running on many threads."<< endl;
 +      ret = false;
 +    }
 +  }
++
 +  return ret;
 +}
++
++
 bool CheckOptionsInspect(ProgramOptions& opt) {
   bool ret = true;
+}
 void PrintCite() {
 -  cout << "The paper describing this software has not been published." << endl;
 -  //  cerr << "When using this program in your research, please cite" << endl << endl;
 +  cout << "When using this program in your research, please cite" << endl << endl
 +       << "  Bray, N. L., Pimentel, H., Melsted, P. & Pachter, L." << endl
 +       << "  Near-optimal probabilistic RNA-seq quantification, "<< endl
 +       << "  Nature Biotechnology (2016), doi:10.1038/nbt.3519" << endl
 +       << endl;
+}
 void PrintVersion() {
        << "Where <CMD> can be one of:" << endl << endl
        << "    index         Builds a kallisto index "<< endl
        << "    quant         Runs the quantification algorithm " << endl
 +       << "    pseudo        Runs the pseudoalignment step " << endl
        << "    h5dump        Converts HDF5-formatted results to plaintext" << endl
 -       << "    version       Prints version information"<< endl << endl
 +       << "    version       Prints version information"<< endl
 +       << "    cite          Prints citation information" << endl << endl
        << "Running kallisto <CMD> without arguments prints usage information for <CMD>"<< endl << endl;
+}
+}
 +// Prints the help text of `kallisto pseudo` to stdout. The version banner is
 +// printed only when valid_input is true.
 +void usagePseudo(bool valid_input = true) {
 +  if (valid_input) {
 +    cout << "kallisto " << KALLISTO_VERSION << endl;
 +    cout << "Computes equivalence classes for reads and quantifies abundances" << endl << endl;
 +  }
++
 +  cout << "Usage: kallisto pseudo [arguments] FASTQ-files" << endl << endl;
++
 +  cout << "Required arguments:" << endl
 +       << "-i, --index=STRING            Filename for the kallisto index to be used for" << endl
 +       << "                              pseudoalignment" << endl
 +       << "-o, --output-dir=STRING       Directory to write output to" << endl << endl;
++
 +  cout << "Optional arguments:" << endl
 +       << "-b  --batch=FILE              Process files listed in FILE" << endl
 +       << "    --single                  Quantify single-end reads" << endl
 +       << "-l, --fragment-length=DOUBLE  Estimated average fragment length" << endl
 +       << "-s, --sd=DOUBLE               Estimated standard deviation of fragment length" << endl
 +       << "                              (default: value is estimated from the input data)" << endl
 +       << "-t, --threads=INT             Number of threads to use (default: 1)" << endl
 +       << "    --pseudobam               Output pseudoalignments in SAM format to stdout" << endl;
++
 +}
++
 void usageEMOnly() {
   cout << "kallisto " << KALLISTO_VERSION << endl
        << "Computes equivalence classes for reads and quantifies abundance" << endl << endl
+        }
         cerr << endl;
+      }
 +    } else if (cmd == "pseudo") {
 +      if (argc==2) {
 +        usagePseudo();
 +        return 0;
 +      }
 +      ParseOptionsPseudo(argc-1,argv+1,opt);
 +      if (!CheckOptionsPseudo(opt)) {
 +        cerr << endl;
 +        usagePseudo(false);
 +        exit(1);
 +      } else {
 +        // pseudoalign the reads
 +        KmerIndex index(opt);
 +        index.load(opt);
++
 +        MinCollector collection(index, opt);
 +        int num_processed = 0;
++
 +        if (!opt.batch_mode) {
 +          num_processed = ProcessReads(index, opt, collection);
 +          collection.write((opt.output + "/pseudoalignments"));
 +        } else {
++
 +          std::vector<std::vector<int>> batchCounts;
 +          for (int i = 0; i < opt.batch_ids.size(); i++) {
 +            std::fill(collection.counts.begin(), collection.counts.end(),0);
 +            opt.files = opt.batch_files[i];
 +            num_processed += ProcessReads(index, opt, collection);
 +            batchCounts.push_back(collection.counts);
 +          }
++
 +          writeBatchMatrix((opt.output + "/matrix"),index, opt.batch_ids,batchCounts);
++
 +        }
++
 +        std::string call = argv_to_string(argc, argv);
++
 +        plaintext_aux(
 +            opt.output + "/run_info.json",
 +            std::string(std::to_string(index.num_trans)),
 +            std::string(std::to_string(0)), // no bootstraps in pseudo
 +            std::string(std::to_string(num_processed)),
 +            KALLISTO_VERSION,
 +            std::string(std::to_string(index.INDEX_VERSION)),
 +            start_time,
 +            call);
++
 +        cerr << endl;
 +      }
     } else if (cmd == "h5dump") {
       if (argc == 2) {
-Original file line number
+Diff line change
     const char* cs = index.target_seqs_[i].c_str();
     int hex = hexamerToInt(cs,false);
 -    int fwlimit = (int) (seqlen - means[i] - 6);
 +    int fwlimit = (int) std::max(seqlen - means[i] - 6, 0.0);
     for (int j = 0; j < fwlimit; j++) {
       dbias5[hex] += contrib;
       hex = update_hexamer(hex,*(cs+j+6),false);
+    }
 -    int bwlimit = (int) (means[i] - 6);
 +    int bwlimit = (int) std::max(means[i] - 6, 0.0);
     hex = hexamerToInt(cs+bwlimit,true);
     for (int j = bwlimit; j < seqlen - 6; j++) {
       dbias5[hex] += contrib;
       // forward direction
       int hex = hexamerToInt(cs,false);
 -      int fwlimit = (int) seqlen - means[i] -6;
 +      int fwlimit = (int) std::max(seqlen - means[i] - 6, 0.0);
       for (int j = 0; j < fwlimit; j++) {
         //int hex = hexamerToInt(cs+j,false);
         //efflen += 0.5*(tc.bias5[hex]/biasDataNorm) / (dbias5[hex]/biasAlphaNorm );
         efflen += tc.bias5[hex] / dbias5[hex];
         hex = update_hexamer(hex,*(cs+j+6),false);
+      }
 -      int bwlimit = (int) std::max(means[i]-6,0.0);
 +      int bwlimit = (int) std::max(means[i] - 6 , 0.0);
       hex = hexamerToInt(cs+bwlimit,true);
       for (int j = bwlimit; j < seqlen - 6; j++) {
         efflen += tc.bias5[hex] / dbias5[hex];