forked from chienchi/FaQCs
-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathfastq.cpp
138 lines (96 loc) · 3.75 KB
/
fastq.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#include <string.h>
#include "fastq.h"
#include <iostream>
using namespace std;
bool next_read(gzFile m_fin, string &m_def, string &m_seq, string &m_quality)
{
// The FASTQ file format is as follows for each record
// (from the Wikipedia entry):
// Line 1 begins with a '@' character and is followed by a sequence identifier and an optional description (like a FASTA title line).
// Line 2 is the raw sequence letters.
// Line 3 begins with a '+' character and is optionally followed by the same sequence identifier (and any description) again.
// Line 4 encodes the quality values for the sequence in Line 2, and must contain the same number of symbols as letters in the sequence.
// Read each line of the fastq file in chunks of size "buffer_len" bytes. As of 2017, most platforms (i.e. Illumina) have a line length
// that is less than 256 bytes. However, a notable exception is the PacBio platform (which can have line lengths greater
// than 1 KB). The value of buffer_len may need to be tuned to the sequencing platform, or, implement an adaptive strategy
// that grows the buffer size to a platform-optimal value.
const int buffer_len = 4096000;
char buffer[buffer_len];
char *ptr = NULL;
/////////////////////////////////////////////////////////////////////////////////
// Read the header
/////////////////////////////////////////////////////////////////////////////////
m_def.clear(); // Start with an emply header
while(true){
if(gzgets(m_fin, buffer, buffer_len) == NULL){
if( gzeof(m_fin) ){
return false;
}
throw __FILE__ ":next_read: Unable to read header";
}
if( ( ptr = strpbrk(buffer, "\n\r") ) != NULL ){
*ptr = '\0';
m_def += buffer;
break;
}
m_def += buffer;
}
/////////////////////////////////////////////////////////////////////////////////
// Read the sequence
/////////////////////////////////////////////////////////////////////////////////
m_seq.clear(); // Start with an emply sequence
while(true){
if(gzgets(m_fin, buffer, buffer_len) == NULL){
cerr << "Error in read: " << m_def << endl;
throw __FILE__ ":next_read: Unable to read sequence";
}
if( ( ptr = strpbrk(buffer, "\n\r") ) != NULL ){
*ptr = '\0';
m_seq += buffer;
break;
}
m_seq += buffer;
}
// Skip the '+'. Note that we *don't* actually test to make sure we read a single '+' on a line
// by itself.
if(gzgets(m_fin, buffer, buffer_len) == NULL){
cerr << "Error in read: " << m_def << endl;
throw __FILE__ ":next_read: Unable to read '+'";
}
if( strpbrk(buffer, "\n\r") == NULL ){
cerr << "Error in read: " << m_def << endl;
throw __FILE__ ":next_read: Error reading '+' delimiter";
}
/////////////////////////////////////////////////////////////////////////////////
// Read the quality
/////////////////////////////////////////////////////////////////////////////////
m_quality.clear(); // Start with an emply quality string
while(true){
if(gzgets(m_fin, buffer, buffer_len) == NULL){
cerr << "Error in read: " << m_def << endl;
throw __FILE__ ":next_read: Unable to read quality";
}
if( ( ptr = strpbrk(buffer, "\n\r") ) != NULL ){
*ptr = '\0';
m_quality += buffer;
break;
}
m_quality += buffer;
}
if( m_seq.size() != m_quality.size() ){
cerr << "Error in read: " << m_def << endl;
throw __FILE__ ":next_read: |Sequence| != |Quality|";
}
return true;
}
void write_read(gzFile m_fout, const string &m_def, const string &m_seq,
const string &m_quality)
{
//m_fout << m_def << '\n' << m_seq << "\n+\n" << m_quality << endl;
gzputs( m_fout, m_def.c_str() );
gzputc( m_fout, '\n' );
gzputs( m_fout, m_seq.c_str() );
gzputs( m_fout, "\n+\n" );
gzputs( m_fout, m_quality.c_str() );
gzputc( m_fout, '\n' );
}