libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
bafasciifilereader.cpp
Go to the documentation of this file.
1/////////////////////// StdLib includes
2#include <iostream>
3#include <iomanip>
4
5
6/////////////////////// Qt includes
7#include <QDebug>
8#include <QFile>
9#include <QFileInfo>
10
11
12/////////////////////// libpwiz includes
13#include <pwiz/data/msdata/DefaultReaderList.hpp>
14
15
16/////////////////////// Local includes
17#include "bafasciifilereader.h"
18#include "../exception/exceptionnotfound.h"
19#include "../utils.h"
20#include "../types.h"
21#include "../msrun/msrunid.h"
22
23
24namespace pappso
25{
26
// Number of lines, counted from the first line read, that
// BafAsciiFileReader::initialize() parses and checks against the expected
// format; lines read beyond that count are skipped without being checked.
27static const std::size_t CHECKED_LINES_COUNT = 10;
28
30 : MsFileReader{file_name}
31{
32 // To avoid initializing multiple times (costly process), we
33 // only initialize when needed, that is, upon getMsRunIds().
34 // initialize();
35}
36
37
41
// Checks that the file m_fileName looks like a Bruker BAF ASCII export.
//
// The file is opened read-only in text mode and read line by line. Only the
// first CHECKED_LINES_COUNT lines read are examined. Among those, lines that
// are empty, start with '#' or match Utils::endOfLineRegExp are ignored; every
// other line must match the comma-separated layout built below (retention
// time, polarity, source type, MS level, a '-' field, peak shape, m/z range,
// peak count, then the peaks) and its numeric fields must convert properly.
//
// line_count (out): reset to 0, then incremented once for each examined line
// that conforms to the format.
//
// Returns true when no examined line failed the checks and at least one line
// conformed; returns false otherwise, including when the file cannot be
// opened.
//
// NOTE(review): a few source lines are not visible in this listing (the arg()
// arguments of the retention time and m/z range pattern pieces, and the
// statements next to the two final return statements), so they are not
// described here.
42bool
43BafAsciiFileReader::initialize(std::size_t &line_count)
44{
45 // Here we just test some of the lines of the file to check that they comply
46 // with the brukerBafAscii format.
47
48 line_count = 0;
49
50 QFile file(m_fileName);
51
52 if(!file.open(QFile::ReadOnly | QFile::Text))
53 {
54 qDebug() << "Failed to open file" << m_fileName;
55
56 return false;
57 }
58
59 // Construct the regular expression pattern, piecemeal...
60
61 // The retention time as the very first value in the line.
62
63 QString regexp_pattern = QString("^(%1)").arg(
65
66 // The ionization mode (positive or negative)
67 regexp_pattern += QString(",([+-])");
68
 // The source type (ESI or MALDI).
69 regexp_pattern += QString(",(ESI|MALDI)");
70
71 // The MS level (ms1 for full scan mass spectrum)
72 regexp_pattern += QString(",ms(\\d)");
73
74 // Do not know what this is for.
75 regexp_pattern += QString(",(-)");
76
77 // The type of peak (profile or centroid).
78 regexp_pattern += QString(",(profile|line)");
79
80 // The m/z range of the mass spectrum.
81
82 regexp_pattern +=
83 QString(",(%1-%2)")
86
87 // The count of peaks following this element in the remaining of the line.
88
89 regexp_pattern += QString(",(\\d+)");
90
 // Whatever remains of the line: the peaks themselves.
91 regexp_pattern += QString("(.*$)");
92
93 // qDebug() << "The full regexp_pattern:" << regexp_pattern;
94
95 QRegularExpression line_regexp(regexp_pattern);
96
97 QRegularExpressionMatch regexp_match;
98
99 QString line;
100 bool file_reading_failed = false;
101 bool ok = false;
102
103 // Reading, parsing and checking lines is extremely time consuming.
104 // To limit that cost, only the first CHECKED_LINES_COUNT lines read
105 // (counted by iter, whatever their content) are parsed and checked;
106 // the remaining lines are still read but skipped right away. The out
107 // parameter line_count is incremented only for checked lines that
108 // conform to the format, so it never exceeds CHECKED_LINES_COUNT.
109 std::size_t iter = 0;
110
111 while(!file.atEnd())
112 {
113 line = file.readLine().trimmed();
114
115 ++iter;
116 // qDebug() << "Read one line more: (not yet checked)" << iter;
117 if(iter > CHECKED_LINES_COUNT)
118 continue;
119
 // Comment lines and empty lines are not checked against the format.
120 if(line.startsWith('#') || line.isEmpty() ||
121 Utils::endOfLineRegExp.match(line).hasMatch())
122 continue;
123
124 // qDebug() << "Current brukerBafAscii format line " << line_count << ": "
125 // << line.left(30) << " ... " << line.right(30);
126
127 regexp_match = line_regexp.match(line);
128
129 if(regexp_match.hasMatch())
130 {
131 // qDebug() << "The match succeeded.";
132
133 double retention_time = regexp_match.captured(1).toDouble(&ok);
134 if(!ok)
135 {
136 qDebug()
137 << "Failed to extract the retention time of the mass spectrum.";
138
139 file_reading_failed = true;
140
141 break;
142 }
143
144 QString ionization_mode = regexp_match.captured(2);
145 QString source_type = regexp_match.captured(3);
146
147 int ms_level = regexp_match.captured(4).toInt(&ok);
148 if(!ok)
149 {
150 qDebug()
151 << "Failed to extract the MS level of the mass spectrum.";
152
153 file_reading_failed = true;
154
155 break;
156 }
157
 // Capture group 5 is the '-' field, which is not used.
158 QString peak_shape_type = regexp_match.captured(6);
159
160 QString mz_range = regexp_match.captured(7);
161
162 double mz_range_start =
163 mz_range.left(mz_range.indexOf("-")).toDouble(&ok);
164 if(!ok)
165 {
166 qDebug() << "Failed to extract the start of the m/z range.";
167
168 file_reading_failed = true;
169
170 break;
171 }
172
 // NOTE(review): QString::right(n) returns the n rightmost characters, not
 // the text located after position n. The expression below thus isolates
 // the range end only when the end value is exactly one character longer
 // than the start value; mz_range.mid(mz_range.indexOf("-") + 1) looks
 // like the intent -- confirm.
173 double mz_range_end =
174 mz_range.right(mz_range.indexOf("-") + 1).toDouble(&ok);
175 if(!ok)
176 {
177 qDebug() << "Failed to extract the end of the m/z range.";
178
179 file_reading_failed = true;
180
181 break;
182 }
183
184 // qDebug() << qSetRealNumberPrecision(10)
185 // << "mz_range_start: " << mz_range_start
186 // << "mz_range_end: " << mz_range_end;
187
188 int peak_count = regexp_match.captured(8).toInt(&ok);
189 if(!ok)
190 {
191 qDebug() << "Failed to extract the number of peaks in the mass "
192 "spectrum.";
193
194 file_reading_failed = true;
195
196 break;
197 }
198
199 QString peaks = regexp_match.captured(9);
200 QStringList peaks_stringlist = peaks.split(",", Qt::SkipEmptyParts);
201
202 // qDebug() << "The number of peaks:" << peaks_stringlist.size();
203
204 // Sanity check: the number of comma-separated items found after the
 // peak count field must be equal to that peak count.
205 if(peaks_stringlist.size() != peak_count)
206 {
207 // qDebug() << "The number of peaks in the mass spectrum does not
208 // "
209 // "match the advertised one.";
210
211 file_reading_failed = true;
212
213 break;
214 }
215
216 // qDebug() << "The retention time:" << retention_time
217 // << "the ionization mode: " << ionization_mode
218 // << "the source type: " << source_type
219 // << "MS level is:" << ms_level
220 // << "peak shape type: " << peak_shape_type
221 // << "m/z range: " << mz_range << "peak count: " <<
222 // peak_count
223 // << "and peaks: " << peaks.left(100) << " ... "
224 // << peaks.right(100) << "";
225
226 // If we are here, that means that the read line has conformed
227 // to the format expected.
228 ++line_count;
229 // qDebug() << "Checked one line more:" << line_count;
230 }
231 // End of
232 // if(regexp_match.hasMatch())
233 else
234 {
235 qDebug() << "The match failed.";
236 file_reading_failed = true;
237
238 break;
239 }
240 }
241 // End of
242 // while(!file.atEnd())
243
244 file.close();
245
 // Success requires that no checked line failed and that at least one
 // checked line conformed to the format.
246 if(!file_reading_failed && line_count >= 1)
247 {
249 return true;
250 }
251
253
254 // qDebug() << "The number of parsed mass spectra: " << line_count;
255
256 // qDebug() << "Detected file format:"
257 // << Utils::msDataFormatAsString(m_fileFormat)
258 // << "with number of spectra: " << line_count;
259
260 return false;
261}
262
263
269
270
271std::vector<MsRunIdCstSPtr>
272BafAsciiFileReader::getMsRunIds(const QString &run_prefix)
273{
274 std::vector<MsRunIdCstSPtr> ms_run_ids;
275
276 std::size_t ms_data_line_count = 0;
277
278 if(!initialize(ms_data_line_count))
279 return ms_run_ids;
280
281 // Finally create the MsRunId with the file name.
282 MsRunId ms_run_id(m_fileName);
283 ms_run_id.setMsDataFormat(m_fileFormat);
284
285 // We need to set the unambiguous xmlId string.
286 ms_run_id.setXmlId(
287 QString("%1%2").arg(run_prefix).arg(Utils::getLexicalOrderedString(0)));
288
289 // Craft a meaningful sample name because otherwise all the files loaded from
290 // text files will have the same sample name and it will be difficult to
291 // differentiate them.
292 // Orig version:
293 // ms_run_id.setRunId("Single spectrum");
294 // Now the sample name is nothing but the file name without the path.
295
296 QFileInfo file_info(m_fileName);
297
298 // qDebug() << "file name:" << m_fileName;
299
300 QString sample_name = file_info.fileName();
301
302 // qDebug() << "sample name:" << sample_name;
303
304 ms_run_id.setRunId(sample_name);
305
306 // Now set the sample name to the run id:
307
308 ms_run_id.setSampleName(ms_run_id.getRunId());
309
310 // qDebug() << __FILE__ << "@" << __LINE__ << __FUNCTION__ << "()"
311 //<< "Current ms_run_id:" << ms_run_id.toString();
312
313 // Finally make a shared pointer out of it and append it to the vector.
314 ms_run_ids.push_back(std::make_shared<MsRunId>(ms_run_id));
315
316 return ms_run_ids;
317}
318
319
320} // namespace pappso
virtual MsDataFormat getFileFormat() override
virtual bool initialize(std::size_t &line_count)
virtual std::vector< MsRunIdCstSPtr > getMsRunIds(const QString &run_prefix) override
BafAsciiFileReader(const QString &file_name)
MsDataFormat m_fileFormat
MS run identity MsRunId identifies an MS run with a unique ID (XmlId) and contains eventually informa...
Definition msrunid.h:54
const QString & getRunId() const
Definition msrunid.cpp:130
void setRunId(const QString &run_id)
Definition msrunid.cpp:123
void setXmlId(const QString &xml_id)
set an XML unique identifier for this MsRunId
Definition msrunid.cpp:137
void setMsDataFormat(MsDataFormat format)
Definition msrunid.cpp:158
void setSampleName(const QString &name)
set a sample name for this MsRunId
Definition msrunid.cpp:79
static QRegularExpression unsignedDoubleNumberNoExponentialRegExp
Definition utils.h:52
static const QString getLexicalOrderedString(unsigned int num)
Definition utils.cpp:73
static QRegularExpression endOfLineRegExp
Regular expression that tracks the end of line in text files.
Definition utils.h:67
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition aa.cpp:39
MsDataFormat
Definition types.h:120
@ unknown
unknown format
static const std::size_t CHECKED_LINES_COUNT