Skip to content

Commit e39f274

Browse files
authored
Merge pull request #34 from h-2/fastq
[feature] add FastQ reading
2 parents e5ebbdb + e4a2b77 commit e39f274

File tree

8 files changed

+659
-42
lines changed

8 files changed

+659
-42
lines changed

include/bio/format/fasta.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
// -----------------------------------------------------------------------------------------------------
88

99
/*!\file
10-
* brief Provides the seqan3::format_fasta.
10+
* \brief Provides the bio::fasta.
1111
* \author Hannes Hauswedell <hannes.hauswedell AT decode.is>
1212
*/
1313

include/bio/format/fastq.hpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// -----------------------------------------------------------------------------------------------------
2+
// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3+
// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4+
// Copyright (c) 2020-2021, deCODE Genetics
5+
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
6+
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
7+
// -----------------------------------------------------------------------------------------------------
8+
9+
/*!\file
10+
* \brief Provides the bio::fastq.
11+
* \author Hannes Hauswedell <hannes.hauswedell AT decode.is>
12+
*/
13+
14+
#pragma once
15+
16+
#include <string>
17+
#include <vector>
18+
19+
#include <bio/platform.hpp>
20+
21+
namespace bio
22+
{
23+
24+
/*!\brief The FastQ format.
25+
* \ingroup format
26+
*
27+
* \details
28+
*
29+
* This is the FastQ format tag. If you want to read FastQ files, use bio::seq_io::reader, and if you want
30+
* to write FastQ files, use bio::seq_io::writer.
31+
*
32+
* ### Introduction
33+
*
34+
* FastQ is the de-facto-standard for read data in bionformatics. See the
35+
* [article on wikipedia](https://en.wikipedia.org/wiki/FASTQ_format) for a an in-depth description of the format.
36+
*
37+
* ### Fields
38+
*
39+
* The FastQ format provides the fields bio::field::seq, bio::field::id and bio::field::qual.
40+
* All fields are required when writing.
41+
*
42+
* ### Implementation notes
43+
*
44+
* The following optional features are supported by the implementation:
45+
*
46+
* * Second ID line as third line (after `+`).
47+
*
48+
* The following optional features are *not* supported by the implementation:
49+
*
50+
* * Linebreaks within any field.
51+
*/
52+
struct fastq
53+
{
54+
//!\brief The valid file extensions for this format; note that you can modify this value.
55+
static inline std::vector<std::string> file_extensions{"fastq", "fq"};
56+
};
57+
58+
} // namespace bio
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
// -----------------------------------------------------------------------------------------------------
2+
// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3+
// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4+
// Copyright (c) 2020-2021, deCODE Genetics
5+
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
6+
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
7+
// -----------------------------------------------------------------------------------------------------
8+
9+
/*!\file
10+
* \brief Provides the bio::format_input_handler implementation for bio::fastq.
11+
* \author Hannes Hauswedell <hannes.hauswedell AT decode.is>
12+
*/
13+
14+
#pragma once
15+
16+
#include <algorithm>
17+
#include <iostream>
18+
#include <ranges>
19+
#include <string>
20+
#include <string_view>
21+
#include <vector>
22+
23+
#include <seqan3/core/range/type_traits.hpp>
24+
25+
#include <seqan3/alphabet/views/to_char.hpp>
26+
#include <seqan3/core/debug_stream/detail/to_string.hpp>
27+
#include <seqan3/utility/char_operations/predicate.hpp>
28+
29+
#include <bio/detail/range.hpp>
30+
#include <bio/format/fastq.hpp>
31+
#include <bio/format/format_input_handler.hpp>
32+
#include <bio/plain_io/reader.hpp>
33+
34+
namespace bio
35+
{
36+
37+
/*!\brief Format input handler for the FastQ format (bio::fastq).
38+
* \ingroup format
39+
* \details
40+
*
41+
* ### Attention
42+
*
43+
* Most users should not perform I/O through input/output handlers but should instead use the respective
44+
* readers/writers. See the overview (TODO link) for more information.
45+
*
46+
* ### Options
47+
*
48+
* The following options are considered if the respective member variable is availabele in the object passed to
49+
* the constructor:
50+
*
51+
* | Member | Type | Default | Description |
52+
* |-----------------|---------|---------|-----------------------------------------------------|
53+
* |`truncate_ids` |`bool` | `false` | Whether to truncate IDs on the first whitespace |
54+
*
55+
* ### Performance
56+
*
57+
* The current implementation is not optimised for performance and performs an unnecessary copy operation––even
58+
* when shallow records are returned.
59+
*/
60+
template <>
61+
class format_input_handler<fastq> : public format_input_handler_base<format_input_handler<fastq>>
62+
{
63+
private:
64+
/*!\name CRTP related entities
65+
* \{
66+
*/
67+
//!\brief The type of the CRTP base class.
68+
using base_t = format_input_handler_base<format_input_handler<fastq>>;
69+
using base_t::parse_field;
70+
using base_t::parse_field_aux;
71+
using base_t::stream;
72+
73+
//!\brief Befriend the base class to enable CRTP.
74+
friend base_t;
75+
//!\}
76+
77+
//!\brief Print an error message with current line number in diagnostic.
78+
[[noreturn]] void error(auto const &... messages) const
79+
{
80+
std::string message = "[B.I.O. FastQ format error in line " + detail::to_string(line) + "] ";
81+
((message += detail::to_string(messages)), ...);
82+
83+
throw parse_error{message};
84+
}
85+
86+
/*!\name Options
87+
* \{
88+
*/
89+
//!\brief Whether to truncate IDs on first whitespace.
90+
bool truncate_ids = false;
91+
//!\}
92+
93+
/*!\name Raw record handling
94+
* \{
95+
*/
96+
//!\brief The fields that this format supports [the base class accesses this type].
97+
using format_fields = vtag_t<field::id, field::seq, field::qual>;
98+
//!\brief Type of the raw record.
99+
using raw_record_type = record<format_fields, seqan3::list_traits::repeat<3, std::string_view>>;
100+
//!\brief Type of the low-level iterator.
101+
using lowlevel_iterator = detail::plaintext_input_iterator<plain_io::record_kind::line>;
102+
103+
//!\brief The raw record.
104+
raw_record_type raw_record;
105+
//!\brief Buffer for the ID.
106+
std::string id_buffer;
107+
//!\brief Buffer for the sequence.
108+
std::string seq_buffer;
109+
//!\brief Buffer for the qualities.
110+
std::string qual_buffer;
111+
112+
//!\brief The low-level iterator.
113+
lowlevel_iterator it;
114+
//!\brief A line counter.
115+
size_t line = -1;
116+
117+
//!\brief Read the raw record [the base class invokes this function].
118+
void read_raw_record()
119+
{
120+
id_buffer.clear();
121+
seq_buffer.clear();
122+
qual_buffer.clear();
123+
124+
/* READ ID */
125+
{
126+
++it;
127+
++line;
128+
129+
if (it == std::default_sentinel)
130+
error("Reached end of file while trying to read ID.");
131+
132+
std::string_view current_line = *it;
133+
134+
if (current_line.empty())
135+
error("Expected to be on begin of record but line is empty.");
136+
137+
if ((!seqan3::is_char<'@'>)(current_line[0]))
138+
error("ID-line does not begin with '@'.");
139+
140+
if (truncate_ids)
141+
{
142+
size_t e = 1;
143+
for (; e < current_line.size() && (!seqan3::is_space)(current_line[e]); ++e)
144+
{}
145+
detail::string_copy(current_line.substr(1, e - 1), id_buffer);
146+
}
147+
else
148+
{
149+
detail::string_copy(current_line.substr(1), id_buffer);
150+
}
151+
152+
get<field::id>(raw_record) = id_buffer;
153+
}
154+
155+
/* READ SEQ */
156+
{
157+
++it;
158+
++line;
159+
160+
if (it == std::default_sentinel)
161+
error("Reached end of file while trying to read SEQ.");
162+
163+
detail::string_copy(*it, seq_buffer); // is allowed to be empty
164+
165+
get<field::seq>(raw_record) = seq_buffer;
166+
}
167+
168+
/* READ third line */
169+
{
170+
++it;
171+
++line;
172+
173+
if (it == std::default_sentinel)
174+
error("Reached end of file while trying to read third FastQ record line.");
175+
176+
if ((!seqan3::is_char<'+'>)((*it)[0]))
177+
error("Third FastQ record line does not begin with '+'.");
178+
179+
// we don't process the rest of the line
180+
}
181+
182+
/* READ QUAL */
183+
{
184+
++it;
185+
++line;
186+
187+
if (it == std::default_sentinel)
188+
error("Reached end of file while trying to read QUALITIES.");
189+
190+
detail::string_copy(*it, qual_buffer); // is allowed to be empty
191+
192+
get<field::qual>(raw_record) = qual_buffer;
193+
}
194+
195+
if (size_t ssize = seq_buffer.size(), qsize = qual_buffer.size(); ssize != qsize)
196+
error("Size mismatch between sequence (", ssize, ") and qualities (", qsize, ").");
197+
}
198+
//!\}
199+
200+
/*!\name Parsed record handling
201+
* \brief This is mostly done via the defaults in the base class.
202+
* \{
203+
*/
204+
//!\brief We can prevent another copy if the user wants a string.
205+
void parse_field(vtag_t<field::id> const & /**/, std::string & parsed_field) { std::swap(id_buffer, parsed_field); }
206+
207+
//!\brief We can prevent another copy if the user wants a string.
208+
void parse_field(vtag_t<field::seq> const & /**/, std::string & parsed_field)
209+
{
210+
std::swap(seq_buffer, parsed_field);
211+
}
212+
213+
//!\brief We can prevent another copy if the user wants a string.
214+
void parse_field(vtag_t<field::qual> const & /**/, std::string & parsed_field)
215+
{
216+
std::swap(qual_buffer, parsed_field);
217+
}
218+
//!\}
219+
220+
public:
221+
/*!\name Constructors, destructor and assignment.
222+
* \{
223+
*/
224+
format_input_handler() = default; //!< Defaulted.
225+
format_input_handler(format_input_handler const &) = delete; //!< Deleted.
226+
format_input_handler(format_input_handler &&) = default; //!< Defaulted.
227+
~format_input_handler() = default; //!< Defaulted.
228+
format_input_handler & operator=(format_input_handler const &) = delete; //!< Deleted.
229+
format_input_handler & operator=(format_input_handler &&) = default; //!< Defaulted.
230+
231+
/*!\brief Construct with an options object.
232+
* \param[in,out] str The input stream.
233+
* \param[in] options An object with options for the input handler.
234+
* \details
235+
*
236+
* The options argument is typically bio::seq_io::reader_options, but any object with a subset of similarly named
237+
* members is also accepted. See bio::format_input_handler<bio::fastq> for the supported options and defaults.
238+
*/
239+
format_input_handler(std::istream & str, auto const & options) : base_t{str}, it{str, false}
240+
{
241+
if constexpr (requires { (bool)options.truncate_ids; })
242+
{
243+
truncate_ids = options.truncate_ids;
244+
}
245+
}
246+
247+
//!\brief Construct with only an input stream.
248+
format_input_handler(std::istream & str) : format_input_handler{str, int{}} {}
249+
//!\}
250+
};
251+
252+
} // namespace bio

include/bio/seq_io/reader.hpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
#include <bio/detail/reader_base.hpp>
2828
#include <bio/format/fasta_input_handler.hpp>
29+
#include <bio/format/fastq_input_handler.hpp>
2930
#include <bio/seq_io/reader_options.hpp>
3031

3132
namespace bio::seq_io
@@ -54,7 +55,10 @@ namespace bio::seq_io
5455
*
5556
* And it supports the following formats:
5657
*
57-
* 1. FASTA (see also bio::fasta)
58+
* 1. FastA (see also bio::fasta)
59+
* 2. FastQ (see also bio::fastq)
60+
*
61+
* Fields that are not present in a format (e.g. bio::field::qual in FastA) will be returned empty.
5862
*
5963
* ### Simple usage
6064
*
@@ -95,14 +99,12 @@ class reader : public reader_base<reader_options<option_args_t...>>
9599
{
96100
private:
97101
//!\brief The base class.
98-
using base_t = reader_base<reader_options<option_args_t...>>;
102+
using base_t = reader_base<reader_options<option_args_t...>>;
103+
104+
public:
99105
//!\brief Inherit the format_type definition.
100106
using format_type = typename base_t::format_type;
101-
/* Implementation note
102-
* format_type is "inherited" as private here to avoid appearing twice in the documentation.
103-
* Its actual visibility is public because it is public in the base class.
104-
*/
105-
public:
107+
106108
// clang-format off
107109
//!\copydoc bio::reader_base::reader_base(std::filesystem::path const & filename, format_type const & fmt, options_t const & opt = options_t{})
108110
// clang-format on

0 commit comments

Comments
 (0)