|
| 1 | +// ----------------------------------------------------------------------------------------------------- |
| 2 | +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin |
| 3 | +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik |
| 4 | +// Copyright (c) 2020-2021, deCODE Genetics |
| 5 | +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License |
| 6 | +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md |
| 7 | +// ----------------------------------------------------------------------------------------------------- |
| 8 | + |
| 9 | +/*!\file |
| 10 | + * \brief Provides the bio::format_input_handler implementation for bio::fastq. |
| 11 | + * \author Hannes Hauswedell <hannes.hauswedell AT decode.is> |
| 12 | + */ |
| 13 | + |
| 14 | +#pragma once |
| 15 | + |
| 16 | +#include <algorithm> |
| 17 | +#include <iostream> |
| 18 | +#include <ranges> |
| 19 | +#include <string> |
| 20 | +#include <string_view> |
| 21 | +#include <vector> |
| 22 | + |
| 23 | +#include <seqan3/core/range/type_traits.hpp> |
| 24 | + |
| 25 | +#include <seqan3/alphabet/views/to_char.hpp> |
| 26 | +#include <seqan3/core/debug_stream/detail/to_string.hpp> |
| 27 | +#include <seqan3/utility/char_operations/predicate.hpp> |
| 28 | + |
| 29 | +#include <bio/detail/range.hpp> |
| 30 | +#include <bio/format/fastq.hpp> |
| 31 | +#include <bio/format/format_input_handler.hpp> |
| 32 | +#include <bio/plain_io/reader.hpp> |
| 33 | + |
| 34 | +namespace bio |
| 35 | +{ |
| 36 | + |
| 37 | +/*!\brief Format input handler for the FastQ format (bio::fastq). |
| 38 | + * \ingroup format |
| 39 | + * \details |
| 40 | + * |
| 41 | + * ### Attention |
| 42 | + * |
| 43 | + * Most users should not perform I/O through input/output handlers but should instead use the respective |
| 44 | + * readers/writers. See the overview (TODO link) for more information. |
| 45 | + * |
| 46 | + * ### Options |
| 47 | + * |
| 48 | + * The following options are considered if the respective member variable is availabele in the object passed to |
| 49 | + * the constructor: |
| 50 | + * |
| 51 | + * | Member | Type | Default | Description | |
| 52 | + * |-----------------|---------|---------|-----------------------------------------------------| |
| 53 | + * |`truncate_ids` |`bool` | `false` | Whether to truncate IDs on the first whitespace | |
| 54 | + * |
| 55 | + * ### Performance |
| 56 | + * |
| 57 | + * The current implementation is not optimised for performance and performs an unnecessary copy operation––even |
| 58 | + * when shallow records are returned. |
| 59 | + */ |
| 60 | +template <> |
| 61 | +class format_input_handler<fastq> : public format_input_handler_base<format_input_handler<fastq>> |
| 62 | +{ |
| 63 | +private: |
| 64 | + /*!\name CRTP related entities |
| 65 | + * \{ |
| 66 | + */ |
| 67 | + //!\brief The type of the CRTP base class. |
| 68 | + using base_t = format_input_handler_base<format_input_handler<fastq>>; |
| 69 | + using base_t::parse_field; |
| 70 | + using base_t::parse_field_aux; |
| 71 | + using base_t::stream; |
| 72 | + |
| 73 | + //!\brief Befriend the base class to enable CRTP. |
| 74 | + friend base_t; |
| 75 | + //!\} |
| 76 | + |
| 77 | + //!\brief Print an error message with current line number in diagnostic. |
| 78 | + [[noreturn]] void error(auto const &... messages) const |
| 79 | + { |
| 80 | + std::string message = "[B.I.O. FastQ format error in line " + detail::to_string(line) + "] "; |
| 81 | + ((message += detail::to_string(messages)), ...); |
| 82 | + |
| 83 | + throw parse_error{message}; |
| 84 | + } |
| 85 | + |
| 86 | + /*!\name Options |
| 87 | + * \{ |
| 88 | + */ |
| 89 | + //!\brief Whether to truncate IDs on first whitespace. |
| 90 | + bool truncate_ids = false; |
| 91 | + //!\} |
| 92 | + |
| 93 | + /*!\name Raw record handling |
| 94 | + * \{ |
| 95 | + */ |
| 96 | + //!\brief The fields that this format supports [the base class accesses this type]. |
| 97 | + using format_fields = vtag_t<field::id, field::seq, field::qual>; |
| 98 | + //!\brief Type of the raw record. |
| 99 | + using raw_record_type = record<format_fields, seqan3::list_traits::repeat<3, std::string_view>>; |
| 100 | + //!\brief Type of the low-level iterator. |
| 101 | + using lowlevel_iterator = detail::plaintext_input_iterator<plain_io::record_kind::line>; |
| 102 | + |
| 103 | + //!\brief The raw record. |
| 104 | + raw_record_type raw_record; |
| 105 | + //!\brief Buffer for the ID. |
| 106 | + std::string id_buffer; |
| 107 | + //!\brief Buffer for the sequence. |
| 108 | + std::string seq_buffer; |
| 109 | + //!\brief Buffer for the qualities. |
| 110 | + std::string qual_buffer; |
| 111 | + |
| 112 | + //!\brief The low-level iterator. |
| 113 | + lowlevel_iterator it; |
| 114 | + //!\brief A line counter. |
| 115 | + size_t line = -1; |
| 116 | + |
| 117 | + //!\brief Read the raw record [the base class invokes this function]. |
| 118 | + void read_raw_record() |
| 119 | + { |
| 120 | + id_buffer.clear(); |
| 121 | + seq_buffer.clear(); |
| 122 | + qual_buffer.clear(); |
| 123 | + |
| 124 | + /* READ ID */ |
| 125 | + { |
| 126 | + ++it; |
| 127 | + ++line; |
| 128 | + |
| 129 | + if (it == std::default_sentinel) |
| 130 | + error("Reached end of file while trying to read ID."); |
| 131 | + |
| 132 | + std::string_view current_line = *it; |
| 133 | + |
| 134 | + if (current_line.empty()) |
| 135 | + error("Expected to be on begin of record but line is empty."); |
| 136 | + |
| 137 | + if ((!seqan3::is_char<'@'>)(current_line[0])) |
| 138 | + error("ID-line does not begin with '@'."); |
| 139 | + |
| 140 | + if (truncate_ids) |
| 141 | + { |
| 142 | + size_t e = 1; |
| 143 | + for (; e < current_line.size() && (!seqan3::is_space)(current_line[e]); ++e) |
| 144 | + {} |
| 145 | + detail::string_copy(current_line.substr(1, e - 1), id_buffer); |
| 146 | + } |
| 147 | + else |
| 148 | + { |
| 149 | + detail::string_copy(current_line.substr(1), id_buffer); |
| 150 | + } |
| 151 | + |
| 152 | + get<field::id>(raw_record) = id_buffer; |
| 153 | + } |
| 154 | + |
| 155 | + /* READ SEQ */ |
| 156 | + { |
| 157 | + ++it; |
| 158 | + ++line; |
| 159 | + |
| 160 | + if (it == std::default_sentinel) |
| 161 | + error("Reached end of file while trying to read SEQ."); |
| 162 | + |
| 163 | + detail::string_copy(*it, seq_buffer); // is allowed to be empty |
| 164 | + |
| 165 | + get<field::seq>(raw_record) = seq_buffer; |
| 166 | + } |
| 167 | + |
| 168 | + /* READ third line */ |
| 169 | + { |
| 170 | + ++it; |
| 171 | + ++line; |
| 172 | + |
| 173 | + if (it == std::default_sentinel) |
| 174 | + error("Reached end of file while trying to read third FastQ record line."); |
| 175 | + |
| 176 | + if ((!seqan3::is_char<'+'>)((*it)[0])) |
| 177 | + error("Third FastQ record line does not begin with '+'."); |
| 178 | + |
| 179 | + // we don't process the rest of the line |
| 180 | + } |
| 181 | + |
| 182 | + /* READ QUAL */ |
| 183 | + { |
| 184 | + ++it; |
| 185 | + ++line; |
| 186 | + |
| 187 | + if (it == std::default_sentinel) |
| 188 | + error("Reached end of file while trying to read QUALITIES."); |
| 189 | + |
| 190 | + detail::string_copy(*it, qual_buffer); // is allowed to be empty |
| 191 | + |
| 192 | + get<field::qual>(raw_record) = qual_buffer; |
| 193 | + } |
| 194 | + |
| 195 | + if (size_t ssize = seq_buffer.size(), qsize = qual_buffer.size(); ssize != qsize) |
| 196 | + error("Size mismatch between sequence (", ssize, ") and qualities (", qsize, ")."); |
| 197 | + } |
| 198 | + //!\} |
| 199 | + |
| 200 | + /*!\name Parsed record handling |
| 201 | + * \brief This is mostly done via the defaults in the base class. |
| 202 | + * \{ |
| 203 | + */ |
| 204 | + //!\brief We can prevent another copy if the user wants a string. |
| 205 | + void parse_field(vtag_t<field::id> const & /**/, std::string & parsed_field) { std::swap(id_buffer, parsed_field); } |
| 206 | + |
| 207 | + //!\brief We can prevent another copy if the user wants a string. |
| 208 | + void parse_field(vtag_t<field::seq> const & /**/, std::string & parsed_field) |
| 209 | + { |
| 210 | + std::swap(seq_buffer, parsed_field); |
| 211 | + } |
| 212 | + |
| 213 | + //!\brief We can prevent another copy if the user wants a string. |
| 214 | + void parse_field(vtag_t<field::qual> const & /**/, std::string & parsed_field) |
| 215 | + { |
| 216 | + std::swap(qual_buffer, parsed_field); |
| 217 | + } |
| 218 | + //!\} |
| 219 | + |
| 220 | +public: |
| 221 | + /*!\name Constructors, destructor and assignment. |
| 222 | + * \{ |
| 223 | + */ |
| 224 | + format_input_handler() = default; //!< Defaulted. |
| 225 | + format_input_handler(format_input_handler const &) = delete; //!< Deleted. |
| 226 | + format_input_handler(format_input_handler &&) = default; //!< Defaulted. |
| 227 | + ~format_input_handler() = default; //!< Defaulted. |
| 228 | + format_input_handler & operator=(format_input_handler const &) = delete; //!< Deleted. |
| 229 | + format_input_handler & operator=(format_input_handler &&) = default; //!< Defaulted. |
| 230 | + |
| 231 | + /*!\brief Construct with an options object. |
| 232 | + * \param[in,out] str The input stream. |
| 233 | + * \param[in] options An object with options for the input handler. |
| 234 | + * \details |
| 235 | + * |
| 236 | + * The options argument is typically bio::seq_io::reader_options, but any object with a subset of similarly named |
| 237 | + * members is also accepted. See bio::format_input_handler<bio::fastq> for the supported options and defaults. |
| 238 | + */ |
| 239 | + format_input_handler(std::istream & str, auto const & options) : base_t{str}, it{str, false} |
| 240 | + { |
| 241 | + if constexpr (requires { (bool)options.truncate_ids; }) |
| 242 | + { |
| 243 | + truncate_ids = options.truncate_ids; |
| 244 | + } |
| 245 | + } |
| 246 | + |
| 247 | + //!\brief Construct with only an input stream. |
| 248 | + format_input_handler(std::istream & str) : format_input_handler{str, int{}} {} |
| 249 | + //!\} |
| 250 | +}; |
| 251 | + |
| 252 | +} // namespace bio |
0 commit comments