SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <bit>
16#include <iterator>
17#include <ranges>
18#include <string>
19#include <vector>
20
33
34namespace seqan3
35{
36
50{
51public:
55 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
56 format_bam() = default;
57 format_bam(format_bam const &) = default;
58 format_bam & operator=(format_bam const &) = default;
59 format_bam(format_bam &&) = default;
61 ~format_bam() = default;
62
64
67
68protected:
69 template <typename stream_type, // constraints checked by file
70 typename seq_legal_alph_type,
71 typename ref_seqs_type,
72 typename ref_ids_type,
73 typename stream_pos_type,
74 typename seq_type,
75 typename id_type,
76 typename ref_seq_type,
77 typename ref_id_type,
78 typename ref_offset_type,
79 typename cigar_type,
80 typename flag_type,
81 typename mapq_type,
82 typename qual_type,
83 typename mate_type,
84 typename tag_dict_type,
85 typename e_value_type,
86 typename bit_score_type>
87 void read_alignment_record(stream_type & stream,
88 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
89 ref_seqs_type & ref_seqs,
91 stream_pos_type & position_buffer,
92 seq_type & seq,
93 qual_type & qual,
94 id_type & id,
95 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
96 ref_id_type & ref_id,
97 ref_offset_type & ref_offset,
98 cigar_type & cigar_vector,
99 flag_type & flag,
100 mapq_type & mapq,
101 mate_type & mate,
102 tag_dict_type & tag_dict,
103 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
104 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
105
106 template <typename stream_type,
107 typename header_type,
108 typename seq_type,
109 typename id_type,
110 typename ref_seq_type,
111 typename ref_id_type,
112 typename cigar_type,
113 typename qual_type,
114 typename mate_type,
115 typename tag_dict_type>
116 void write_alignment_record([[maybe_unused]] stream_type & stream,
117 [[maybe_unused]] sam_file_output_options const & options,
118 [[maybe_unused]] header_type && header,
119 [[maybe_unused]] seq_type && seq,
120 [[maybe_unused]] qual_type && qual,
121 [[maybe_unused]] id_type && id,
122 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
123 [[maybe_unused]] ref_id_type && ref_id,
124 [[maybe_unused]] std::optional<int32_t> ref_offset,
125 [[maybe_unused]] cigar_type && cigar_vector,
126 [[maybe_unused]] sam_flag const flag,
127 [[maybe_unused]] uint8_t const mapq,
128 [[maybe_unused]] mate_type && mate,
129 [[maybe_unused]] tag_dict_type && tag_dict,
130 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
131 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
132
134 template <typename stream_t, typename header_type>
135 void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
136
137private:
139 bool header_was_read{false};
140
143
146 { // naming corresponds to official SAM/BAM specifications
147 int32_t block_size;
148 int32_t refID;
149 int32_t pos;
150 uint32_t l_read_name : 8;
151 uint32_t mapq : 8;
152 uint32_t bin : 16;
153 uint32_t n_cigar_op : 16;
155 int32_t l_seq;
156 int32_t next_refID;
157 int32_t next_pos;
158 int32_t tlen;
159 };
160
161 // clang-format off
164 {
165 []() constexpr {
167
168 using index_t = std::make_unsigned_t<char>;
169
170 // ret['M'] = 0; set anyway by initialization
171 ret[static_cast<index_t>('I')] = 1;
172 ret[static_cast<index_t>('D')] = 2;
173 ret[static_cast<index_t>('N')] = 3;
174 ret[static_cast<index_t>('S')] = 4;
175 ret[static_cast<index_t>('H')] = 5;
176 ret[static_cast<index_t>('P')] = 6;
177 ret[static_cast<index_t>('=')] = 7;
178 ret[static_cast<index_t>('X')] = 8;
179
180 return ret;
181 }()
182 };
183 // clang-format on
184
186 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
187 {
188 --end;
189 if (beg >> 14 == end >> 14)
190 return ((1 << 15) - 1) / 7 + (beg >> 14);
191 if (beg >> 17 == end >> 17)
192 return ((1 << 12) - 1) / 7 + (beg >> 17);
193 if (beg >> 20 == end >> 20)
194 return ((1 << 9) - 1) / 7 + (beg >> 20);
195 if (beg >> 23 == end >> 23)
196 return ((1 << 6) - 1) / 7 + (beg >> 23);
197 if (beg >> 26 == end >> 26)
198 return ((1 << 3) - 1) / 7 + (beg >> 26);
199 return 0;
200 }
201
208 template <typename stream_view_type, std::integral number_type>
209 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
210 {
211 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
212 }
213
219 template <typename stream_view_type>
220 void read_float_byte_field(stream_view_type && stream_view, float & target)
221 {
222 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
223 }
224
225 template <typename stream_view_type, typename value_type>
227 stream_view_type && stream_view,
228 value_type const & SEQAN3_DOXYGEN_ONLY(value));
229
230 template <typename stream_view_type>
231 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
232
233 template <typename cigar_input_type>
234 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
235
236 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
237};
238
240template <typename stream_type, // constraints checked by file
241 typename seq_legal_alph_type,
242 typename ref_seqs_type,
243 typename ref_ids_type,
244 typename stream_pos_type,
245 typename seq_type,
246 typename id_type,
247 typename ref_seq_type,
248 typename ref_id_type,
249 typename ref_offset_type,
250 typename cigar_type,
251 typename flag_type,
252 typename mapq_type,
253 typename qual_type,
254 typename mate_type,
255 typename tag_dict_type,
256 typename e_value_type,
257 typename bit_score_type>
258inline void
260 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
261 ref_seqs_type & ref_seqs,
263 stream_pos_type & position_buffer,
264 seq_type & seq,
265 qual_type & qual,
266 id_type & id,
267 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
268 ref_id_type & ref_id,
269 ref_offset_type & ref_offset,
270 cigar_type & cigar_vector,
271 flag_type & flag,
272 mapq_type & mapq,
273 mate_type & mate,
274 tag_dict_type & tag_dict,
275 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
276 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
277{
278 static_assert(detail::decays_to_ignore_v<ref_offset_type>
279 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
280 "The ref_offset must be a specialisation of std::optional.");
281
282 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
283 "The type of field::mapq must be uint8_t.");
284
285 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
286 "The type of field::flag must be seqan3::sam_flag.");
287
288 auto stream_view = seqan3::detail::istreambuf(stream);
289
290 [[maybe_unused]] int32_t ref_length{}; // needed in case the cigar string was stored in the tag dictionary
291
292 // Header
293 // -------------------------------------------------------------------------------------------------------------
294 if (!header_was_read)
295 {
296 // magic BAM string
298 throw format_error{"File is not in BAM format."};
299
300 int32_t l_text{}; // length of header text including \0 character
301 int32_t n_ref{}; // number of reference sequences
302 int32_t l_name{}; // 1 + length of reference name including \0 character
303 int32_t l_ref{}; // length of reference sequence
304
305 read_integral_byte_field(stream_view, l_text);
306
307 if (l_text > 0) // header text is present
308 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
309
310 read_integral_byte_field(stream_view, n_ref);
311
312 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
313 {
314 read_integral_byte_field(stream_view, l_name);
315
316 string_buffer.resize(l_name - 1);
318 l_name - 1,
319 string_buffer.data()); // copy without \0 character
320 ++std::ranges::begin(stream_view); // skip \0 character
321
322 read_integral_byte_field(stream_view, l_ref);
323
324 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
325 {
326 // If there was no header text, we parse reference sequences block as header information
327 if (l_text == 0)
328 {
329 auto & reference_ids = header.ref_ids();
330 // put the length of the reference sequence into ref_id_info
331 header.ref_id_info.emplace_back(l_ref, "");
332 // put the reference name into reference_ids
333 reference_ids.push_back(string_buffer);
334 // assign the reference name an ascending reference id (starts at index 0).
335 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
336 continue;
337 }
338 }
339
340 auto id_it = header.ref_dict.find(string_buffer);
341
342 // sanity checks of reference information to existing header object:
343 if (id_it == header.ref_dict.end()) // [unlikely]
344 {
345 throw format_error{detail::to_string("Unknown reference name '" + string_buffer
346 + "' found in BAM file header (header.ref_ids():",
347 header.ref_ids(),
348 ").")};
349 }
350 else if (id_it->second != ref_idx) // [unlikely]
351 {
352 throw format_error{detail::to_string("Reference id '",
354 "' at position ",
355 ref_idx,
356 " does not correspond to the position ",
357 id_it->second,
358 " in the header (header.ref_ids():",
359 header.ref_ids(),
360 ").")};
361 }
362 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
363 {
364 throw format_error{"Provided reference has unequal length as specified in the header."};
365 }
366 }
367
368 header_was_read = true;
369
370 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
371 return;
372 }
373
374 // read alignment record into buffer
375 // -------------------------------------------------------------------------------------------------------------
376 position_buffer = stream.tellg();
378 std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
379
380 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
381 {
382 throw format_error{detail::to_string("Reference id index '",
383 core.refID,
384 "' is not in range of ",
385 "header.ref_ids(), which has size ",
386 header.ref_ids().size(),
387 ".")};
388 }
389 else if (core.refID > -1) // not unmapped
390 {
391 ref_id = core.refID; // field::ref_id
392 }
393
394 flag = core.flag; // field::flag
395 mapq = core.mapq; // field::mapq
396
397 if (core.pos > -1) // [[likely]]
398 ref_offset = core.pos; // field::ref_offset
399
400 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
401 {
402 if (core.next_refID > -1)
403 get<0>(mate) = core.next_refID;
404
405 if (core.next_pos > -1) // [[likely]]
406 get<1>(mate) = core.next_pos;
407
408 get<2>(mate) = core.tlen;
409 }
410
411 // read id
412 // -------------------------------------------------------------------------------------------------------------
413 auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
414 if constexpr (!detail::decays_to_ignore_v<id_type>)
415 read_forward_range_field(id_view, id); // field::id
416 else
417 detail::consume(id_view);
418 ++std::ranges::begin(stream_view); // skip '\0'
419
420 // read cigar string
421 // -------------------------------------------------------------------------------------------------------------
422 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
423 {
424 int32_t seq_length{};
425 std::tie(cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
426 }
427 else
428 {
430 }
431
432 // read sequence
433 // -------------------------------------------------------------------------------------------------------------
434 if (core.l_seq > 0) // sequence information is given
435 {
436 auto seq_stream = stream_view | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
439 {
440 return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
441 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
442 });
443
444 if constexpr (detail::decays_to_ignore_v<seq_type>)
445 {
446 auto skip_sequence_bytes = [&]()
447 {
448 detail::consume(seq_stream);
449 if (core.l_seq & 1)
450 ++std::ranges::begin(stream_view);
451 };
452
453 skip_sequence_bytes();
454 }
455 else
456 {
457 using alph_t = std::ranges::range_value_t<decltype(seq)>;
458 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
459
460 for (auto [d1, d2] : seq_stream)
461 {
462 seq.push_back(from_dna16[to_rank(d1)]);
463 seq.push_back(from_dna16[to_rank(d2)]);
464 }
465
466 if (core.l_seq & 1)
467 {
468 dna16sam d =
469 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
470 seq.push_back(from_dna16[to_rank(d)]);
471 ++std::ranges::begin(stream_view);
472 }
473 }
474 }
475
476 // read qual string
477 // -------------------------------------------------------------------------------------------------------------
478 auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
480 [](char chr)
481 {
482 return static_cast<char>(chr + 33);
483 });
484 if constexpr (!detail::decays_to_ignore_v<qual_type>)
485 read_forward_range_field(qual_view, qual);
486 else
487 detail::consume(qual_view);
488
489 // All remaining optional fields if any: SAM tags dictionary
490 // -------------------------------------------------------------------------------------------------------------
491 int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4 /*block_size excluded*/)
492 - core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
493 assert(remaining_bytes >= 0);
494 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
495
496 while (tags_view.size() > 0)
497 {
498 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
499 read_sam_dict_field(tags_view, tag_dict);
500 else
501 detail::consume(tags_view);
502 }
503
504 // DONE READING - wrap up
505 // -------------------------------------------------------------------------------------------------------------
506 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
507 {
508 int32_t const sc_front = soft_clipping_at_front(cigar_vector);
509
510 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
511 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
512 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
513 if (core.l_seq != 0 && sc_front == core.l_seq)
514 {
515 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
516 { // maybe only throw in debug mode and otherwise return an empty alignment?
517 throw format_error{
518 detail::to_string("The cigar string '",
519 sc_front,
520 "S",
521 ref_length,
522 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
523 "stored in the optional field CG. You need to read in the field::tags and "
524 "field::seq in order to access this information.")};
525 }
526 else
527 {
528 auto it = tag_dict.find("CG"_tag);
529
530 if (it == tag_dict.end())
532 "The cigar string '",
533 sc_front,
534 "S",
535 ref_length,
536 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
537 "stored in the optional field CG but this tag is not present in the given ",
538 "record.")};
539
540 auto cigar_view = std::views::all(std::get<std::string>(it->second));
541 int32_t seq_length{};
542 std::tie(cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
543 tag_dict.erase(it); // remove redundant information
544 }
545 }
546 }
547}
548
550template <typename stream_type,
551 typename header_type,
552 typename seq_type,
553 typename id_type,
554 typename ref_seq_type,
555 typename ref_id_type,
556 typename cigar_type,
557 typename qual_type,
558 typename mate_type,
559 typename tag_dict_type>
560inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
561 [[maybe_unused]] sam_file_output_options const & options,
562 [[maybe_unused]] header_type && header,
563 [[maybe_unused]] seq_type && seq,
564 [[maybe_unused]] qual_type && qual,
565 [[maybe_unused]] id_type && id,
566 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
567 [[maybe_unused]] ref_id_type && ref_id,
568 [[maybe_unused]] std::optional<int32_t> ref_offset,
569 [[maybe_unused]] cigar_type && cigar_vector,
570 [[maybe_unused]] sam_flag const flag,
571 [[maybe_unused]] uint8_t const mapq,
572 [[maybe_unused]] mate_type && mate,
573 [[maybe_unused]] tag_dict_type && tag_dict,
574 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
575 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
576{
577 // ---------------------------------------------------------------------
578 // Type Requirements (as static asserts for user friendliness)
579 // ---------------------------------------------------------------------
580 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
581 "The seq object must be a std::ranges::forward_range over "
582 "letters that model seqan3::alphabet.");
583
584 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
585 "The id object must be a std::ranges::forward_range over "
586 "letters that model seqan3::alphabet.");
587
588 static_assert((std::ranges::forward_range<ref_seq_type> && alphabet<std::ranges::range_reference_t<ref_seq_type>>),
589 "The ref_seq object must be a std::ranges::forward_range "
590 "over letters that model seqan3::alphabet.");
591
592 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
593 {
594 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
595 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
596 "The ref_id object must be a std::ranges::forward_range "
597 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
598 }
599
600 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
601 "The qual object must be a std::ranges::forward_range "
602 "over letters that model seqan3::alphabet.");
603
605 "The mate object must be a std::tuple of size 3 with "
606 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
607 "2) a std::integral or std::optional<std::integral>, and "
608 "3) a std::integral.");
609
610 static_assert(
611 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
612 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
613 || detail::is_type_specialisation_of_v<
614 std::remove_cvref_t<decltype(std::get<0>(mate))>,
615 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
616 || detail::is_type_specialisation_of_v<
617 std::remove_cvref_t<decltype(std::get<1>(mate))>,
618 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
619 "The mate object must be a std::tuple of size 3 with "
620 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
621 "2) a std::integral or std::optional<std::integral>, and "
622 "3) a std::integral.");
623
624 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
625 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
626
627 if constexpr (detail::decays_to_ignore_v<header_type>)
628 {
629 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
630 "You can either construct the output file with reference names and reference length "
631 "information and the header will be created for you, or you can access the `header` member "
632 "directly."};
633 }
634 else
635 {
636 // ---------------------------------------------------------------------
637 // logical Requirements
638 // ---------------------------------------------------------------------
639
640 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
641 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
642
643 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
644
645 // ---------------------------------------------------------------------
646 // Writing the BAM Header on first call
647 // ---------------------------------------------------------------------
649 {
650 write_header(stream, options, header);
651 header_was_written = true;
652 }
653
654 // ---------------------------------------------------------------------
655 // Writing the Record
656 // ---------------------------------------------------------------------
657 int32_t ref_length{};
658
659 // Compute the ref_length from given cigar_vector which is needed to fill field `bin`.
660 if (!std::ranges::empty(cigar_vector))
661 {
662 int32_t dummy_seq_length{};
663 for (auto & [count, operation] : cigar_vector)
664 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
665 }
666
667 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
668 {
669 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
670 cigar_vector.resize(2);
671 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
672 cigar_vector[1] = cigar{static_cast<uint32_t>(ref_length), 'N'_cigar_operation};
673 }
674
675 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
676
677 // Compute the value for the l_read_name field for the bam record.
678 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
679 // the data type to store the value is uint8_t and 255 is the maximal size.
680 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
681 // 2 bytes.
682 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
683 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
684
685 alignment_record_core core{/* block_size */ 0, // will be initialised right after
686 /* refID */ -1, // will be initialised right after
687 /* pos */ ref_offset.value_or(-1),
688 /* l_read_name */ read_name_size,
689 /* mapq */ mapq,
690 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
691 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
692 /* flag */ flag,
693 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
694 /* next_refId */ -1, // will be initialised right after
695 /* next_pos */ get<1>(mate).value_or(-1),
696 /* tlen */ get<2>(mate)};
697
698 auto check_and_assign_id_to = [&header]([[maybe_unused]] auto & id_source, [[maybe_unused]] auto & id_target)
699 {
700 using id_t = std::remove_reference_t<decltype(id_source)>;
701
702 if constexpr (!detail::decays_to_ignore_v<id_t>)
703 {
704 if constexpr (std::integral<id_t>)
705 {
706 id_target = id_source;
707 }
708 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
709 {
710 id_target = id_source.value_or(-1);
711 }
712 else
713 {
714 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
715 {
716 auto id_it = header.ref_dict.end();
717
718 if constexpr (std::ranges::contiguous_range<decltype(id_source)>
719 && std::ranges::sized_range<decltype(id_source)>
720 && std::ranges::borrowed_range<decltype(id_source)>)
721 {
722 id_it = header.ref_dict.find(
723 std::span{std::ranges::data(id_source), std::ranges::size(id_source)});
724 }
725 else
726 {
727 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
728
729 static_assert(
730 implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
731 "The ref_id type is not convertible to the reference id information stored in the "
732 "reference dictionary of the header object.");
733
734 id_it = header.ref_dict.find(id_source);
735 }
736
737 if (id_it == header.ref_dict.end())
738 {
739 throw format_error{detail::to_string("Unknown reference name '",
740 id_source,
741 "' could "
742 "not be found in BAM header ref_dict: ",
743 header.ref_dict,
744 ".")};
745 }
746
747 id_target = id_it->second;
748 }
749 }
750 }
751 };
752
753 // initialise core.refID
754 check_and_assign_id_to(ref_id, core.refID);
755
756 // initialise core.next_refID
757 check_and_assign_id_to(get<0>(mate), core.next_refID);
758
759 // initialise core.block_size
760 core.block_size = sizeof(core) - 4 /*block_size excluded*/ + core.l_read_name + core.n_cigar_op * 4
761 + // each int32_t has 4 bytes
762 (core.l_seq + 1) / 2 + // bitcompressed seq
763 core.l_seq + // quality string
764 tag_dict_binary_str.size();
765
766 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
767
768 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
769 stream_it = '*';
770 else
771 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
772 stream_it = '\0';
773
774 // write cigar
775 for (auto [cigar_count, op] : cigar_vector)
776 {
777 cigar_count = cigar_count << 4;
778 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
779 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
780 }
781
782 // write seq (bit-compressed: dna16sam characters go into one byte)
783 using alph_t = std::ranges::range_value_t<seq_type>;
784 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
785
786 auto sit = std::ranges::begin(seq);
787 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
788 {
789 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
790 ++sidx, ++sit;
791 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
792 stream_it = static_cast<char>(compressed_chr);
793 }
794
795 if (core.l_seq & 1) // write one more
796 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
797
798 // write qual
799 if (std::ranges::empty(qual))
800 {
801 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
802 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
803 }
804 else
805 {
806 if (std::ranges::distance(qual) != core.l_seq)
807 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
808 core.l_seq,
809 ". Got quality with size ",
810 std::ranges::distance(qual),
811 " instead.")};
812
813 auto v = qual
815 [](auto chr)
816 {
817 return static_cast<char>(to_rank(chr));
818 });
819 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
820 }
821
822 // write optional fields
823 stream << tag_dict_binary_str;
824 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
825}
826
828template <typename stream_t, typename header_type>
829inline void format_bam::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
830{
831 if constexpr (detail::decays_to_ignore_v<header_type>)
832 {
833 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
834 "You can either construct the output file with reference names and reference length "
835 "information and the header will be created for you, or you can access the `header` member "
836 "directly."};
837 }
838 else
839 {
840 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
841
842 std::ranges::copy_n("BAM\1", 4, stream_it); // Do not copy the null terminator
843
844 // write SAM header to temporary stream first to query its size.
846 detail::format_sam_base::write_header(os, options, header);
847#if SEQAN3_WORKAROUND_GCC_NO_CXX11_ABI || (SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10))
848 int32_t const l_text{static_cast<int32_t>(os.str().size())};
849#else
850 int32_t const l_text{static_cast<int32_t>(os.view().size())};
851#endif
852 std::ranges::copy_n(reinterpret_cast<char const *>(&l_text), 4, stream_it); // write text length
853
854#if SEQAN3_WORKAROUND_GCC_NO_CXX11_ABI || (SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10))
855 auto header_view = os.str();
856#else
857 auto header_view = os.view();
858#endif
859 std::ranges::copy(header_view, stream_it);
860
861 assert(header.ref_ids().size() < (1ull << 32));
862 int32_t const n_ref{static_cast<int32_t>(header.ref_ids().size())};
863 std::ranges::copy_n(reinterpret_cast<char const *>(&n_ref), 4, stream_it); // write number of references
864
865 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
866 {
867 assert(header.ref_ids()[ridx].size() + 1 < (1ull << 32));
868 int32_t const l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
869 std::ranges::copy_n(reinterpret_cast<char const *>(&l_name), 4, stream_it); // write l_name
870 // write reference name:
871 std::ranges::copy(header.ref_ids()[ridx], stream_it);
872 stream_it = '\0'; // ++ is not necessary for ostream_iterator
873 // write reference sequence length:
874 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
875 }
876 }
877}
878
880template <typename stream_view_type, typename value_type>
882 stream_view_type && stream_view,
883 value_type const & SEQAN3_DOXYGEN_ONLY(value))
884{
885 int32_t count;
886 read_integral_byte_field(stream_view, count); // read length of vector
887 std::vector<value_type> tmp_vector;
888 tmp_vector.reserve(count);
889
890 while (count > 0)
891 {
892 value_type tmp{};
893 if constexpr (std::integral<value_type>)
894 {
895 read_integral_byte_field(stream_view, tmp);
896 }
897 else if constexpr (std::same_as<value_type, float>)
898 {
899 read_float_byte_field(stream_view, tmp);
900 }
901 else
902 {
903 constexpr bool always_false = std::is_same_v<value_type, void>;
904 static_assert(always_false, "format_bam::read_sam_dict_vector: unsupported value_type");
905 }
906 tmp_vector.push_back(std::move(tmp));
907 --count;
908 }
909 variant = std::move(tmp_vector);
910}
911
929template <typename stream_view_type>
930inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
931{
932 /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
933 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
934 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
935 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
936 by the length (int32_t) of the array, followed by the values.
937 */
938 auto it = std::ranges::begin(stream_view);
939 uint16_t tag = static_cast<uint16_t>(*it) << 8;
940 ++it; // skip char read before
941
942 tag += static_cast<uint16_t>(*it);
943 ++it; // skip char read before
944
945 char type_id = *it;
946 ++it; // skip char read before
947
948 switch (type_id)
949 {
950 case 'A': // char
951 {
952 target[tag] = *it;
953 ++it; // skip char that has been read
954 break;
955 }
956 // all integer sizes are possible
957 case 'c': // int8_t
958 {
959 int8_t tmp;
960 read_integral_byte_field(stream_view, tmp);
961 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
962 break;
963 }
964 case 'C': // uint8_t
965 {
966 uint8_t tmp;
967 read_integral_byte_field(stream_view, tmp);
968 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
969 break;
970 }
971 case 's': // int16_t
972 {
973 int16_t tmp;
974 read_integral_byte_field(stream_view, tmp);
975 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
976 break;
977 }
978 case 'S': // uint16_t
979 {
980 uint16_t tmp;
981 read_integral_byte_field(stream_view, tmp);
982 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
983 break;
984 }
985 case 'i': // int32_t
986 {
987 int32_t tmp;
988 read_integral_byte_field(stream_view, tmp);
989 target[tag] = std::move(tmp); // readable sam format only allows int32_t
990 break;
991 }
992 case 'I': // uint32_t
993 {
994 uint32_t tmp;
995 read_integral_byte_field(stream_view, tmp);
996 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
997 break;
998 }
999 case 'f': // float
1000 {
1001 float tmp;
1002 read_float_byte_field(stream_view, tmp);
1003 target[tag] = tmp;
1004 break;
1005 }
1006 case 'Z': // string
1007 {
1009 while (!is_char<'\0'>(*it))
1010 {
1012 ++it;
1013 }
1014 ++it; // skip \0
1015 target[tag] = string_buffer;
1016 break;
1017 }
1018 case 'H': // byte array, represented as null-terminated string; specification requires even number of bytes
1019 {
1020 std::vector<std::byte> byte_array;
1021 std::byte value;
1022 while (!is_char<'\0'>(*it))
1023 {
1026 ++it;
1027
1028 if (*it == '\0')
1029 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1030
1032 ++it;
1034 byte_array.push_back(value);
1035 }
1036 ++it; // skip \0
1037 target[tag] = byte_array;
1038 break;
1039 }
1040 case 'B': // Array. Value type depends on second char [cCsSiIf]
1041 {
1042 char array_value_type_id = *it;
1043 ++it; // skip char read before
1044
1045 switch (array_value_type_id)
1046 {
1047 case 'c': // int8_t
1048 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1049 break;
1050 case 'C': // uint8_t
1051 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1052 break;
1053 case 's': // int16_t
1054 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1055 break;
1056 case 'S': // uint16_t
1057 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1058 break;
1059 case 'i': // int32_t
1060 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1061 break;
1062 case 'I': // uint32_t
1063 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1064 break;
1065 case 'f': // float
1066 read_sam_dict_vector(target[tag], stream_view, float{});
1067 break;
1068 default:
1069 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1070 "must be one of [cCsSiIf] but '",
1071 array_value_type_id,
1072 "' was given.")};
1073 }
1074 break;
1075 }
1076 default:
1077 throw format_error{detail::to_string("The second character in the numerical id of a "
1078 "SAM tag must be one of [A,i,Z,H,B,f] but '",
1079 type_id,
1080 "' was given.")};
1081 }
1082}
1083
1098template <typename cigar_input_type>
1099inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1100{
1101 std::vector<cigar> operations{};
1102 char operation{'\0'};
1103 uint32_t count{};
1104 int32_t ref_length{}, seq_length{};
1105 uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1106 constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1107 constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1108
1109 if (n_cigar_op == 0) // [[unlikely]]
1110 return std::tuple{operations, ref_length, seq_length};
1111
1112 // parse the rest of the cigar
1113 // -------------------------------------------------------------------------------------------------------------
1114 while (n_cigar_op > 0) // until stream is not empty
1115 {
1117 sizeof(operation_and_count),
1118 reinterpret_cast<char *>(&operation_and_count));
1119 operation = cigar_mapping[operation_and_count & cigar_mask];
1120 count = operation_and_count >> 4;
1121
1122 detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1123 operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1124 --n_cigar_op;
1125 }
1126
1127 return std::tuple{operations, ref_length, seq_length};
1128}
1129
1134{
1135 std::string result{};
1136
1137 auto stream_variant_fn = [&result](auto && arg) // helper to print a std::variant
1138 {
1139 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1140 using T = std::remove_cvref_t<decltype(arg)>;
1141
1142 if constexpr (std::same_as<T, int32_t>)
1143 {
1144 // always choose the smallest possible representation [cCsSiI]
1145 size_t const absolute_arg = std::abs(arg);
1146 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1147 bool const negative = arg < 0;
1148 n = n * n + 2 * negative; // for switch case order
1149
1150 switch (n)
1151 {
1152 case 0:
1153 {
1154 result[result.size() - 1] = 'C';
1155 result.append(reinterpret_cast<char const *>(&arg), 1);
1156 break;
1157 }
1158 case 1:
1159 {
1160 result[result.size() - 1] = 'S';
1161 result.append(reinterpret_cast<char const *>(&arg), 2);
1162 break;
1163 }
1164 case 2:
1165 {
1166 result[result.size() - 1] = 'c';
1167 int8_t tmp = static_cast<int8_t>(arg);
1168 result.append(reinterpret_cast<char const *>(&tmp), 1);
1169 break;
1170 }
1171 case 3:
1172 {
1173 result[result.size() - 1] = 's';
1174 int16_t tmp = static_cast<int16_t>(arg);
1175 result.append(reinterpret_cast<char const *>(&tmp), 2);
1176 break;
1177 }
1178 default:
1179 {
1180 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1181 break;
1182 }
1183 }
1184 }
1185 else if constexpr (std::same_as<T, std::string>)
1186 {
1187 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1 /*+ null character*/);
1188 }
1189 else if constexpr (!std::ranges::range<T>) // char, float
1190 {
1191 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1192 }
1193 else // std::vector of some arithmetic_type type
1194 {
1195 int32_t sz{static_cast<int32_t>(arg.size())};
1196 result.append(reinterpret_cast<char *>(&sz), 4);
1197 result.append(reinterpret_cast<char const *>(arg.data()),
1198 arg.size() * sizeof(std::ranges::range_value_t<T>));
1199 }
1200 };
1201
1202 for (auto & [tag, variant] : tag_dict)
1203 {
1204 result.push_back(static_cast<char>(tag / 256));
1205 result.push_back(static_cast<char>(tag % 256));
1206
1207 result.push_back(detail::sam_tag_type_char[variant.index()]);
1208
1209 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1210 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1211
1212 std::visit(stream_variant_fn, variant);
1213 }
1214
1215 return result;
1216}
1217
1218} // namespace seqan3
T begin(T... args)
T bit_ceil(T... args)
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:163
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:187
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: alphabet/cigar/cigar.hpp:60
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:40
The alignment base format.
Definition: format_sam_base.hpp:45
void write_header(stream_t &stream, sam_file_output_options const &options, header_type &header)
Writes the SAM header.
Definition: format_sam_base.hpp:625
int32_t soft_clipping_at_front(std::vector< cigar > const &cigar_vector) const
Returns the soft clipping value at the front of the cigar_vector or 0 if none present.
Definition: format_sam_base.hpp:148
void read_forward_range_field(stream_view_type &&stream_view, target_range_type &target)
Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to.
Definition: format_sam_base.hpp:220
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition: format_sam_base.hpp:66
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &)
Reads the SAM header.
Definition: format_sam_base.hpp:288
void read_byte_field(stream_view_t &&stream_view, std::byte &byte_target)
Reads std::byte fields using std::from_chars.
Definition: format_sam_base.hpp:192
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The actual implementation of seqan3::cigar::operation for documentation purposes only....
Definition: cigar_operation.hpp:48
The BAM format.
Definition: format_bam.hpp:50
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:259
void read_sam_dict_field(stream_view_type &&stream_view, sam_tag_dictionary &target)
Reads the optional tag fields into the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:930
format_bam()=default
Defaulted.
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:560
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
void write_header(stream_t &stream, sam_file_output_options const &options, header_type &header)
Writes the SAM header.
Definition: format_bam.hpp:829
format_bam(format_bam &&)=default
Defaulted.
void read_integral_byte_field(stream_view_type &&stream_view, number_type &target)
Reads a arithmetic field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:209
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
auto parse_binary_cigar(cigar_input_type &&cigar_input, uint16_t n_cigar_op) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_bam.hpp:1099
void read_float_byte_field(stream_view_type &&stream_view, float &target)
Reads a float field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:220
static std::string get_tag_dict_str(sam_tag_dictionary const &tag_dict)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:1133
std::string string_buffer
Local buffer to read into while avoiding reallocation.
Definition: format_bam.hpp:142
static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
Definition: format_bam.hpp:186
void read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view, value_type const &value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition: format_bam.hpp:881
bool header_was_read
A variable that tracks whether the content of header has been read or not.
Definition: format_bam.hpp:139
static constexpr std::array< uint8_t, 256 > char_to_sam_rank
Converts a cigar op character to the rank according to the official BAM specifications.
Definition: format_bam.hpp:164
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:66
Stores the header information of SAM/BAM files.
Definition: header.hpp:34
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:143
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:182
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:179
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
T clear(T... args)
T copy(T... args)
T copy_n(T... args)
T countr_zero(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T equal(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: core/range/detail/misc.hpp:28
constexpr auto all
Returns a view that includes all elements of the range argument.
Definition: all_view.hpp:204
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
constexpr std::tuple< std::vector< cigar >, int32_t, int32_t > parse_cigar(cigar_input_type &&cigar_input)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: io/sam_file/detail/cigar.hpp:94
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition: io/sam_file/detail/cigar.hpp:128
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:45
void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition: io/sam_file/detail/cigar.hpp:51
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:42
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly_view.hpp:590
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf_view.hpp:107
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:470
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: type_pack/traits.hpp:164
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
Whether a type behaves like a tuple.
Auxiliary functions for the SAM IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
std::string to_string(value_type &&... values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
T push_back(T... args)
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Stores all fixed length variables which can be read/written directly by reinterpreting the binary str...
Definition: format_bam.hpp:146
uint32_t bin
The bin number.
Definition: format_bam.hpp:152
sam_flag flag
The flag value (uint16_t enum).
Definition: format_bam.hpp:154
uint32_t n_cigar_op
The number of cigar operations of the alignment.
Definition: format_bam.hpp:153
int32_t next_refID
The reference id of the mate.
Definition: format_bam.hpp:156
int32_t l_seq
The number of bases of the read sequence.
Definition: format_bam.hpp:155
int32_t refID
The reference id the read was mapped to.
Definition: format_bam.hpp:148
int32_t pos
The begin position of the alignment.
Definition: format_bam.hpp:149
int32_t block_size
The size in bytes of the whole BAM record.
Definition: format_bam.hpp:147
uint32_t l_read_name
The length of the read name including the \0 character.
Definition: format_bam.hpp:150
int32_t tlen
The template length of the read and its mate.
Definition: format_bam.hpp:158
uint32_t mapq
The mapping quality.
Definition: format_bam.hpp:151
int32_t next_pos
The begin position of the mate alignment.
Definition: format_bam.hpp:157
Thrown if information given to output format didn't match expectations.
Definition: io/exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sam_file/input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: sam_file/output_options.hpp:26
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T visit(T... args)