SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
alignment_from_cigar.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <vector>
16
22
23namespace seqan3
24{
25
83template <typename reference_type, typename sequence_type>
84inline auto alignment_from_cigar(std::vector<cigar> const & cigar_vector,
85 reference_type const & reference,
86 uint32_t const zero_based_reference_start_position,
87 sequence_type const & query)
88{
89 if (cigar_vector.empty())
90 throw std::logic_error{"An empty CIGAR is not a valid alignment representation."};
91
92 // compute the length of the aligned region in the reference sequence
93 // -------------------------------------------------------------------------
94 // this requires a first stream over the cigar vector.
95 uint32_t reference_length{0};
96 uint32_t query_length{0};
97
98 for (auto [cigar_count, cigar_operation] : cigar_vector)
99 {
100 if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation)
101 || ('X'_cigar_operation == cigar_operation))
102 {
103 reference_length += cigar_count;
104 query_length += cigar_count;
105 }
106 else if ('D'_cigar_operation == cigar_operation)
107 {
108 reference_length += cigar_count;
109 }
110 else if ('I'_cigar_operation == cigar_operation)
111 {
112 query_length += cigar_count;
113 }
114 }
115
116 if (static_cast<size_t>(zero_based_reference_start_position + reference_length) > std::ranges::size(reference))
117 throw std::logic_error{"The CIGAR string indicates a reference length of at least "
118 + std::to_string(zero_based_reference_start_position + reference_length)
119 + ", but the supplied reference sequence is only of size"
120 + std::to_string(std::ranges::size(reference)) + "."};
121
122 // Get soft clipping at the start and end of the CIGAR string
123 // -------------------------------------------------------------------------
124 uint32_t soft_clipping_start{0};
125 uint32_t soft_clipping_end{0};
126
127 // Checks whether the given index in the cigar vector is a soft clip.
128 auto soft_clipping_at = [&cigar_vector](size_t const index)
129 {
130 return cigar_vector[index] == 'S'_cigar_operation;
131 };
132 // Checks whether the given index in the cigar vector is a hard clip.
133 auto hard_clipping_at = [&](size_t const index)
134 {
135 return cigar_vector[index] == 'H'_cigar_operation;
136 };
137 // Checks whether the given cigar vector has at least min_size many elements.
138 auto vector_size_at_least = [&](size_t const min_size)
139 {
140 return cigar_vector.size() >= min_size;
141 };
142 // Returns the cigar count of the ith cigar element in the given cigar vector.
143 auto cigar_count_at = [&](size_t const index)
144 {
145 return get<0>(cigar_vector[index]);
146 };
147
148 // check for soft clipping at the first two positions
149 // cigar is non-empty, checked at the very beginning.
150 if (soft_clipping_at(0))
151 soft_clipping_start = cigar_count_at(0);
152 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
153 soft_clipping_start = cigar_count_at(1);
154
155 // Check for soft clipping at the last two positions to validate the CIGAR string.
156 // Even if the two following arithmetics overflow, they are protected by the corresponding if expressions below.
157 auto const last_index = cigar_vector.size() - 1;
158 auto const second_last_index = last_index - 1;
159
160 if (vector_size_at_least(2) && soft_clipping_at(last_index))
161 soft_clipping_end = cigar_count_at(last_index);
162 else if (vector_size_at_least(3) && hard_clipping_at(last_index) && soft_clipping_at(second_last_index))
163 soft_clipping_end = cigar_count_at(second_last_index);
164
165 if (soft_clipping_start + query_length + soft_clipping_end != std::ranges::size(query))
166 throw std::logic_error{"The CIGAR string indicates a query/read sequence length of "
167 + std::to_string(soft_clipping_start + query_length + soft_clipping_end)
168 + ", but the supplied query/read sequence is of size"
169 + std::to_string(std::ranges::size(query)) + "."};
170
171 // Assign the sequence to the alignment (a tuple of 2 gap decorators)
172 // -------------------------------------------------------------------------
173 using gapped_reference_type = gap_decorator<decltype(reference | views::slice(0, 0))>;
174 using gapped_sequence_type = gap_decorator<decltype(query | views::slice(0, 0))>;
176
177 alignment_type alignment{};
178
179 assign_unaligned(get<0>(alignment),
180 reference
181 | views::slice(zero_based_reference_start_position,
182 zero_based_reference_start_position + reference_length));
183 // query_length already accounts for soft clipping at begin and end
184 assign_unaligned(get<1>(alignment), query | views::slice(soft_clipping_start, soft_clipping_start + query_length));
185
186 // Insert gaps into the alignment based on the cigar vector
187 // -------------------------------------------------------------------------
188 using std::get;
189 auto current_ref_pos = std::ranges::begin(get<0>(alignment));
190 auto current_read_pos = std::ranges::begin(get<1>(alignment));
191
192 for (auto [cigar_count, cigar_operation] : cigar_vector)
193 {
194 if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation)
195 || ('X'_cigar_operation == cigar_operation))
196 {
197 std::ranges::advance(current_ref_pos, cigar_count);
198 std::ranges::advance(current_read_pos, cigar_count);
199 }
200 else if (('D'_cigar_operation == cigar_operation) || ('N'_cigar_operation == cigar_operation))
201 {
202 // insert gaps into query
203 current_read_pos = get<1>(alignment).insert_gap(current_read_pos, cigar_count);
204 ++current_read_pos;
205 std::ranges::advance(current_ref_pos, cigar_count);
206 }
207 else if (('I'_cigar_operation == cigar_operation)) // Insert gaps into ref
208 {
209 current_ref_pos = get<0>(alignment).insert_gap(current_ref_pos, cigar_count);
210 ++current_ref_pos;
211 std::ranges::advance(current_read_pos, cigar_count);
212 }
213 else if (('P'_cigar_operation == cigar_operation)) // skip padding
214 {
215 current_ref_pos = get<0>(alignment).insert_gap(current_ref_pos, cigar_count);
216 ++current_ref_pos;
217
218 current_read_pos = get<1>(alignment).insert_gap(current_read_pos, cigar_count);
219 ++current_read_pos;
220 }
221 // S and H are ignored as they are handled by cropping the sequence
222 }
223
224 return alignment;
225}
226
230template <typename reference_type, typename sequence_type>
231inline auto alignment_from_cigar(std::string const & cigar_string,
232 reference_type const & reference,
233 uint32_t const zero_based_reference_start_position,
234 sequence_type const & query)
235{
236 alignment_from_cigar(detail::parse_cigar(cigar_string), reference, zero_based_reference_start_position, query);
237}
238
239} // namespace seqan3
Includes the aligned_sequence and the related insert_gap and erase_gap functions to enable stl contai...
Provides the seqan3::cigar alphabet.
T begin(T... args)
A gap decorator allows the annotation of sequences with gap symbols while leaving the underlying sequ...
Definition: gap_decorator.hpp:81
T empty(T... args)
Provides seqan3::gap_decorator.
auto alignment_from_cigar(std::vector< cigar > const &cigar_vector, reference_type const &reference, uint32_t const zero_based_reference_start_position, sequence_type const &query)
Construct an alignment from a CIGAR string and the corresponding sequences.
Definition: alignment_from_cigar.hpp:84
@ alignment
The (pairwise) alignment stored in an object that models seqan3::detail::pairwise_alignment.
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
Auxiliary functions for the SAM IO.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
constexpr auto const & get(configuration< configs_t... > const &config) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: configuration.hpp:415
T size(T... args)
Provides seqan3::views::slice.
T to_string(T... args)