diff --git a/pysam/libcalignedsegment.pyi b/pysam/libcalignedsegment.pyi index bea806e7..2e407a0b 100644 --- a/pysam/libcalignedsegment.pyi +++ b/pysam/libcalignedsegment.pyi @@ -136,7 +136,7 @@ class AlignedSegment: def get_forward_sequence(self) -> Optional[str]: ... def get_forward_qualities(self) -> Optional[array]: ... def get_aligned_pairs( - self, matches_only: bool = ..., with_seq: bool = ... + self, matches_only: bool = ..., with_seq: bool = ..., skip_soft_clipping: bool = ... ) -> List[Tuple[int, int]]: ... def get_blocks(self) -> List[Tuple[int, int]]: ... def get_overlap(self, start: int, end: int) -> Optional[int]: ... diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index 3071f375..86c30fc0 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -1973,7 +1973,7 @@ cdef class AlignedSegment: return self.query_qualities - def get_aligned_pairs(self, matches_only=False, with_seq=False): + def get_aligned_pairs(self, matches_only=False, with_seq=False, skip_soft_clipping=False): """a list of aligned read (query) and reference positions. Each item in the returned list is a tuple consisting of @@ -1997,6 +1997,8 @@ cdef class AlignedSegment: reference sequence. For CIGAR 'P' (padding in the reference) operations, the third tuple element will be None. Substitutions are lower-case. This option requires an MD tag to be present. + skip_soft_clipping: bool + If True, soft-clipped regions are not returned but ignored. Returns ------- @@ -2010,6 +2012,7 @@ cdef class AlignedSegment: cdef bam1_t * src = self._delegate cdef bint _matches_only = bool(matches_only) cdef bint _with_seq = bool(with_seq) + cdef bint _skip_soft_clipping = bool(skip_soft_clipping) # TODO: this method performs no checking and assumes that # read sequence, cigar and MD tag are consistent. @@ -2046,7 +2049,7 @@ cdef class AlignedSegment: pos += l elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD: - if not _matches_only: + if not (_matches_only or (_skip_soft_clipping and op == BAM_CSOFT_CLIP)): if _with_seq: for i from pos <= i < pos + l: result.append((qpos, None, None)) diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index ea7886fa..c3b397e8 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -360,6 +360,54 @@ def testPositions(self): ], ) + # Insertions should not be affected by skip_soft_clipping + self.assertEqual( + a.get_aligned_pairs(matches_only = False, skip_soft_clipping = True), + [ + (0, 20), + (1, 21), + (2, 22), + (3, 23), + (4, 24), + (5, 25), + (6, 26), + (7, 27), + (8, 28), + (9, 29), + (None, 30), + (10, 31), + (11, 32), + (12, 33), + (13, 34), + (14, 35), + (15, 36), + (16, 37), + (17, 38), + (18, 39), + (19, None), + (20, 40), + (21, 41), + (22, 42), + (23, 43), + (24, 44), + (25, 45), + (26, 46), + (27, 47), + (28, 48), + (29, 49), + (30, 50), + (31, 51), + (32, 52), + (33, 53), + (34, 54), + (35, 55), + (36, 56), + (37, 57), + (38, 58), + (39, 59), + ], + ) + self.assertEqual( a.get_reference_positions(), [ @@ -453,6 +501,16 @@ def test_get_aligned_pairs_soft_clipping(self): ] # [(37, None), (38, None), (39, None)] ) + # Soft-clipping should be ignored with skip_soft_clipping + self.assertEqual( + a.get_aligned_pairs(matches_only=False, skip_soft_clipping=True), + # [(0, None), (1, None)] + + [ + (qpos, refpos) + for (qpos, refpos) in zip(range(2, 2 + 35), range(20, 20 + 35)) + ] + # [(37, None), (38, None), (39, None)] + ) def test_get_aligned_pairs_hard_clipping(self): a = self.build_read() @@ -524,6 +582,8 @@ def test_get_aligned_pairs_padding(self): # See comment in test_get_aligned_pairs_padding_with_seq (below). self.assertEqual(a.get_aligned_pairs(), [(0, 20), (1, None), (2, 21)]) + self.assertEqual(a.get_aligned_pairs(matches_only=False, skip_soft_clipping=True), + [(0, 20), (1, None), (2, 21)]) def test_get_aligned_pairs_padding_with_seq(self): a = self.build_read() @@ -552,6 +612,8 @@ def test_get_aligned_pairs_padding_with_seq(self): # sequences inserted relative to the reference." self.assertEqual(a.get_aligned_pairs(with_seq=True), [(0, 20, 'A'), (1, None, None), (2, 21, 'T')]) + self.assertEqual(a.get_aligned_pairs(with_seq=True, matches_only=False, skip_soft_clipping=True), + [(0, 20, 'A'), (1, None, None), (2, 21, 'T')]) def test_get_aligned_pairs(self): a = self.build_read()