Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature skip softclip #1293

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pysam/libcalignedsegment.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ class AlignedSegment:
def get_forward_sequence(self) -> Optional[str]: ...
def get_forward_qualities(self) -> Optional[array]: ...
def get_aligned_pairs(
self, matches_only: bool = ..., with_seq: bool = ...
self, matches_only: bool = ..., with_seq: bool = ..., skip_soft_clipping: bool = ...
) -> List[Tuple[int, int]]: ...
def get_blocks(self) -> List[Tuple[int, int]]: ...
def get_overlap(self, start: int, end: int) -> Optional[int]: ...
Expand Down
7 changes: 5 additions & 2 deletions pysam/libcalignedsegment.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1973,7 +1973,7 @@ cdef class AlignedSegment:
return self.query_qualities


def get_aligned_pairs(self, matches_only=False, with_seq=False):
def get_aligned_pairs(self, matches_only=False, with_seq=False, skip_soft_clipping=False):
"""a list of aligned read (query) and reference positions.

Each item in the returned list is a tuple consisting of
Expand All @@ -1997,6 +1997,8 @@ cdef class AlignedSegment:
reference sequence. For CIGAR 'P' (padding in the reference)
operations, the third tuple element will be None. Substitutions
are lower-case. This option requires an MD tag to be present.
skip_soft_clipping: bool
If True, soft-clipped regions are not returned but ignored.

Returns
-------
Expand All @@ -2010,6 +2012,7 @@ cdef class AlignedSegment:
cdef bam1_t * src = self._delegate
cdef bint _matches_only = bool(matches_only)
cdef bint _with_seq = bool(with_seq)
cdef bint _skip_soft_clipping = bool(skip_soft_clipping)

# TODO: this method performs no checking and assumes that
# read sequence, cigar and MD tag are consistent.
Expand Down Expand Up @@ -2046,7 +2049,7 @@ cdef class AlignedSegment:
pos += l

elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD:
if not _matches_only:
if not (_matches_only or (_skip_soft_clipping and op == BAM_CSOFT_CLIP)):
if _with_seq:
for i from pos <= i < pos + l:
result.append((qpos, None, None))
Expand Down
62 changes: 62 additions & 0 deletions tests/AlignedSegment_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,54 @@ def testPositions(self):
],
)

# Insertions should not be affected by skip_soft_clipping
self.assertEqual(
a.get_aligned_pairs(matches_only = False, skip_soft_clipping = True),
[
(0, 20),
(1, 21),
(2, 22),
(3, 23),
(4, 24),
(5, 25),
(6, 26),
(7, 27),
(8, 28),
(9, 29),
(None, 30),
(10, 31),
(11, 32),
(12, 33),
(13, 34),
(14, 35),
(15, 36),
(16, 37),
(17, 38),
(18, 39),
(19, None),
(20, 40),
(21, 41),
(22, 42),
(23, 43),
(24, 44),
(25, 45),
(26, 46),
(27, 47),
(28, 48),
(29, 49),
(30, 50),
(31, 51),
(32, 52),
(33, 53),
(34, 54),
(35, 55),
(36, 56),
(37, 57),
(38, 58),
(39, 59),
],
)

self.assertEqual(
a.get_reference_positions(),
[
Expand Down Expand Up @@ -453,6 +501,16 @@ def test_get_aligned_pairs_soft_clipping(self):
]
# [(37, None), (38, None), (39, None)]
)
# Soft-clipping should be ignored with skip_soft_clipping
self.assertEqual(
a.get_aligned_pairs(matches_only=False, skip_soft_clipping=True),
# [(0, None), (1, None)] +
[
(qpos, refpos)
for (qpos, refpos) in zip(range(2, 2 + 35), range(20, 20 + 35))
]
# [(37, None), (38, None), (39, None)]
)

def test_get_aligned_pairs_hard_clipping(self):
a = self.build_read()
Expand Down Expand Up @@ -524,6 +582,8 @@ def test_get_aligned_pairs_padding(self):
# See comment in test_get_aligned_pairs_padding_with_seq (below).
self.assertEqual(a.get_aligned_pairs(),
[(0, 20), (1, None), (2, 21)])
self.assertEqual(a.get_aligned_pairs(matches_only=False, skip_soft_clipping=True),
[(0, 20), (1, None), (2, 21)])

def test_get_aligned_pairs_padding_with_seq(self):
a = self.build_read()
Expand Down Expand Up @@ -552,6 +612,8 @@ def test_get_aligned_pairs_padding_with_seq(self):
# sequences inserted relative to the reference."
self.assertEqual(a.get_aligned_pairs(with_seq=True),
[(0, 20, 'A'), (1, None, None), (2, 21, 'T')])
self.assertEqual(a.get_aligned_pairs(with_seq=True, matches_only=False, skip_soft_clipping=True),
[(0, 20, 'A'), (1, None, None), (2, 21, 'T')])

def test_get_aligned_pairs(self):
a = self.build_read()
Expand Down