diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index f77b442..698b1ea 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -668,6 +668,35 @@ def build( return PairOrientation.RF +def isize(r1: AlignedSegment, r2: Optional[AlignedSegment] = None) -> int: + """Computes the insert size for a pair of records.""" + if r2 is None: + r2_is_unmapped = r1.mate_is_unmapped + r2_reference_id = r1.next_reference_id + else: + r2_is_unmapped = r2.is_unmapped + r2_reference_id = r2.reference_id + + if r1.is_unmapped or r2_is_unmapped or r1.reference_id != r2_reference_id: + return 0 + + if r2 is None: + if not r1.has_tag("MC"): + raise ValueError('Cannot determine proper pair status without R2\'s cigar ("MC")!') + r2_cigar = Cigar.from_cigarstring(str(r1.get_tag("MC"))) + r2_is_reverse = r1.mate_is_reverse + r2_reference_start = r1.next_reference_start + r2_reference_end = r1.next_reference_start + r2_cigar.length_on_target() + else: + r2_is_reverse = r2.is_reverse + r2_reference_start = r2.reference_start + r2_reference_end = r2.reference_end + + r1_pos = r1.reference_end if r1.is_reverse else r1.reference_start + r2_pos = r2_reference_end if r2_is_reverse else r2_reference_start + return r2_pos - r1_pos + + DefaultProperlyPairedOrientations = {PairOrientation.FR} """The default orientations for properly paired reads.""" @@ -677,6 +706,7 @@ def is_proper_pair( r2: Optional[AlignedSegment] = None, max_insert_size: int = 1000, orientations: set[PairOrientation] = DefaultProperlyPairedOrientations, + isize: Callable[[AlignedSegment, AlignedSegment], int] = isize, ) -> bool: """Determines if a read pair is properly paired or not. @@ -691,6 +721,7 @@ def is_proper_pair( r2: The second read in the template. If undefined, mate data set upon R1 will be used. max_insert_size: The maximum insert size to consider a read pair "proper". orientations: The valid set of orientations to consider a read pair "proper". + isize: A function that takes the two alignments and calculates their isize. See: [`htsjdk.samtools.SamPairUtil.isProperPair()`](https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L106-L125) @@ -707,9 +738,7 @@ def is_proper_pair( and r2_is_mapped and r1.reference_id == r2_reference_id and PairOrientation.build(r1, r2) in orientations - # TODO: consider replacing with `abs(isize(r1, r2)) <= max_insert_size` - # which can only be done if isize() is modified to allow for an optional R2. - and 0 < abs(r1.template_length) <= max_insert_size + and 0 < abs(isize(r1, r2)) <= max_insert_size ) @@ -794,16 +823,6 @@ def from_read(cls, read: pysam.AlignedSegment) -> List["SupplementaryAlignment"] return [] -def isize(r1: AlignedSegment, r2: AlignedSegment) -> int: - """Computes the insert size for a pair of records.""" - if r1.is_unmapped or r2.is_unmapped or r1.reference_id != r2.reference_id: - return 0 - else: - r1_pos = r1.reference_end if r1.is_reverse else r1.reference_start - r2_pos = r2.reference_end if r2.is_reverse else r2.reference_start - return r2_pos - r1_pos - - def sum_of_base_qualities(rec: AlignedSegment, min_quality_score: int = 15) -> int: """Calculate the sum of base qualities score for an alignment record. diff --git a/tests/fgpyo/sam/test_sam.py b/tests/fgpyo/sam/test_sam.py index 6c1daaf..cb60ab5 100755 --- a/tests/fgpyo/sam/test_sam.py +++ b/tests/fgpyo/sam/test_sam.py @@ -492,7 +492,16 @@ def test_not_is_proper_pair_if_too_far_apart() -> None: assert not is_proper_pair(r1, r2) -def test_isize() -> None: +def test_is_not_proper_pair_with_custom_isize_func() -> None: + """Test that reads are not properly paired because of a custom isize function.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, start2=100) + assert is_proper_pair(r1, r2) + assert not is_proper_pair(r1, r2, isize=lambda a, b: False) + + +def test_isize_when_r2_defined() -> None: + """Tests that an insert size can be calculated when both input records are defined.""" builder = SamBuilder() r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") assert sam.isize(r1, r2) == 190 @@ -502,6 +511,40 @@ def test_isize() -> None: assert sam.isize(r1, r2) == 0 +def test_isize_when_r2_undefined() -> None: + """Tests that an insert size can be calculated when R1 is provided only.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + assert sam.isize(r1) == 190 + assert sam.isize(r2) == -190 + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M") + assert sam.isize(r1) == 0 + assert sam.isize(r2) == 0 + + +def test_isize_when_r2_undefined_indels_in_r2_cigar() -> None: + """Tests that an insert size can be derived without R2 by using R2's cigar.""" + builder = SamBuilder() + r1, _ = builder.add_pair( + chrom="chr1", + start1=100, + cigar1="115M", + start2=250, + cigar2="10S5M1D1M1D2I2D30M", # only 40bp reference-consuming operators + ) + assert sam.isize(r1) == 190 + + +def test_isize_raises_when_r2_not_provided_and_no_mate_cigar_tag() -> None: + """Tests that an insert size can be calculated when both input records are defined.""" + builder = SamBuilder() + r1, _ = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.set_tag("MC", None) + with pytest.raises(ValueError, match="Cannot determine proper pair status without R2's cigar"): + sam.isize(r1) + + def test_sum_of_base_qualities() -> None: builder = SamBuilder(r1_len=5, r2_len=5) single = builder.add_single(quals=[1, 2, 3, 4, 5])