Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Insert size filtering #214

Merged
merged 11 commits into from
Nov 11, 2023
20 changes: 20 additions & 0 deletions functional-tests.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,26 @@ assert_exit_code 0
assert_equal $(zgrep -c "MT" t.per-base.bed.gz) 2
assert_equal "MT 0 16569 0.00" "$(zgrep ^MT t.regions.bed.gz)"


# fragment length filtering
run length_filter $exe t tests/ovl.bam --min-frag-len 80 --max-frag-len 80
assert_exit_code 0
assert_equal $(zgrep -c "MT" t.per-base.bed.gz) 2
assert_equal "MT 0 80 1 MT 80 16569 0 " "$(zgrep ^MT t.per-base.bed.gz | tr -s '[:space:]' ' ')"

run length_filter $exe t tests/ovl.bam --min-frag-len 81
assert_exit_code 0
assert_equal "MT 0 16569 0" "$(zgrep ^MT t.per-base.bed.gz)"

run length_filter $exe t tests/ovl.bam --max-frag-len 79
assert_exit_code 0
assert_equal "MT 0 16569 0" "$(zgrep ^MT t.per-base.bed.gz)"

run bad_frag_len_filter $exe t tests/ovl.bam --min-frag-len 10 --max-frag-len 9
assert_in_stderr "--max-frag-len was lower than --min-frag-len."
assert_exit_code 1


unset MOSDEPTH_Q0
unset MOSDEPTH_Q1
unset MOSDEPTH_Q2
Expand Down
43 changes: 27 additions & 16 deletions mosdepth.nim
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ proc init(arr: var coverage_t, tlen:int) =
arr.set_len(int(tlen))
zeroMem(arr[0].addr, len(arr) * sizeof(arr[0]))

proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int= -1, eflag: uint16=1796, iflag:uint16=0, read_groups:seq[string]=(@[]), fast_mode:bool=false): int =
proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int= -1, min_len:int= -1, max_len:int=int.high, eflag: uint16=1796, iflag:uint16=0, read_groups:seq[string]=(@[]), fast_mode:bool=false): int =
# depth updates arr in-place and yields the tid for each chrom.
# returns -1 if the chrom is not found in the bam header
# returns -2 if the chrom was found in the header, but there was no data for it
Expand All @@ -253,6 +253,7 @@ proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int=
arr.init(int(tgt.length+1))
found = true
if int(rec.mapping_quality) < mapq: continue
if int(abs(rec.isize)) < min_len or int(abs(rec.isize)) > max_len: continue
if (rec.flag and eflag) != 0:
continue
if iflag != 0 and ((rec.flag and iflag) == 0):
Expand Down Expand Up @@ -544,7 +545,7 @@ proc to_tuples(targets:seq[Target]): seq[tuple[name:string, length:int]] =
for i, t in targets:
result[i] = (t.name, t.length.int)

proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16, region: string, thresholds: seq[int],
proc main(bam: hts.Bam, chrom: region_t, mapq: int, min_len: int, max_len: int, eflag: uint16, iflag: uint16, region: string, thresholds: seq[int],
fast_mode:bool, args: Table[string, docopt.Value], use_median:bool=false, use_d4:bool=false) =
# windows are either from regions, or fixed-length windows.
# we assume the input is sorted by chrom.
Expand Down Expand Up @@ -638,7 +639,7 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
if skip_per_base and thresholds.len == 0 and quantize.len == 0 and bed_regions != nil and not bed_regions.contains(target.name):
continue
rchrom = region_t(chrom: target.name)
var tid = coverage(bam, arr, rchrom, mapq, eflag, iflag, read_groups=read_groups, fast_mode=fast_mode)
var tid = coverage(bam, arr, rchrom, mapq, min_len, max_len, eflag, iflag, read_groups=read_groups, fast_mode=fast_mode)
if tid == -1: continue # -1 means that chrom is not even in the bam
if tid != -2: # -2 means there were no reads in the bam
arr.to_coverage()
Expand Down Expand Up @@ -816,17 +817,19 @@ Common Options:

Other options:

-F --flag <FLAG> exclude reads with any of the bits in FLAG set [default: 1796]
-i --include-flag <FLAG> only include reads with any of the bits in FLAG set. default is unset. [default: 0]
-x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases).
-q --quantize <segments> write quantized output see docs for description.
-Q --mapq <mapq> mapping quality threshold. reads with a quality less than this value are ignored [default: 0]
-T --thresholds <thresholds> for each interval in --by, write number of bases covered by at
least threshold bases. Specify multiple integer values separated
by ','.
-m --use-median output median of each region (in --by) instead of mean.
-R --read-groups <string> only calculate depth for these comma-separated read groups IDs.
-h --help show help
-F --flag <FLAG> exclude reads with any of the bits in FLAG set [default: 1796]
-i --include-flag <FLAG> only include reads with any of the bits in FLAG set. default is unset. [default: 0]
-x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases).
-q --quantize <segments> write quantized output see docs for description.
-Q --mapq <mapq> mapping quality threshold. reads with a quality less than this value are ignored [default: 0]
-l --min-frag-len <min-frag-len> minimum insert size. reads with a smaller insert size than this are ignored [default: -1]
-u --max-frag-len <max-frag-len> maximum insert size. reads with a larger insert size than this are ignored. [default: -1]
-T --thresholds <thresholds> for each interval in --by, write number of bases covered by at
least threshold bases. Specify multiple integer values separated
by ','.
-m --use-median output median of each region (in --by) instead of mean.
-R --read-groups <string> only calculate depth for these comma-separated read groups IDs.
-h --help show help
"""

var args: Table[string, Value]
Expand All @@ -837,6 +840,14 @@ Other options:
quit "error parsing arguments"

let mapq = S.parse_int($args["--mapq"])
let min_len = S.parse_int($args["--min-frag-len"])
var max_len = S.parse_int($args["--max-frag-len"])
if max_len < 0:
max_len = int.high
if max_len < min_len:
stderr.write_line("[mosdepth] error --max-frag-len was lower than --min-frag-len.")
quit(2)

var
region: string
thresholds: seq[int] = threshold_args($args["--thresholds"])
Expand Down Expand Up @@ -870,7 +881,7 @@ Other options:
stderr.write_line("[mosdepth] error alignment file must be indexed")
quit(2)

var opts = SamField.SAM_FLAG.int or SamField.SAM_RNAME.int or SamField.SAM_POS.int or SamField.SAM_MAPQ.int or SamField.SAM_CIGAR.int
var opts = SamField.SAM_FLAG.int or SamField.SAM_RNAME.int or SamField.SAM_POS.int or SamField.SAM_MAPQ.int or SamField.SAM_CIGAR.int or SamField.SAM_TLEN.int
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assumed this was necessary but I haven't actually tested if it's the case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And if it is correct, we can remove the comment in line 883

if not fast_mode:
opts = opts or SamField.SAM_QNAME.int or SamField.SAM_RNEXT.int or SamField.SAM_PNEXT.int #or SamField.SAM_TLEN.int

Expand All @@ -881,4 +892,4 @@ Other options:
discard bam.set_option(FormatOption.CRAM_OPT_DECODE_MD, 0)
check_chrom(chrom, bam.hdr.targets)

main(bam, chrom, mapq, eflag, iflag, region, thresholds, fast_mode, args, use_median=use_median, use_d4=use_d4)
main(bam, chrom, mapq, min_len, max_len, eflag, iflag, region, thresholds, fast_mode, args, use_median=use_median, use_d4=use_d4)
Loading