From 0f305c9d1b777e2ab8a838fc3203adf86c5656c7 Mon Sep 17 00:00:00 2001 From: Ludvig Date: Wed, 1 Nov 2023 19:04:58 +0100 Subject: [PATCH 01/11] Adds length filter in coverage --- mosdepth.nim | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mosdepth.nim b/mosdepth.nim index b793243..6e8448a 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -229,7 +229,7 @@ proc init(arr: var coverage_t, tlen:int) = arr.set_len(int(tlen)) zeroMem(arr[0].addr, len(arr) * sizeof(arr[0])) -proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int= -1, eflag: uint16=1796, iflag:uint16=0, read_groups:seq[string]=(@[]), fast_mode:bool=false): int = +proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int= -1, min_len:int= -1, max_len:int=int.high, eflag: uint16=1796, iflag:uint16=0, read_groups:seq[string]=(@[]), fast_mode:bool=false): int = # depth updates arr in-place and yields the tid for each chrom. # returns -1 if the chrom is not found in the bam header # returns -2 if the chrom was found in the header, but there was no data for it @@ -253,6 +253,7 @@ proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int= arr.init(int(tgt.length+1)) found = true if int(rec.mapping_quality) < mapq: continue + if int(abs(rec.tlen)) < min_len or int(abs(rec.tlen)) > max_len: continue if (rec.flag and eflag) != 0: continue if iflag != 0 and ((rec.flag and iflag) == 0): From 08f186bcedfe11f0896c3138bcc338a71ad56da8 Mon Sep 17 00:00:00 2001 From: Ludvig Date: Wed, 1 Nov 2023 19:33:30 +0100 Subject: [PATCH 02/11] Adds min_len and max_len to CLI --- mosdepth.nim | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mosdepth.nim b/mosdepth.nim index 6e8448a..8d607b2 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -822,6 +822,8 @@ Other options: -x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases). -q --quantize write quantized output see docs for description. -Q --mapq mapping quality threshold. reads with a quality less than this value are ignored [default: 0] + -l --min_len minimum fragment length. reads with a smaller template length than this are ignored [default: 0] + -u --max_len maximum fragment length. reads with a larger template length than this are ignored. [default: -1] -T --thresholds for each interval in --by, write number of bases covered by at least threshold bases. Specify multiple integer values separated by ','. @@ -838,6 +840,11 @@ Other options: quit "error parsing arguments" let mapq = S.parse_int($args["--mapq"]) + let min_len = S.parse_int($args["--min_len"]) + var max_len = S.parse_int($args["--max_len"]) + if max_len < 0: + let max_len = int.high + var region: string thresholds: seq[int] = threshold_args($args["--thresholds"]) @@ -871,7 +878,7 @@ Other options: stderr.write_line("[mosdepth] error alignment file must be indexed") quit(2) - var opts = SamField.SAM_FLAG.int or SamField.SAM_RNAME.int or SamField.SAM_POS.int or SamField.SAM_MAPQ.int or SamField.SAM_CIGAR.int + var opts = SamField.SAM_FLAG.int or SamField.SAM_RNAME.int or SamField.SAM_POS.int or SamField.SAM_MAPQ.int or SamField.SAM_CIGAR.int or SamField.SAM_TLEN.int if not fast_mode: opts = opts or SamField.SAM_QNAME.int or SamField.SAM_RNEXT.int or SamField.SAM_PNEXT.int #or SamField.SAM_TLEN.int @@ -882,4 +889,4 @@ Other options: discard bam.set_option(FormatOption.CRAM_OPT_DECODE_MD, 0) check_chrom(chrom, bam.hdr.targets) - main(bam, chrom, mapq, eflag, iflag, region, thresholds, fast_mode, args, use_median=use_median, use_d4=use_d4) + main(bam, chrom, mapq, min_len, max_len, eflag, iflag, region, thresholds, fast_mode, args, use_median=use_median, use_d4=use_d4) From 6f8bc26f3d32e6ec990a9d5ac287e64a80ca1bc0 Mon Sep 17 00:00:00 2001 From: Ludvig Date: Wed, 1 Nov 2023 20:48:27 +0100 Subject: [PATCH 03/11] tries isize instead of tlen --- mosdepth.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mosdepth.nim b/mosdepth.nim index 8d607b2..6fedd47 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -253,7 +253,7 @@ proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int= arr.init(int(tgt.length+1)) found = true if int(rec.mapping_quality) < mapq: continue - if int(abs(rec.tlen)) < min_len or int(abs(rec.tlen)) > max_len: continue + if int(abs(rec.isize)) < min_len or int(abs(rec.isize)) > max_len: continue if (rec.flag and eflag) != 0: continue if iflag != 0 and ((rec.flag and iflag) == 0): From a6c030b086ca08607d588715610ba4576fe366ec Mon Sep 17 00:00:00 2001 From: Ludvig Date: Wed, 1 Nov 2023 20:53:10 +0100 Subject: [PATCH 04/11] Passes min_len and max_len to coverage() --- mosdepth.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mosdepth.nim b/mosdepth.nim index 6fedd47..53d5daa 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -545,7 +545,7 @@ proc to_tuples(targets:seq[Target]): seq[tuple[name:string, length:int]] = for i, t in targets: result[i] = (t.name, t.length.int) -proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16, region: string, thresholds: seq[int], +proc main(bam: hts.Bam, chrom: region_t, mapq: int, min_len: int, max_len: int, eflag: uint16, iflag: uint16, region: string, thresholds: seq[int], fast_mode:bool, args: Table[string, docopt.Value], use_median:bool=false, use_d4:bool=false) = # windows are either from regions, or fixed-length windows. # we assume the input is sorted by chrom. @@ -639,7 +639,7 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16 if skip_per_base and thresholds.len == 0 and quantize.len == 0 and bed_regions != nil and not bed_regions.contains(target.name): continue rchrom = region_t(chrom: target.name) - var tid = coverage(bam, arr, rchrom, mapq, eflag, iflag, read_groups=read_groups, fast_mode=fast_mode) + var tid = coverage(bam, arr, rchrom, mapq, min_len, max_len, eflag, iflag, read_groups=read_groups, fast_mode=fast_mode) if tid == -1: continue # -1 means that chrom is not even in the bam if tid != -2: # -2 means there were no reads in the bam arr.to_coverage() From a4a160f5ebc3bb132126d1142a5a195bcfba3ea6 Mon Sep 17 00:00:00 2001 From: Ludvig Date: Wed, 1 Nov 2023 21:10:36 +0100 Subject: [PATCH 05/11] replaces template length with insert size in docs --- mosdepth.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mosdepth.nim b/mosdepth.nim index 53d5daa..6dd9d96 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -822,8 +822,8 @@ Other options: -x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases). -q --quantize write quantized output see docs for description. -Q --mapq mapping quality threshold. reads with a quality less than this value are ignored [default: 0] - -l --min_len minimum fragment length. reads with a smaller template length than this are ignored [default: 0] - -u --max_len maximum fragment length. reads with a larger template length than this are ignored. [default: -1] + -l --min_len minimum fragment length. reads with a smaller insert size than this are ignored [default: 0] + -u --max_len maximum fragment length. reads with a larger insert size than this are ignored. [default: -1] -T --thresholds for each interval in --by, write number of bases covered by at least threshold bases. Specify multiple integer values separated by ','. From 53ba5931216a8815b16ff66725cdbfc07cc585bd Mon Sep 17 00:00:00 2001 From: Ludvig Date: Wed, 1 Nov 2023 21:39:14 +0100 Subject: [PATCH 06/11] Removes bad let --- mosdepth.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mosdepth.nim b/mosdepth.nim index 6dd9d96..8a19b06 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -843,7 +843,7 @@ Other options: let min_len = S.parse_int($args["--min_len"]) var max_len = S.parse_int($args["--max_len"]) if max_len < 0: - let max_len = int.high + max_len = int.high var region: string From 1bf2bbd1e66dcfc575942dc0721be3a5e42d86e5 Mon Sep 17 00:00:00 2001 From: Ludvig Date: Wed, 1 Nov 2023 22:00:35 +0100 Subject: [PATCH 07/11] replaces fragment length with insert size in docs --- mosdepth.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mosdepth.nim b/mosdepth.nim index 8a19b06..0ca2d8c 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -822,8 +822,8 @@ Other options: -x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases). -q --quantize write quantized output see docs for description. -Q --mapq mapping quality threshold. reads with a quality less than this value are ignored [default: 0] - -l --min_len minimum fragment length. reads with a smaller insert size than this are ignored [default: 0] - -u --max_len maximum fragment length. reads with a larger insert size than this are ignored. [default: -1] + -l --min_len minimum insert size. reads with a smaller insert size than this are ignored [default: 0] + -u --max_len maximum insert size. reads with a larger insert size than this are ignored. [default: -1] -T --thresholds for each interval in --by, write number of bases covered by at least threshold bases. Specify multiple integer values separated by ','. From d35fe31899c8af5de11c2fe66eddcf75cfdad5df Mon Sep 17 00:00:00 2001 From: Ludvig Date: Thu, 2 Nov 2023 12:40:56 +0100 Subject: [PATCH 08/11] Sets min_len default to -1 Otherwise, this PR might cause reads with insert size 0 to be excluded, which might be okay but is not the intended change I want to make --- mosdepth.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mosdepth.nim b/mosdepth.nim index 0ca2d8c..5c52d58 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -822,7 +822,7 @@ Other options: -x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases). -q --quantize write quantized output see docs for description. -Q --mapq mapping quality threshold. reads with a quality less than this value are ignored [default: 0] - -l --min_len minimum insert size. reads with a smaller insert size than this are ignored [default: 0] + -l --min_len minimum insert size. reads with a smaller insert size than this are ignored [default: -1] -u --max_len maximum insert size. reads with a larger insert size than this are ignored. [default: -1] -T --thresholds for each interval in --by, write number of bases covered by at least threshold bases. Specify multiple integer values separated From 5df473fdd6b51c28407c1264bf3e560b9dbd11ca Mon Sep 17 00:00:00 2001 From: Ludvig Date: Tue, 7 Nov 2023 20:53:17 +0100 Subject: [PATCH 09/11] Adheres to argument naming standard ("-" not "_") --- mosdepth.nim | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mosdepth.nim b/mosdepth.nim index 5c52d58..202025c 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -822,8 +822,8 @@ Other options: -x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases). -q --quantize write quantized output see docs for description. -Q --mapq mapping quality threshold. reads with a quality less than this value are ignored [default: 0] - -l --min_len minimum insert size. reads with a smaller insert size than this are ignored [default: -1] - -u --max_len maximum insert size. reads with a larger insert size than this are ignored. [default: -1] + -l --min-len minimum insert size. reads with a smaller insert size than this are ignored [default: -1] + -u --max-len maximum insert size. reads with a larger insert size than this are ignored. [default: -1] -T --thresholds for each interval in --by, write number of bases covered by at least threshold bases. Specify multiple integer values separated by ','. @@ -840,8 +840,8 @@ Other options: quit "error parsing arguments" let mapq = S.parse_int($args["--mapq"]) - let min_len = S.parse_int($args["--min_len"]) - var max_len = S.parse_int($args["--max_len"]) + let min_len = S.parse_int($args["--min-len"]) + var max_len = S.parse_int($args["--max-len"]) if max_len < 0: max_len = int.high From 39f73bdedeeaf77dc3ec23fa56759578727f0583 Mon Sep 17 00:00:00 2001 From: Ludvig Date: Thu, 9 Nov 2023 13:28:07 +0100 Subject: [PATCH 10/11] Updates argnames with "-frag-". Adds error when max < min. --- mosdepth.nim | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/mosdepth.nim b/mosdepth.nim index 202025c..6ac153b 100644 --- a/mosdepth.nim +++ b/mosdepth.nim @@ -817,19 +817,19 @@ Common Options: Other options: - -F --flag exclude reads with any of the bits in FLAG set [default: 1796] - -i --include-flag only include reads with any of the bits in FLAG set. default is unset. [default: 0] - -x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases). - -q --quantize write quantized output see docs for description. - -Q --mapq mapping quality threshold. reads with a quality less than this value are ignored [default: 0] - -l --min-len minimum insert size. reads with a smaller insert size than this are ignored [default: -1] - -u --max-len maximum insert size. reads with a larger insert size than this are ignored. [default: -1] - -T --thresholds for each interval in --by, write number of bases covered by at - least threshold bases. Specify multiple integer values separated - by ','. - -m --use-median output median of each region (in --by) instead of mean. - -R --read-groups only calculate depth for these comma-separated read groups IDs. - -h --help show help + -F --flag exclude reads with any of the bits in FLAG set [default: 1796] + -i --include-flag only include reads with any of the bits in FLAG set. default is unset. [default: 0] + -x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases). + -q --quantize write quantized output see docs for description. + -Q --mapq mapping quality threshold. reads with a quality less than this value are ignored [default: 0] + -l --min-frag-len minimum insert size. reads with a smaller insert size than this are ignored [default: -1] + -u --max-frag-len maximum insert size. reads with a larger insert size than this are ignored. [default: -1] + -T --thresholds for each interval in --by, write number of bases covered by at + least threshold bases. Specify multiple integer values separated + by ','. + -m --use-median output median of each region (in --by) instead of mean. + -R --read-groups only calculate depth for these comma-separated read groups IDs. + -h --help show help """ var args: Table[string, Value] @@ -840,10 +840,13 @@ Other options: quit "error parsing arguments" let mapq = S.parse_int($args["--mapq"]) - let min_len = S.parse_int($args["--min-len"]) - var max_len = S.parse_int($args["--max-len"]) + let min_len = S.parse_int($args["--min-frag-len"]) + var max_len = S.parse_int($args["--max-frag-len"]) if max_len < 0: max_len = int.high + if max_len < min_len: + stderr.write_line("[mosdepth] error --max-frag-len was lower than --min-frag-len.") + quit(2) var region: string From 4bd8f0c5246f42441826b80739269b7602d2259e Mon Sep 17 00:00:00 2001 From: Ludvig Date: Fri, 10 Nov 2023 19:18:38 +0100 Subject: [PATCH 11/11] Adds tests of fragment length filtering --- functional-tests.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) mode change 100644 => 100755 functional-tests.sh diff --git a/functional-tests.sh b/functional-tests.sh old mode 100644 new mode 100755 index 457e1d7..2d4af6b --- a/functional-tests.sh +++ b/functional-tests.sh @@ -58,6 +58,26 @@ assert_exit_code 0 assert_equal $(zgrep -c "MT" t.per-base.bed.gz) 2 assert_equal "MT 0 16569 0.00" "$(zgrep ^MT t.regions.bed.gz)" + +# fragment length filtering +run length_filter $exe t tests/ovl.bam --min-frag-len 80 --max-frag-len 80 +assert_exit_code 0 +assert_equal $(zgrep -c "MT" t.per-base.bed.gz) 2 +assert_equal "MT 0 80 1 MT 80 16569 0 " "$(zgrep ^MT t.per-base.bed.gz | tr -s '[:space:]' ' ')" + +run length_filter $exe t tests/ovl.bam --min-frag-len 81 +assert_exit_code 0 +assert_equal "MT 0 16569 0" "$(zgrep ^MT t.per-base.bed.gz)" + +run length_filter $exe t tests/ovl.bam --max-frag-len 79 +assert_exit_code 0 +assert_equal "MT 0 16569 0" "$(zgrep ^MT t.per-base.bed.gz)" + +run bad_frag_len_filter $exe t tests/ovl.bam --min-frag-len 10 --max-frag-len 9 +assert_in_stderr "--max-frag-len was lower than --min-frag-len." +assert_exit_code 1 + + unset MOSDEPTH_Q0 unset MOSDEPTH_Q1 unset MOSDEPTH_Q2