Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: added per-read flag and map quality filtering #3

Open
wants to merge 1 commit into
base: 188/td/min-intron-length-init
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions src/cis-splice-effects/cis_splice_effects_identifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ void CisSpliceEffectsIdentifier::usage(ostream& out) {
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
out << "\t\t" << "-w INT\tWindow size in b.p to identify splicing events in.\n"
<< "\t\t\t " << "The tool identifies events in variant.start +/- w basepairs.\n"
<< "\t\t\t " << "Default behaviour is to look at the window between previous and next exons." << endl;
Expand Down Expand Up @@ -113,7 +116,7 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
optind = 1; //Reset before parsing again.
stringstream help_ss;
char c;
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:b:C")) != -1) {
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:f:F:q:b:C")) != -1) {
switch(c) {
case 'o':
output_file_ = string(optarg);
Expand Down Expand Up @@ -170,6 +173,15 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
case 'M':
max_intron_length_ = atoi(optarg);
break;
case 'f':
require_flags_ = atoi(optarg);
break;
case 'F':
filter_flags_ = atoi(optarg);
break;
case 'q':
min_map_qual_ = atoi(optarg);
break;
case 'b':
output_barcodes_file_ = string(optarg);
break;
Expand Down Expand Up @@ -285,7 +297,9 @@ void CisSpliceEffectsIdentifier::identify() {
} else {
ref_to_pass = "NA";
}
JunctionsExtractor je1(bam_, variant_region, strandness_, strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_, ref_to_pass);
JunctionsExtractor je1(bam_, variant_region, strandness_,
strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_,
filter_flags_, require_flags_, min_map_qual_, ref_to_pass);
je1.identify_junctions_from_BAM();
vector<Junction> junctions = je1.get_all_junctions();
//Add all the junctions to the unique set
Expand Down
11 changes: 10 additions & 1 deletion src/cis-splice-effects/cis_splice_effects_identifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,19 @@ class CisSpliceEffectsIdentifier {
//tag used in BAM to denote strand, default "XS"
string strand_tag_;
//Minimum anchor length for junctions
//Junctions need atleast this many bp overlap
//Junctions need at least this many bp overlap
// on both ends.
uint32_t min_anchor_length_;
//Minimum length of an intron, i.e min junction width
uint32_t min_intron_length_;
//Maximum length of an intron, i.e max junction width
uint32_t max_intron_length_;
//filter reads containing any of these flags
uint16_t filter_flags_;
// filter reads not containing all of these flags
uint16_t require_flags_;
// filter reads below the minimum mapping quality
uint8_t min_map_qual_;
//whether to override strand of extracted junctions using intron-motif method
bool override_strand_with_canonical_intron_motif_;
public:
Expand All @@ -114,6 +120,9 @@ class CisSpliceEffectsIdentifier {
min_anchor_length_(8),
min_intron_length_(70),
max_intron_length_(500000),
filter_flags_(0),
require_flags_(0),
min_map_qual_(0),
override_strand_with_canonical_intron_motif_(false) {}
//Destructor
~CisSpliceEffectsIdentifier() {
Expand Down
32 changes: 30 additions & 2 deletions src/junctions/junctions_extractor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
optind = 1; //Reset before parsing again.
int c;
stringstream help_ss;
while((c = getopt(argc, argv, "ha:m:M:o:r:t:s:b:")) != -1) {
while((c = getopt(argc, argv, "ha:m:M:f:F:q:o:r:t:s:b:")) != -1) {
switch(c) {
case 'h':
usage(help_ss);
Expand All @@ -57,6 +57,15 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
case 'M':
max_intron_length_ = atoi(optarg);
break;
case 'f':
require_flags_ = atoi(optarg);
break;
case 'F':
filter_flags_ = atoi(optarg);
break;
case 'q':
min_map_qual_ = atoi(optarg);
break;
case 'o':
output_file_ = string(optarg);
break;
Expand Down Expand Up @@ -108,10 +117,17 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
throw runtime_error("Strandness mode 'intron-motif' requires a fasta file!\n\n");
}
}
if ( (require_flags_ & filter_flags_) != 0) {
usage();
throw runtime_error("No overlap allowed between '-f' and '-F' options (same flag filtered and required)!\n\n");
}

cerr << "Minimum junction anchor length: " << min_anchor_length_ << endl;
cerr << "Minimum intron length: " << min_intron_length_ << endl;
cerr << "Maximum intron length: " << max_intron_length_ << endl;
cerr << "Require alignment flags: " << require_flags_ << endl;
cerr << "Filter alignment flags: " << filter_flags_ << endl;
cerr << "Minimum alignment mapping quality: " << int(min_map_qual_) << endl;
cerr << "Alignment: " << bam_ << endl;
cerr << "Output file: " << output_file_ << endl;
if (output_barcodes_file_ != "NA"){
Expand All @@ -130,6 +146,9 @@ int JunctionsExtractor::usage(ostream& out) {
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl;
out << "\t\t" << "-r STR\tThe region to identify junctions \n"
<< "\t\t\t " << "in \"chr:start-end\" format. Entire BAM by default." << endl;
Expand Down Expand Up @@ -358,7 +377,7 @@ void JunctionsExtractor::set_junction_strand(bam1_t *aln, Junction& j1, string i
return;
}

//Get the the barcode
//Get the barcode
void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
uint8_t *p = bam_aux_get(aln, barcode_tag_.c_str());
if(p != NULL) {
Expand All @@ -373,6 +392,14 @@ void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
}
}

//Filters alignments based on filtering flags and mapping quality
bool JunctionsExtractor::filter_alignment(bam_hdr_t *header, bam1_t *aln) {
if ((aln->core.flag & filter_flags_) != 0) return true;
if ((aln->core.flag | require_flags_) != aln->core.flag) return true;
if (aln->core.qual < min_map_qual_) return true;
return false;
}

//Parse junctions from the read and store in junction map
int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t *aln) {
int n_cigar = aln->core.n_cigar;
Expand Down Expand Up @@ -523,6 +550,7 @@ int JunctionsExtractor::identify_junctions_from_BAM() {
//Initiate the alignment record
bam1_t *aln = bam_init1();
while(sam_itr_next(in, iter, aln) >= 0) {
if (filter_alignment(header, aln)) continue;
parse_alignment_into_junctions(header, aln);
}
hts_itr_destroy(iter);
Expand Down
19 changes: 18 additions & 1 deletion src/junctions/junctions_extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,12 +180,21 @@ class JunctionsExtractor {
string strand_tag_;
//tag used in BAM to denote single cell barcode
string barcode_tag_;
//filter reads containing any of these flags
uint16_t filter_flags_;
// filter reads not containing all of these flags
uint16_t require_flags_;
// filter reads below the minimum mapping quality
uint8_t min_map_qual_;
public:
//Default constructor
JunctionsExtractor() {
min_anchor_length_ = 8;
min_intron_length_ = 70;
max_intron_length_ = 500000;
filter_flags_ = 0;
require_flags_ = 0;
min_map_qual_ = 0;
junctions_sorted_ = false;
strandness_ = -1;
strand_tag_ = "XS";
Expand All @@ -204,14 +213,20 @@ class JunctionsExtractor {
uint32_t min_anchor_length1,
uint32_t min_intron_length1,
uint32_t max_intron_length1,
uint16_t filter_flags,
uint16_t require_flags,
uint8_t min_map_qual,
string ref1) :
bam_(bam1),
region_(region1),
strandness_(strandness1),
strand_tag_(strand_tag1),
min_anchor_length_(min_anchor_length1),
min_intron_length_(min_intron_length1),
min_intron_length_(min_anchor_length1),
max_intron_length_(max_intron_length1),
filter_flags_(filter_flags),
require_flags_(require_flags),
min_map_qual_(min_map_qual),
ref_(ref1) {
junctions_sorted_ = false;
output_file_ = "NA";
Expand Down Expand Up @@ -241,6 +256,8 @@ class JunctionsExtractor {
void create_junctions_vector();
//Pull out the cigar string from the read
int parse_read(bam_hdr_t *header, bam1_t *aln);
//Returns whether alignment should be filtered from junction analysis
bool filter_alignment(bam_hdr_t *header, bam1_t *aln);
//Parse junctions from the read and store in junction map
int parse_cigar_into_junctions(string chr, int read_pos,
uint32_t *cigar, int n_cigar);
Expand Down