From 985b9303b0b378a39908fff3d225289aa893b8d2 Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Tue, 6 Oct 2020 15:31:15 -0500 Subject: [PATCH 1/4] updating vep version --- Dockerfile | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3b87fc0..185ef6b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,6 @@ -FROM ensemblorg/ensembl-vep:release_95.3 -MAINTAINER John Garza - -LABEL \ - description="Vep helper image" +FROM ensemblorg/ensembl-vep:release_101.0 +LABEL maintainer="John Garza " +LABEL description="Vep helper image" USER root From ec3ff688954b195f9cf443d8ad57c87a2c5529e1 Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Tue, 6 Oct 2020 15:45:39 -0500 Subject: [PATCH 2/4] removing unneeded scripts --- add_annotations_to_table_helper.py | 123 ----------------------------- docm_and_coding_indel_selection.pl | 81 ------------------- intervals_to_bed.pl | 10 --- single_sample_docm_filter.pl | 38 --------- 4 files changed, 252 deletions(-) delete mode 100644 add_annotations_to_table_helper.py delete mode 100755 docm_and_coding_indel_selection.pl delete mode 100644 intervals_to_bed.pl delete mode 100755 single_sample_docm_filter.pl diff --git a/add_annotations_to_table_helper.py b/add_annotations_to_table_helper.py deleted file mode 100644 index 8339dbc..0000000 --- a/add_annotations_to_table_helper.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python - -import sys -import os -import re -from cyvcf2 import VCF -import tempfile -import csv -import binascii - -def parse_csq_header(vcf_file): - for header in vcf_file.header_iter(): - info = header.info(extra=True) - if b'ID' in info.keys() and info[b'ID'] == b'CSQ': - format_pattern = re.compile('Format: (.*)"') - match = format_pattern.search(info[b'Description'].decode()) - return match.group(1).split('|') - -def parse_csq_entries(csq_entries, csq_fields): - transcripts = {} - for entry in csq_entries: - values = entry.split('|') - transcript = {} - for key, value in zip(csq_fields, values): - transcript[key] = value - if transcript['Allele'] not in transcripts.keys(): - transcripts[transcript['Allele']] = [] - transcripts[transcript['Allele']].append(transcript) - return transcripts - -def resolve_alleles(entry, csq_alleles): - alleles = {} - if entry.is_indel: - for alt in entry.ALT: - alt = str(alt) - if alt[0:1] != entry.REF[0:1]: - csq_allele = alt - elif alt[1:] == "": - csq_allele = '-' - else: - csq_allele = alt[1:] - alleles[alt] = csq_allele - elif entry.is_sv: - for alt in alts: - if len(alt) > len(entry.REF) and 'insertion' in csq_alleles: - alleles[alt] = 'insertion' - elif len(alt) < len(entry.REF) and 'deletion' in csq_alleles: - alleles[alt] = 'deletion' - elif len(csq_alleles) == 1: - alleles[alt] = list(csq_alleles)[0] - else: - for alt in entry.ALT: - alt = str(alt) - alleles[alt] = alt - return alleles - -def transcript_for_alt(transcripts, alt): - for transcript in transcripts[alt]: - if transcript['PICK'] == '1': - return transcript - return transcripts[alt][0] - -def decode_hex(string): - hex_string = string.group(0).replace('%', '') - return binascii.unhexlify(hex_string).decode('utf-8') - -(script, tsv_filename, vcf_filename, vep_fields, output_dir) = sys.argv -vep_fields_list = vep_fields.split(',') - -vcf_file = VCF(vcf_filename) - -csq_fields = parse_csq_header(vcf_file) - -vep = {} -for variant in vcf_file: - chr = str(variant.CHROM) - pos = str(variant.POS) - ref = str(variant.REF) - alts = variant.ALT - - if chr not in vep: - vep[chr] = {} - - if pos not in vep[chr]: - vep[chr][pos] = {} - - if ref not in vep[chr][pos]: - vep[chr][pos][ref] = {} - - csq = variant.INFO.get('CSQ') - if csq is not None: - transcripts = parse_csq_entries(csq.split(','), csq_fields) - alleles_dict = resolve_alleles(variant, transcripts.keys()) - for alt in alts: - if alt not in vep[chr][pos][ref]: - if transcripts is not None and alleles_dict[alt] in transcripts: - vep[chr][pos][ref][alt] = transcript_for_alt(transcripts, alleles_dict[alt]) - else: - vep[chr][pos][ref][alt] = None - else: - sys.exit("VEP entry for at CHR %s, POS %s, REF %s , ALT % already exists" % (chr, pos, ref, alt) ) - - -with open(tsv_filename, 'r') as input_filehandle: - reader = csv.DictReader(input_filehandle, delimiter = "\t") - output_filehandle = open(os.path.join(output_dir, 'variants.annotated.tsv'), 'w') - writer = csv.DictWriter(output_filehandle, fieldnames = reader.fieldnames + vep_fields_list, delimiter = "\t") - writer.writeheader() - for entry in reader: - row = entry - for field in vep_fields_list: - field_annotations = [] - for alt in entry['ALT'].split(','): - vep_annotations = vep[entry['CHROM']][entry['POS']][entry['REF']][alt] - if vep_annotations is not None and field in vep_annotations: - annotation = vep_annotations[field] - decoded_annotation = re.sub(r'%[0-9|A-F][0-9|A-F]', decode_hex, annotation) - field_annotations.append(decoded_annotation) - else: - field_annotations.append('-') - row[field] = ','.join(field_annotations) - writer.writerow(row) - output_filehandle.close() diff --git a/docm_and_coding_indel_selection.pl b/docm_and_coding_indel_selection.pl deleted file mode 100755 index a9393da..0000000 --- a/docm_and_coding_indel_selection.pl +++ /dev/null @@ -1,81 +0,0 @@ -#! /usr/bin/perl - -#Copyright (C) 2015 Feiyu Du -# and Washington University The Genome Institute - -#This script is distributed in the hope that it will be useful, -#but WITHOUT ANY WARRANTY or the implied warranty of -#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -#GNU General Public License for more details. - - -use strict; -use warnings; - -use feature qw(say); - -die "Provide vep annotated vcf and output_dir" unless @ARGV >= 2; -my ($annotated_vcf, $outdir, $filter_flag) = @ARGV; - -my @coding_categories = qw( - splice_acceptor_variant - splice_donor_variant - splice_region_variant - transcript_ablation - transcript_amplification - stop_retained_variant - stop_gained - stop_lost - start_lost - frameshift_variant - inframe_insertion - inframe_deletion - missense_variant - protein_altering_variant - initiator_codon_variant - incomplete_terminal_codon_variant - synonymous_variant - coding_sequence_variant -); - -open (my $annotated_vcf_fh, '-|', '/bin/gunzip', '-c', $annotated_vcf) - or die "couldn't open $annotated_vcf to read"; -open (my $annotated_filtered_vcf_fh, ">", "$outdir/annotated_filtered.vcf") - or die "couldn't open annotated_filtered.vcf to write"; - -while (<$annotated_vcf_fh>) { - chomp; - if (/^#/) { - say $annotated_filtered_vcf_fh $_; - } - else { - if ($filter_flag and $filter_flag eq 'filter') { - my @columns = split /\t/, $_; - my ($ref, $alt, $info) = map{$columns[$_]}(3, 4, 7); - my @alts = split /,/, $alt; - my ($caller) = $info =~ /;set=(\S+?);/; - - if (length($ref) == 1 and length($alts[0]) == 1) { #snvs - say $annotated_filtered_vcf_fh $_; - } - else { - if ($caller =~ /docm/) { - say $annotated_filtered_vcf_fh $_; - } - else { - my ($csq) = $info =~ /;CSQ=(\S+)/; - if (grep {$csq =~ /$_/}@coding_categories) { - say $annotated_filtered_vcf_fh $_; - } - } - } - } - else { - say $annotated_filtered_vcf_fh $_; - } - } -} - -close $annotated_vcf_fh; -close $annotated_filtered_vcf_fh; - diff --git a/intervals_to_bed.pl b/intervals_to_bed.pl deleted file mode 100644 index ce3121b..0000000 --- a/intervals_to_bed.pl +++ /dev/null @@ -1,10 +0,0 @@ -use feature qw(say); - -for my $line (<>) { - chomp $line; - - next if substr($line,0,1) eq '@'; #skip header lines - - my ($chrom, $start, $stop) = split(/\t/, $line); - say(join("\t", $chrom, $start-1, $stop)); -} diff --git a/single_sample_docm_filter.pl b/single_sample_docm_filter.pl deleted file mode 100755 index 98a02b5..0000000 --- a/single_sample_docm_filter.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use feature qw(say); - -die("wrong number of arguments") unless scalar(@ARGV) == 2; -my ($docm_out_vcf, $outdir) = @ARGV; - -open(my $docm_vcf_fh, $docm_out_vcf) - or die("couldn't open $docm_out_vcf to read"); -open(my $docm_filter_fh, ">", "$outdir/docm_filter_out.vcf") - or die("couldn't open docm_filter_out.vcf for write"); - -while (<$docm_vcf_fh>) { - chomp; - if (/^#/) { - say $docm_filter_fh $_; - } - else { - my @columns = split /\t/, $_; - my @info = split /:/, $columns[9]; - my ($AD, $DP) = ($info[1], $info[2]); - next unless $AD; - my @AD = split /,/, $AD; - shift @AD; #the first one is ref count - for my $ad (@AD) { - if ($ad > 5 and $ad/$DP > 0.01) { - say $docm_filter_fh $_; - last; - } - } - } -} - -close($docm_vcf_fh); -close($docm_filter_fh); From 24e7e117c058d287e57849780662ac98e752520a Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Tue, 6 Oct 2020 15:46:06 -0500 Subject: [PATCH 3/4] updating readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1b540f6..2a1dc24 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -This repo has moved to genome/docker-vep_helper-cwl +VEP plus accessory script for empty VCF handling From 1d4a5c18680b7d16456888d20be3b3d4305fe5ba Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Tue, 6 Oct 2020 15:47:38 -0500 Subject: [PATCH 4/4] removing unneeded files --- Dockerfile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 185ef6b..6579cdc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,10 +31,6 @@ https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/95/SpliceRegion.pm https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/95/dbNSFP.pm \ https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/95/dbNSFP_replacement_logic -COPY add_annotations_to_table_helper.py /usr/bin/add_annotations_to_table_helper.py -COPY docm_and_coding_indel_selection.pl /usr/bin/docm_and_coding_indel_selection.pl COPY vcf_check.pl /usr/bin/vcf_check.pl -COPY intervals_to_bed.pl /usr/bin/intervals_to_bed.pl -COPY single_sample_docm_filter.pl /usr/bin/single_sample_docm_filter.pl USER vep