From 985b9303b0b378a39908fff3d225289aa893b8d2 Mon Sep 17 00:00:00 2001
From: Chris Miller <cmiller@genome.wustl.edu>
Date: Tue, 6 Oct 2020 15:31:15 -0500
Subject: [PATCH 1/4] updating vep version

---
 Dockerfile | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 3b87fc0..185ef6b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,6 @@
-FROM ensemblorg/ensembl-vep:release_95.3
-MAINTAINER John Garza <johnegarza@wustl.edu>
-
-LABEL \
-    description="Vep helper image"
+FROM ensemblorg/ensembl-vep:release_101.0
+LABEL maintainer="John Garza <johnegarza@wustl.edu>"
+LABEL description="Vep helper image"
 
 USER root
 

From ec3ff688954b195f9cf443d8ad57c87a2c5529e1 Mon Sep 17 00:00:00 2001
From: Chris Miller <cmiller@genome.wustl.edu>
Date: Tue, 6 Oct 2020 15:45:39 -0500
Subject: [PATCH 2/4] removing unneeded scripts

---
 add_annotations_to_table_helper.py | 123 -----------------------------
 docm_and_coding_indel_selection.pl |  81 -------------------
 intervals_to_bed.pl                |  10 ---
 single_sample_docm_filter.pl       |  38 ---------
 4 files changed, 252 deletions(-)
 delete mode 100644 add_annotations_to_table_helper.py
 delete mode 100755 docm_and_coding_indel_selection.pl
 delete mode 100644 intervals_to_bed.pl
 delete mode 100755 single_sample_docm_filter.pl

diff --git a/add_annotations_to_table_helper.py b/add_annotations_to_table_helper.py
deleted file mode 100644
index 8339dbc..0000000
--- a/add_annotations_to_table_helper.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import os
-import re
-from cyvcf2 import VCF
-import tempfile
-import csv
-import binascii
-
-def parse_csq_header(vcf_file):
-    for header in vcf_file.header_iter():
-        info = header.info(extra=True)
-        if b'ID' in info.keys() and info[b'ID'] == b'CSQ':
-            format_pattern = re.compile('Format: (.*)"')
-            match = format_pattern.search(info[b'Description'].decode())
-            return match.group(1).split('|')
-
-def parse_csq_entries(csq_entries, csq_fields):
-    transcripts = {}
-    for entry in csq_entries:
-        values = entry.split('|')
-        transcript = {}
-        for key, value in zip(csq_fields, values):
-            transcript[key] = value
-        if transcript['Allele'] not in transcripts.keys():
-            transcripts[transcript['Allele']] = []
-        transcripts[transcript['Allele']].append(transcript)
-    return transcripts
-
-def resolve_alleles(entry, csq_alleles):
-    alleles = {}
-    if entry.is_indel:
-        for alt in entry.ALT:
-            alt = str(alt)
-            if alt[0:1] != entry.REF[0:1]:
-                csq_allele = alt
-            elif alt[1:] == "":
-                csq_allele = '-'
-            else:
-                csq_allele = alt[1:]
-            alleles[alt] = csq_allele
-    elif entry.is_sv:
-        for alt in alts:
-            if len(alt) > len(entry.REF) and 'insertion' in csq_alleles:
-                alleles[alt] = 'insertion'
-            elif len(alt) < len(entry.REF) and 'deletion' in csq_alleles:
-                alleles[alt] = 'deletion'
-            elif len(csq_alleles) == 1:
-                alleles[alt] = list(csq_alleles)[0]
-    else:
-        for alt in entry.ALT:
-            alt = str(alt)
-            alleles[alt] = alt
-    return alleles
-
-def transcript_for_alt(transcripts, alt):
-    for transcript in transcripts[alt]:
-        if transcript['PICK'] == '1':
-            return transcript
-    return transcripts[alt][0]
-
-def decode_hex(string):
-    hex_string = string.group(0).replace('%', '')
-    return binascii.unhexlify(hex_string).decode('utf-8')
-
-(script, tsv_filename, vcf_filename, vep_fields, output_dir) = sys.argv
-vep_fields_list = vep_fields.split(',')
-
-vcf_file = VCF(vcf_filename)
-
-csq_fields = parse_csq_header(vcf_file)
-
-vep = {}
-for variant in vcf_file:
-    chr = str(variant.CHROM)
-    pos = str(variant.POS)
-    ref = str(variant.REF)
-    alts = variant.ALT
-
-    if chr not in vep:
-        vep[chr] = {}
-
-    if pos not in vep[chr]:
-        vep[chr][pos] = {}
-
-    if ref not in vep[chr][pos]:
-        vep[chr][pos][ref] = {}
-
-    csq = variant.INFO.get('CSQ')
-    if csq is not None:
-        transcripts = parse_csq_entries(csq.split(','), csq_fields)
-    alleles_dict = resolve_alleles(variant, transcripts.keys())
-    for alt in alts:
-        if alt not in vep[chr][pos][ref]:
-            if transcripts is not None and alleles_dict[alt] in transcripts:
-                vep[chr][pos][ref][alt] = transcript_for_alt(transcripts, alleles_dict[alt])
-            else:
-                vep[chr][pos][ref][alt] = None
-        else:
-            sys.exit("VEP entry for at CHR %s, POS %s, REF %s , ALT % already exists" % (chr, pos, ref, alt) )
-
-
-with open(tsv_filename, 'r') as input_filehandle:
-    reader = csv.DictReader(input_filehandle, delimiter = "\t")
-    output_filehandle = open(os.path.join(output_dir, 'variants.annotated.tsv'), 'w')
-    writer = csv.DictWriter(output_filehandle, fieldnames = reader.fieldnames + vep_fields_list, delimiter = "\t")
-    writer.writeheader()
-    for entry in reader:
-        row = entry
-        for field in vep_fields_list:
-            field_annotations = []
-            for alt in entry['ALT'].split(','):
-                vep_annotations = vep[entry['CHROM']][entry['POS']][entry['REF']][alt]
-                if vep_annotations is not None and field in vep_annotations:
-                    annotation = vep_annotations[field]
-                    decoded_annotation = re.sub(r'%[0-9|A-F][0-9|A-F]', decode_hex, annotation)
-                    field_annotations.append(decoded_annotation)
-                else:
-                    field_annotations.append('-')
-            row[field] = ','.join(field_annotations)
-        writer.writerow(row)
-    output_filehandle.close()
diff --git a/docm_and_coding_indel_selection.pl b/docm_and_coding_indel_selection.pl
deleted file mode 100755
index a9393da..0000000
--- a/docm_and_coding_indel_selection.pl
+++ /dev/null
@@ -1,81 +0,0 @@
-#! /usr/bin/perl
-
-#Copyright (C) 2015 Feiyu Du <fdu@genome.wustl.edu>
-#              and Washington University The Genome Institute
-
-#This script is distributed in the hope that it will be useful, 
-#but WITHOUT ANY WARRANTY or the implied warranty of 
-#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
-#GNU General Public License for more details.
-
-
-use strict;
-use warnings;
-
-use feature qw(say);
-
-die "Provide vep annotated vcf and output_dir" unless @ARGV >= 2;
-my ($annotated_vcf, $outdir, $filter_flag) = @ARGV;
-
-my @coding_categories = qw(
-    splice_acceptor_variant
-    splice_donor_variant
-    splice_region_variant
-    transcript_ablation
-    transcript_amplification
-    stop_retained_variant
-    stop_gained
-    stop_lost
-    start_lost
-    frameshift_variant
-    inframe_insertion
-    inframe_deletion
-    missense_variant
-    protein_altering_variant
-    initiator_codon_variant
-    incomplete_terminal_codon_variant
-    synonymous_variant
-    coding_sequence_variant
-);
-
-open (my $annotated_vcf_fh, '-|', '/bin/gunzip', '-c', $annotated_vcf) 
-    or die "couldn't open $annotated_vcf to read";
-open (my $annotated_filtered_vcf_fh, ">", "$outdir/annotated_filtered.vcf")
-    or die "couldn't open annotated_filtered.vcf to write";
-
-while (<$annotated_vcf_fh>) {
-    chomp;
-    if (/^#/) {
-        say $annotated_filtered_vcf_fh $_;
-    }
-    else {
-        if ($filter_flag and $filter_flag eq 'filter') {
-            my @columns = split /\t/, $_;
-            my ($ref, $alt, $info) = map{$columns[$_]}(3, 4, 7);
-            my @alts = split /,/, $alt;
-            my ($caller) = $info =~ /;set=(\S+?);/;
-
-            if (length($ref) == 1 and length($alts[0]) == 1) { #snvs
-                say $annotated_filtered_vcf_fh $_;
-            }
-            else {
-                if ($caller =~ /docm/) {
-                    say $annotated_filtered_vcf_fh $_;
-                }
-                else {
-                    my ($csq) = $info =~ /;CSQ=(\S+)/;
-                    if (grep {$csq =~ /$_/}@coding_categories) {
-                        say $annotated_filtered_vcf_fh $_;
-                    }
-                }
-            }
-        }
-        else {
-            say $annotated_filtered_vcf_fh $_;
-        }
-    }
-}
-
-close $annotated_vcf_fh;
-close $annotated_filtered_vcf_fh;
-
diff --git a/intervals_to_bed.pl b/intervals_to_bed.pl
deleted file mode 100644
index ce3121b..0000000
--- a/intervals_to_bed.pl
+++ /dev/null
@@ -1,10 +0,0 @@
-use feature qw(say);
-
-for my $line (<>) {
-    chomp $line;
-
-    next if substr($line,0,1) eq '@'; #skip header lines
-
-    my ($chrom, $start, $stop) = split(/\t/, $line);
-    say(join("\t", $chrom, $start-1, $stop));
-}
diff --git a/single_sample_docm_filter.pl b/single_sample_docm_filter.pl
deleted file mode 100755
index 98a02b5..0000000
--- a/single_sample_docm_filter.pl
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-use warnings;
-
-use feature qw(say);
-
-die("wrong number of arguments") unless scalar(@ARGV) == 2;
-my ($docm_out_vcf, $outdir) = @ARGV;
-
-open(my $docm_vcf_fh, $docm_out_vcf)
-    or die("couldn't open $docm_out_vcf to read");
-open(my $docm_filter_fh, ">", "$outdir/docm_filter_out.vcf")
-    or die("couldn't open docm_filter_out.vcf for write");
-
-while (<$docm_vcf_fh>) {
-    chomp;
-    if (/^#/) {
-        say $docm_filter_fh $_;
-    }
-    else {
-        my @columns = split /\t/, $_;
-        my @info = split /:/, $columns[9];
-        my ($AD, $DP) = ($info[1], $info[2]);
-        next unless $AD;
-        my @AD = split /,/, $AD;
-        shift @AD; #the first one is ref count
-        for my $ad (@AD) {
-            if ($ad > 5 and $ad/$DP > 0.01) {
-                say $docm_filter_fh $_;
-                last;
-            }
-        }
-    }
-}
-
-close($docm_vcf_fh);
-close($docm_filter_fh);

From 24e7e117c058d287e57849780662ac98e752520a Mon Sep 17 00:00:00 2001
From: Chris Miller <cmiller@genome.wustl.edu>
Date: Tue, 6 Oct 2020 15:46:06 -0500
Subject: [PATCH 3/4] updating readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1b540f6..2a1dc24 100644
--- a/README.md
+++ b/README.md
@@ -1 +1 @@
-This repo has moved to genome/docker-vep_helper-cwl
+VEP plus accessory script for empty VCF handling

From 1d4a5c18680b7d16456888d20be3b3d4305fe5ba Mon Sep 17 00:00:00 2001
From: Chris Miller <cmiller@genome.wustl.edu>
Date: Tue, 6 Oct 2020 15:47:38 -0500
Subject: [PATCH 4/4] removing unneeded files

---
 Dockerfile | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 185ef6b..6579cdc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -31,10 +31,6 @@ https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/95/SpliceRegion.pm
 https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/95/dbNSFP.pm \
 https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/95/dbNSFP_replacement_logic
 
-COPY add_annotations_to_table_helper.py /usr/bin/add_annotations_to_table_helper.py
-COPY docm_and_coding_indel_selection.pl /usr/bin/docm_and_coding_indel_selection.pl
 COPY vcf_check.pl /usr/bin/vcf_check.pl
-COPY intervals_to_bed.pl /usr/bin/intervals_to_bed.pl
-COPY single_sample_docm_filter.pl /usr/bin/single_sample_docm_filter.pl
 
 USER vep