From b13536577ae7b553966dbaba665e72a2a7d94822 Mon Sep 17 00:00:00 2001 From: Rose Orenbuch Date: Fri, 21 Jun 2019 20:31:58 +0000 Subject: [PATCH 1/6] Added all IMGT/HLA github versions for reference module --- dat/info/parameters.p | Bin 1146 -> 2576 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/dat/info/parameters.p b/dat/info/parameters.p index 5520e6a11919ee04e6678c976424720130e2490a..4c2a79c8335528184011933b663de47b60e30cff 100644 GIT binary patch literal 2576 zcmXw)S({w75rz$BH3Y~;0$E6aB+fRdw3f2L4Eq*fvUL{6lu8nAFcZsT=Pu8aoBZ#5 zQun#i>C@*`m87>y|Gj!+T)D30{{Hs8{l(?A%XaVDc;Lo(@cQn7D_5?(J+AH^!uZDd zSr`wkS7+nl?UfsktXJ=iN7wklcx?NN#^bxI6a46GTsyq{$H!9|PaYmW#^YBO z{myvmfSjL>r`Mae#xrYtb9{A;uaB>-@pOEBjqi_d9EcB3$2X7RPRFyyi4pEw$G^|U zx7+{5bLZpvTUgt7uJ0a3YIpAL|8;kKclQ{cZ|yJd$i-gA_jXU?=Y`zc|GV9kFWcR{ z%EkEp?peH*%e`FOyd%2rd%d~8ye${C-5oEi*~)FvFUAjckK;p?`?@|JKSbCU&&Nyu z8807#x)XC^#*cQ_@mZ0e#Z?nD6^9?jdacrhvaOt@6US~?^7!$J;7$nWPY!6eAvxoF z^Ij+!E!D9l&niN`k)QK+E@O*of&O%%L7hm6t%)hoaR?OIO6|sG$v#!0>M2?pKRZT6 z%H!t?ZH283b6(R*uA~|e-7uCQg}bZvzs$(BZxy7pm?Xaq_NO-nYrqCYLKxFcQE?_pO11^9F6!_{l!1mr z!j(DYR!njwaX<`8&NmUOo`UzKhG3(2BOcKVG%Yk+?=%KmUCZ7|Z`HY+#Ppn+s^jKj zD9O>_-6JpvN=%*dV(r{Q->|!?HpG@2bA$`caF0zw>FlbuGJdtO)2Ck_SIf0CSF9{> z#iJ7Y+OjFh2DMa5XxdyFzuDL+)NdE7ENn_`$gxW6QcTAZyG=q6%`_@(LP5^I+t_K; ztBY-Qv4mui0~Su9hmcaj@rCi)#!jV97n@XkE7n1&$n43_UBd|(VmuXP z-xP_T|?jG5i z=n}cuQzEOzM>W-HE`GeVvD2!z7b{h;%P52tvC2}&%-t)|XX-gRL_{WD>Ud{kr&jMC z;<_GUGKs5?&_G;o8lxy00Y%lM#vz+v#(Nt(y?TGKqzQ>aOB|#zj$HSVG%_}+t7u7z zsXdwT!I8Dos}C0ou6ISrrW6|c94J$s7JE_DU+OieBrW6j8#}%F=)kth&c)`PF*cy^ znlly_THnd%79BKkxrFiY#!jz3S*&AnY;3aJguStat3`YVKhOs>qp~8QpvU>fPOtv3 zSQBbC?6i4Xa8L<``HlueE_d5OD8YWt3kA*EjVvEW)}#K!4q^ls#HS4 zF{NgF8-L!|>D6Bjab5Niy|sDt)r>~!r)D9si#gPSkeX%_8oQ01Uj22k>M|RY^7zNb zPOok()=NOsL1)J9qQfxuCpRpavCn8!Eyog|n~r~O?DXp9VzaNMLJy+}G0=x0T4-2% z@!rzx^ymi7%J}riy6Kf1ayFHcL}Xq|!F`1?kG4WX#}VDRkU%c(d6bQvUg=_S<%kV9 zk6ux+l%NP!kPPxM0lqStwe=;By0O!%wpbL4$+%##CDFh|)Qp=qS&AL?-$^Xtq;nm8 zW2aZ2E!GwEMTT~@QI^WcAB{XZT#6`2iqu*pVjj0Pc6zlxtk?Qv1$SVwF);2k!L4P{ zuAJ?v96FRVR%v{`vD2$B78|W`h^dY;>IGlbB1o||pR1){2zQ1-j~=%-c6#;YVsUvo f$sIM{S#Tk#uiaaX(sPH72{qNQVP#z0Zzul;*CJv3 literal 1146 zcmX}qNpoB^5CC8YCm}IQSi?@(88#!ecDAZ9Yq1o>r@5qD-7Tr|3NyGpHaDuc@V{wm z=5dmqx<9qlPyg&cki7-d<#c*6tv3goc5xur9>^n$)&Abz-Z#H4Wq)=3YIb~N<@)NH zPJfacV?2_ZV_eFkWBgujZBy?rt=!)FUzTz(fB=m${- zXy}~}RDrX#zy)0<=siGB-qaX-OY+`qo!a+@b;fq<$#|m)wpQ$p97D~3r5l@(*&&-O z|JU_uKNyxf#Y(QW)E%alV$Ipd1jz<(dpFFv6Qh+6=ej}dN5ev>ft+XKY%q|JTLD|F zB}XNh9=nOQvPJoLwo&aT!xj@VF#t9yP%Rc;dLUmLC5#4(D+$11m&4g6wVw`~ID}ja zaP+o>z|D5!sJYxqFd;GfRBLM}X0}uFadANCQ!bQpvdL_APCI1 zsCC=>*67JPJs|rYG4>ECQcbyaQ#q0CIh35d_?>k|Z5UQFtisx<5^29<2oY*b-sR}L zA&V`g+>6B7TD57|9IelkBgYa0nA9_b+(6H4YK4v(vROMWdA3*Ystl`LF4 Date: Fri, 21 Jun 2019 20:42:10 +0000 Subject: [PATCH 2/6] Removed IMGT/HLA versions missing hla.dat (3.0.0, 3.1.0, 3.2.0) --- dat/info/parameters.p | Bin 2576 -> 2399 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/dat/info/parameters.p b/dat/info/parameters.p index 4c2a79c8335528184011933b663de47b60e30cff..51402b3850532799aa046e800fa69309b82cc554 100644 GIT binary patch delta 643 zcmXxexiW)c7>41Hg%tS;lUj!eK1u%Uh<%GCb`iTs7#%Yd4nUzY^UNp}3bj!<0EGk4 zX_#>U3I||3ule8hYVP|c{Mqt&*5G$+?6eJ$MjL7``#m0yW0RB)R4EZxQj|{IQKGJ- zDP4G{bi0zF^dRiDA?r$((u)!$=1Ptd$FvLrU^Kj1$s`lFX0dfi$Kh_v82?O}Lbh z`$@D0teH}h`)O22MJ2hP!7XW4N$%(HMw(ZW`w~o(H492|zlb7fNlEUPaYb5DlKWM> zkk*vsejUH0vXb0yAm40F#U)4XH*rDQQj+^^Jdt*k!Lr4@n00j>~K}AJLK|#-V6rXZ_n$P;%zqY^I@#%U0wN`q(do=9zdR55kDs6OD z7gdT6ZC%}jE|}6bz6_o5d8d6S$9qNt{GKX<>E4ZLXR{*Ju4|nOrqFfOq~03m>ky1< zjpK7WUyX-xYBAl%)1sHvX$V!`wn0~U>a)z`-iD^rwhW~!8e<2J?i0Im1c z^uZ_&GDDvsDoO@9M1LSUN)9nJoKVv?N&!i!hPWsmlF~Et>8I!w=(G^G zpP{dyvqBtyj{bqp3vvAgG6R)djAXffk=8($gt-1PeFR+*;`$}}4Z14C^~&^aivVDdqY=-$0cR z*VokFqLN04>sy)wbwXVKfL?+g3UU1-`T}|^#Pv_;FX*Wd*RRpcRyFzTMXUY;4;`=> From 116b5a0979ab5e77a04a6bbab23b2fe9a86ee64d Mon Sep 17 00:00:00 2001 From: Rose Orenbuch Date: Mon, 24 Jun 2019 20:43:59 +0000 Subject: [PATCH 3/6] Changed merge output to table and added gene count table. --- scripts/merge.py | 88 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/scripts/merge.py b/scripts/merge.py index 95290e4..f3a7394 100644 --- a/scripts/merge.py +++ b/scripts/merge.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- #------------------------------------------------------------------------------- -# merge.py: merges genotype files into a single json or table. +# merge.py: merges genotype files into a single table. #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- @@ -25,7 +25,9 @@ import os import json import argparse +import pandas as pd +from collections import defaultdict from argparse import RawTextHelpFormatter from arcas_utilities import check_path @@ -39,20 +41,21 @@ def get_paths(indir): '''Get all file paths that match arcasHLA output.''' partial_files = [] genotype_files = [] + gene_count_files = [] for file in os.listdir(indir): if file.endswith('.partial_genotype.json'): partial_files.append(file) elif file.endswith('.genotype.json'): genotype_files.append(file) + elif file.endswith('.genes.json'): + gene_count_files.append(file) - return genotype_files, partial_files + return genotype_files, partial_files, gene_count_files -def process_json(json_files, indir, outdir, run, suffix): - '''Merge genotype.json files.''' - file_out = [outdir] - if run: file_out.extend([run,'.']) - file_out.append(suffix) +def process_genotype(json_files, indir, outdir, run, suffix): + '''Merge genotype.json files into single tsv.''' + file_out = ''.join([outdir, run, suffix]) genotypes = dict() for file in json_files: @@ -61,9 +64,45 @@ def process_json(json_files, indir, outdir, run, suffix): with open(file_path,'r') as file: genotypes[sample] = json.load(file) + + genotypes_ = defaultdict(dict) + for sample, genotype in genotypes.items(): + for gene, alleles in genotype.items(): + if len(alleles) == 2: + genotypes_[sample][gene + '1'] = alleles[0] + genotypes_[sample][gene + '2'] = alleles[1] + else: + genotypes_[sample][gene + '1'] = alleles[0] + genotypes_[sample][gene + '2'] = alleles[0] + + pd.DataFrame(genotypes_).T.rename_axis('subject').to_csv(file_out + '.tsv', sep = '\t') + +def process_count(count_files, indir, outdir, run, suffix): + '''Merge gene counts into single tsv.''' + file_out = ''.join([outdir, run, suffix]) + + all_counts = [] + for file in count_files: + sample = file.split('.')[0] + file_path = indir + file + + with open(file_path, 'r') as file: + lines = file.read() + + lines = lines.split('-'*80)[2].split('\n') + counts = {'subject':sample} + for line in lines: + if line.startswith('[alignment] Processed '): + _,_,counts['total_count'],_,counts['aligned_count'],_,_,_,_ = line.split() + elif line.endswith(' reads mapped to a single HLA gene'): + counts['single_count'] = line.split()[1] + elif line.startswith('\t\tHLA-'): + line = line.split() + gene = line[0].split('-')[1] + counts[gene + '_read_count'] = line[2] + all_counts.append(counts) - with open(''.join(file_out), 'w') as file: - json.dump(genotypes, file) + pd.DataFrame(all_counts).set_index('subject').to_csv(file_out, sep = '\t') if __name__ == '__main__': @@ -106,22 +145,31 @@ def process_json(json_files, indir, outdir, run, suffix): args = parser.parse_args() + if args.run: args.run += '.' + indir, outdir = [check_path(path) for path in [args.indir, args.outdir]] - genotype_files, partial_files = get_paths(indir) + genotype_files, partial_files, gene_count_files = get_paths(indir) if genotype_files: - process_json(genotype_files, - indir, - outdir, - args.run, - 'genotypes.json') + process_genotype(genotype_files, + indir, + outdir, + args.run, + 'genotypes') if partial_files: - process_json(partial_files, - indir, - outdir, - args.run, - 'partial_genotypes.json') + process_genotype(partial_files, + indir, + outdir, + args.run, + 'partial_genotypes') + + if gene_count_files: + process_count(gene_count_files, + indir, + outdir, + args.run, + 'genes') #------------------------------------------------------------------------------- From 3205d1a1aabdf4f0fd34a43acc703d856cd81f3a Mon Sep 17 00:00:00 2001 From: Rose Orenbuch Date: Mon, 24 Jun 2019 20:44:26 +0000 Subject: [PATCH 4/6] Added allele nomenclature conversion. --- scripts/convert.py | 193 +++++++++++++++++++++++++++++++++++++++++++ scripts/reference.py | 103 +++++++++++++++++++---- 2 files changed, 279 insertions(+), 17 deletions(-) create mode 100644 scripts/convert.py diff --git a/scripts/convert.py b/scripts/convert.py new file mode 100644 index 0000000..1d8194d --- /dev/null +++ b/scripts/convert.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +#------------------------------------------------------------------------------- +# convert.py: changes HLA resolution and converts nomenclature to p/g-groups. +#------------------------------------------------------------------------------- + +#------------------------------------------------------------------------------- +# This file is part of arcasHLA. +# +# arcasHLA is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# arcasHLA is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with arcasHLA. If not, see . +#------------------------------------------------------------------------------- + +import sys +import os +import json +import argparse +import pandas as pd +import pickle + +from argparse import RawTextHelpFormatter +from os.path import isfile, isdir, dirname, realpath + +from arcas_utilities import check_path, process_allele + +#------------------------------------------------------------------------------- + +__version__ = '0.2' +__date__ = '2019-04-02' + +#------------------------------------------------------------------------------- +# Paths and fileames +#------------------------------------------------------------------------------- + +rootDir = dirname(realpath(__file__)) + '/../' + +hla_convert = rootDir + 'dat/ref/hla.convert.p' + +#------------------------------------------------------------------------------- + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(prog='arcasHLA convert', + usage='%(prog)s [options]', + add_help=False, + formatter_class=RawTextHelpFormatter) + + parser.add_argument('file', + help='tsv containing HLA genotypes, see github for ' \ + + example file structure.\n\n', + type=str) + + parser.add_argument('-h', + '--help', + action = 'help', + help='show this help message and exit\n\n', + default=argparse.SUPPRESS) + + parser.add_argument('-o', + '--outfile', + type=str, + help='output file\n ' \ + + 'default: ./file_basename.resolution.tsv\n\n', + default='', + metavar='') + + parser.add_argument('--resolution', + help = 'output resolution (1,2,3,4) or grouping ' \ + + '(g-group, p-group)\n\n', + default='2', + metavar='') + + parser.add_argument('--force', + help = 'force conversion for grouped alleles even if ' \ + + 'it results in loss of resolution', + action = 'count', + default=False) + + parser.add_argument('-v', + '--verbose', + action = 'count', + default=False) + + args = parser.parse_args() + + p_group, g_group = pickle.load(open(hla_convert,'rb')) + + + # Check input resolution + accepted_fields = {'1','2','3','4'} + accepted_groupings = {'g-group','p-group'} + + resolution = None + + if args.resolution in accepted_fields: + resolution = int(args.resolution) + elif args.resolution.lower() in accepted_groupings: + resolution = args.resolution.lower() + + if not resolution: + sys.exit('[convert] Error: output resolution is needed ' \ + + '(1, 2, 3, g-group, p-group).') + + + # Create outfile name + if not args.outfile: + outfile = [os.path.splitext(os.path.basename(args.file))[0], + args.resolution.lower(), + 'tsv'] + outfile = '.'.join(outfile) + else: + outfile = args.outfile + + # Load input genotypes + df_genotypes = pd.read_csv(args.file, sep = '\t').set_index('subject') + genotypes = df_genotypes.to_dict('index') + + + for subject, genotype in genotypes.items(): + for gene, allele in genotype.items(): + if type(allele) != str: + continue + + i = len(allele.split(':')) + + # If input allele is in P-group nomenclature, it can only be + # converted to 1 field without loss + if allele[-1] == 'P': + if resolution == 'g-group': + sys.exit('[convert] Error: p-group cannot be converted ' \ + + 'to g-group.') + + elif type(resolution) == int and resolution > 1: + if not args.force: + sys.exit('[convert] Error: p-group cannot be ' \ + + 'converted to %.0f fields.' %resolution) + allele = process_allele(allele[:-1], resolution) + + elif type(resolution) == int: + allele = process_allele(allele, resolution) + + # If input allele is in G-group nomenclature, it can only be + # converted to 1 field without loss + elif allele[-1] == 'G': + + if type(resolution) == int and resolution > 1: + if not args.force: + sys.exit('[convert] Error: g-group cannot be converted'\ + + 'to %.0f fields.' %resolution) + allele = process_allele(allele[:-1], resolution) + + elif resolution == 'p-group': + if allele[:-1] in p_group[i]: + allele = p_group[i][allele[:-1]] + + elif process_allele(allele[:-1], i - 1) in p_group[i]: + allele = p_group[i][process_allele(allele[:-1], i -1)] + + elif type(resolution) == int: + allele = process_allele(allele, resolution) + + # If allele not in G-group and not null, reduce resolution + # to 3-fields + elif resolution == 'g-group': + if allele in g_group[i]: + allele = g_group[i][allele] + elif allele[-1] != 'N': + allele = process_allele(allele,3) + + elif resolution == 'p-group': + if allele in p_group[i]: + allele = p_group[i][allele] + + elif type(resolution) == int: + allele = process_allele(allele, resolution) + + genotypes[subject][gene] = allele + + pd.DataFrame(genotypes).T.rename_axis('subject').to_csv(outfile, sep = '\t') + +#------------------------------------------------------------------------------- \ No newline at end of file diff --git a/scripts/reference.py b/scripts/reference.py index 2b2c3e7..45b107d 100644 --- a/scripts/reference.py +++ b/scripts/reference.py @@ -47,7 +47,7 @@ __date__ = '2019-05-07' #------------------------------------------------------------------------------- -# Paths and filenames +# Paths and fileames #------------------------------------------------------------------------------- rootDir = dirname(realpath(__file__)) + '/../' @@ -55,6 +55,9 @@ IMGTHLA = rootDir + 'dat/IMGTHLA/' IMGTHLA_git = 'https://github.com/ANHIG/IMGTHLA.git' hla_dat = rootDir + 'dat/IMGTHLA/hla.dat' +hla_nom_g = rootDir + 'dat/IMGTHLA/wmda/hla_nom_g.txt' +hla_nom_p = rootDir + 'dat/IMGTHLA/wmda/hla_nom_p.txt' +hla_convert = rootDir + 'dat/ref/hla.convert.p' hla_fa = rootDir + 'dat/ref/hla.fasta' partial_fa = rootDir + 'dat/ref/hla_partial.fasta' hla_p = rootDir + 'dat/ref/hla.p' @@ -74,8 +77,7 @@ def check_ref(): if not isfile(hla_dat): fetch_hla_dat() - - if not isfile(hla_fa) or not isfile(hla_p): + build_convert(False) build_fasta() def fetch_hla_dat(): @@ -84,18 +86,23 @@ def fetch_hla_dat(): if isdir(IMGTHLA): run_command(['rm', '-rf', IMGTHLA]) - command = ['git', 'clone', IMGTHLA_git, IMGTHLA] + command = ['git', 'lfs clone', IMGTHLA_git, IMGTHLA] run_command(command, - '[reference] cloning IMGT/HLA database:') + '[reference] Cloning IMGT/HLA database:') + #run_command(['git', '-C', IMGTHLA, 'lfs', 'fetch']) -def checkout_version(commithash): +def checkout_version(commithash, verbose = True): '''Checks out a specific IMGTHLA github version given a commithash.''' if not isfile(hla_dat): fetch_hla_dat() - command = ['git', '-C', IMGTHLA, 'checkout', commithash] - run_command(command,'[reference] checking out IMGT/HLA:') + command = ['git', '-C', IMGTHLA, 'lfs checkout', commithash] + if verbose: + run_command(command, '[reference] Checking out IMGT/HLA:') + else: + run_command(command) + #run_command(['git', '-C', IMGTHLA, 'lfs', 'fetch']) def hla_dat_version(print_version = False): '''Returns commithash of downloaded IMGTHLA database.''' @@ -216,6 +223,43 @@ def process_hla_dat(): return (complete_alleles, partial_alleles, gene_set, sequences, utrs, exons, final_exon_length) +def process_hla_nom(hla_nom): + '''Processes nomenclature files for arcasHLA convert.''' + allele_to_group = defaultdict(dict) + + single_alleles = set() + grouped_alleles = set() + + for line in open(hla_nom, 'r', encoding='UTF-8'): + if line.startswith('#'): + continue + + gene, alleles, group = line.split(';') + alleles = [gene + allele for allele in alleles.split('/')] + if len(group) == 1: + single_alleles.add(alleles[0]) + continue + + group = gene + group[:-1] + + for allele in alleles: + grouped_alleles.add(process_allele(allele,2)) + for i in range(2,5): + allele_to_group[i][process_allele(allele,i)] = group + + # Alleles not included in a group + for allele in single_alleles: + # Checks if 2-field allele already represented by a group + if process_allele(allele,2) not in grouped_alleles: + for i in range(2,5): + allele_to_group[i][process_allele(allele,i)] = process_allele(allele,2) + else: + allele_to_group[2][allele] = process_allele(allele,3) + allele_to_group[3][allele] = process_allele(allele,3) + allele_to_group[4][allele] = allele + + return allele_to_group + #------------------------------------------------------------------------------- # Saving reference files #------------------------------------------------------------------------------- @@ -253,7 +297,7 @@ def build_fasta(): log.info('[reference] IMGT/HLA database version:\n') hla_dat_version(True) - log.info('[reference] processing IMGT/HLA database') + log.info('[reference] Processing IMGT/HLA database') # Constructs cDNA sequences for alleles and adds UTRs to the set of # non-coding sequences @@ -298,6 +342,7 @@ def complete_records(cDNA, other): allele_idx = dict() lengths = dict() + # Adds coding sequences cDNA = sorted(cDNA.items(), key=lambda x:x[1]) for i,(seq,alleles) in enumerate(cDNA): idx = str(i) @@ -309,6 +354,7 @@ def complete_records(cDNA, other): allele_idx[idx] = sorted(alleles) lengths[idx] = len(seq) + # Adds UTRs offset = i + 1 for i, seq in enumerate(other): idx = str(i + offset) @@ -330,6 +376,7 @@ def partial_records(sequences, other): allele_idx = dict() lengths = dict() + # Adds exon combination sequences offset = 0 for exon in sorted(sequences): exon_sequences = sorted(sequences[exon].items(),key=lambda x:x[1]) @@ -349,6 +396,7 @@ def partial_records(sequences, other): offset += i + 1 + # Adds UTRs for i, seq in enumerate(other): idx = str(i + offset) @@ -373,6 +421,7 @@ def partial_records(sequences, other): combo = {str(i):defaultdict(set) for i in exon_combinations} other = set() + # Build sequences for each allele for allele in complete_alleles: build_complete(allele) build_combination(allele) @@ -385,25 +434,40 @@ def partial_records(sequences, other): other = list(other) gene_length = {g:get_mode(lengths) for g, lengths in gene_length.items()} - log.info('[reference] building HLA database') - - seq_out, allele_idx, lengths = complete_records(cDNA, other) - + log.info('[reference] Building HLA database') + seq_out, allele_idx, lengths = complete_records(cDNA, other) write_reference(seq_out, [gene_set, allele_idx, lengths, gene_length], hla_fa, hla_idx, hla_p, 'complete') - log.info('[reference] building partial HLA database') + log.info('[reference] Building partial HLA database') seq_out, allele_idx, lengths, exon_idx = partial_records(combo, other) - partial_exons = {allele:exons[allele] for allele in partial_alleles} write_reference(seq_out, [gene_set, allele_idx, exon_idx, lengths, partial_exons, partial_alleles], partial_fa, partial_idx, partial_p, 'partial') + +def build_convert(reset=False): + '''Creates conversion tables for arcasHLA convert.''' + + log.info('[reference] Building nomenclature conversion tables.') + + if reset: + commit = hla_dat_version() + checkout_version('origin', False) + + p_group = process_hla_nom(hla_nom_p) + g_group = process_hla_nom(hla_nom_g) + + with open(hla_convert, 'wb') as file: + pickle.dump([p_group,g_group], file) + + if reset: + checkout_version(commit, False) #------------------------------------------------------------------------------- # Main @@ -468,21 +532,26 @@ def partial_records(sequences, other): if args.update: log.info('[reference] Updating HLA reference') checkout_version('origin') + build_convert(False) build_fasta() + elif args.rebuild: + build_convert() build_fasta() elif args.version: if args.version not in versions: sys.exit('[reference] Error: invalid version.') - checkout_version(versions[args.version]) - build_fasta() + #checkout_version(versions[args.version]) + #build_fasta() + build_convert() elif args.commit: check_ref() checkout_version(args.commit) build_fasta() + build_convert() else: check_ref() From bf5bd3a99156cfbb92825fb320a903460e6783f4 Mon Sep 17 00:00:00 2001 From: Rose Orenbuch Date: Mon, 24 Jun 2019 20:45:16 +0000 Subject: [PATCH 5/6] Added convert command to arcasHLA launching script. --- arcasHLA | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arcasHLA b/arcasHLA index c9a093c..209e391 100755 --- a/arcasHLA +++ b/arcasHLA @@ -70,6 +70,10 @@ elif [ "$1" == "partial" ]; then python3 ${ARCASHLA_ROOT_DIR}/scripts/partial.py ${@:2} +elif [ "$1" == "convert" ]; then + + python3 ${ARCASHLA_ROOT_DIR}/scripts/convert.py ${@:2} + #----------------------------------------------------------------------------- # Usage #----------------------------------------------------------------------------- From 301085eec9fd10babb3096882c43b126e98c862d Mon Sep 17 00:00:00 2001 From: Rose Orenbuch Date: Wed, 26 Jun 2019 19:14:52 +0000 Subject: [PATCH 6/6] Added information on updated merge module and new convert module to README; updated documentation. --- README.md | 54 ++++++++++---- arcasHLA | 5 +- scripts/align.py | 29 ++++---- scripts/arcas_utilities.py | 5 +- scripts/convert.py | 147 +++++++++++++++++++------------------ scripts/extract.py | 8 +- scripts/genotype.py | 38 ++++++---- scripts/merge.py | 4 +- scripts/partial.py | 28 ++++--- scripts/reference.py | 11 ++- 10 files changed, 187 insertions(+), 142 deletions(-) diff --git a/README.md b/README.md index 22d751e..92f01d1 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,10 @@ arcasHLA performs high resolution genotyping for HLA class I and class II genes from RNA sequencing, supporting both paired and single-end samples. ### Dependencies ### +arcasHLA requires the following utilities: +- [Git Large File Storage](https://github.com/git-lfs/git-lfs/wiki/Installation) +- coreutils + Make sure the following programs are in your `PATH`: - [Samtools v1.19](http://www.htslib.org/) - [bedtools v2.27.1](http://bedtools.readthedocs.io/) @@ -16,8 +20,6 @@ arcasHLA requires the following Python modules: - SciPy - Pandas -arcasHLA also requires coreutils. - ### Test ### In order to test arcasHLA partial typing, we need to roll back the reference to an earlier version. First, fetch IMGT/HLA database version 3.24.0: ``` @@ -55,16 +57,10 @@ Expected output in `test/output/test.partial_genotype.json`: "DQB1": ["DQB1*06:04:01", "DQB1*02:02:01"], "DRB1": ["DRB1*03:02:01", "DRB1*14:02:01"]} ``` -Before further usage, remember to update to the 3.34.0. -``` -./arcasHLA reference --version 3.34.0 +Remember to update the HLA reference using the following command. ``` -At this time, cloning the latest version of IMGTHLA (3.35.0) results in a corrupted database due to an issue with GitHub's Large File Storage. To update the arcasHLA's reference to the current version, run the following commands. +./arcasHLA reference --update ``` -curl https://media.githubusercontent.com/media/ANHIG/IMGTHLA/Latest/hla.dat > dat/IMGTHLA/hla.dat -./arcasHLA reference --rebuild --v -``` - ### Usage ### @@ -124,12 +120,13 @@ arcasHLA genotype [options] /path/to/sample.alignment.p #### Options #### - `-g, --genes GENES` : comma separated list of HLA genes (ex. A,B,C,DQA1,DQB1,DRB1) - `-p, --population POPULATION` : sample population, options are asian_pacific_islander, black, caucasian, hispanic, native_american and prior (default: Prior) +- `--min_count INT` : minimum gene read count required for genotyping (default: 75) - `--tolerance FLOAT` : convergence tolerance for transcript quantification (default: 10e-7) - `--max_iterations INT` : maximmum number of iterations for transcript quantification (default: 1000) -- `--drop_iterations INT` : number of iterations before dropping low support alleles, a lower number of iterations is recommended for single-end and low read couunt samples (default: paired - 10, single - 4) +- `--drop_iterations INT` : number of iterations before dropping low support alleles, a lower number of iterations is recommended for single-end and low read count samples (default: paired - 10, single - 4) - `--drop_threshold FLOAT` : proportion of maximum abundance an allele needs to not be dropped (default: 0.1) - `--zygosity_threshold FLOAT` : threshold for ratio of minor to major allele nonshared count to determine zygosity (default: 0.15) -- `--log FILE` : log file for run summary (default: sample.genotype.log) +- `--log FILE` : log file for run summary (default: `sample.genotype.log`) - `--o, --outdir DIR` : output directory (default: `.`) - `--temp DIR` : temp directory (default: `/tmp`) - `--keep_files` : keep intermediate files (default: False) @@ -148,7 +145,7 @@ Output: `sample.partial_alignment.p`, `sample.partial_genotype.json` The options for partial typing are the same as genotype. Partial typing can be run from the intermediate alignment file. ### Merge jsons ### -To make analysis easier, this command will merge all jsons produced by genotyping. All `.genotype.json` files will be merged into a single `run.genotypes.json` file and all `.partial_genotype.json` files will be merged into `run.partial_genotypes.json`. +To make analysis easier, this command will merge all jsons produced by genotyping into a single table. All `.genotype.json` files will be merged into a single `run.genotypes.tsv` file and all `.partial_genotype.json` files will be merged into `run.partial_genotypes.tsv`. In addition, HLA locus read counts and relative abundance produced by alignment will be merged into a single tsv file. ``` arcasHLA merge [options] ``` @@ -156,7 +153,25 @@ arcasHLA merge [options] - `--run RUN` : run name - `--i, --indir DIR` : input directory (default: `.`) - `--o, --outdir DIR` : output directory (default: `.`) -- `-v, --verbose` : verbosity (default: False) +- `-v, --verbose` : toggle verbosity + +### Convert HLA nomenclature ### +arcasHLA convert changes alleles in a tsv file from its input form to a specified grouped nomenclature (P-group or G-group) or a specified number of fields (i.e. 1, 2 or 3 fields in resolution). This file can be produced by arcasHLA merge or any tsv following the same structure: + +| subject | A1 | A2 | B1 | B2 | C1 | C2 | +|-------------- |------------ |------------ |------------ |------------ |------------ |------------ | +| subject_name | A*01:01:01 | A*01:01:01 | B*07:02:01 | B*07:02:01 | C*04:01:01 | C*04:01:01 | + +P-group (alleles sharing the same amino acid sequence in the antigen-binding region) and G-group (alleles sharing the same base sequence in the antigen-binding region) can only be reduced to 1-field resolution as alleles with differing 2nd fields can be in the same group. By the same reasoning, P-group cannot be converted into G-group. + +``` +arcasHLA convert --resolution [resolution] genotypes.tsv +``` +#### Options #### +- `-r, --resolution RESOLUTION` : output resolution (1, 2, 3) or grouping (g-group, p-group) +- `-o, --outfile FILE` : output file (default: `./run.resolution.tsv`) +- `-f, --force` : force conversion for grouped alleles even if it results in loss of resolution +- `-v, --verbose` : toggle verbosity ### Change reference ### To update the reference to the latest IMGT/HLA version, run @@ -174,6 +189,14 @@ If you suspect there is an issue with the reference files, rebuild the referenc ``` arcasHLA reference --rebuild ``` + +Note: if your reference was built with arcasHLA version <= 0.1.1 and you wish to change your reference to versions >= 3.35.0, it may be necessary to remove the IMGTHLA folder due to the need for Git Large File Storage to properly download hla.dat. + +``` +rm -rf dat/IMGTHLA +arcasHLA reference --update +``` + #### Options #### - `--update` : update to latest IMGT/HLA version - `--version` : checkout IMGT/HLA version using commithash @@ -181,5 +204,4 @@ arcasHLA reference --rebuild - `-v, --verbose` : verbosity (default: False) ## Citation ## -R. Orenbuch, I. Filip, D. Comito, J. Shaman, I. Pe’er, and R. Rabadan. arcasHLA: -high resolution HLA typing from RNA seq. bioRxiv doi: [10.1101/479824](https://doi.org/10.1101/479824) +Orenbuch R, Filip I, Comito D, et al (2019) arcasHLA: high resolution HLA typing from RNA seq. Bioinformatics doi:[10.1093/bioinformatics/btz474](http://dx.doi.org/10.1093/bioinformatics/btz474) diff --git a/arcasHLA b/arcasHLA index 209e391..42999b3 100755 --- a/arcasHLA +++ b/arcasHLA @@ -4,8 +4,8 @@ #----------------------------------------------------------------------------- # # Authors: Rose Orenbuch, Ioan Filip, Itsik Pe'er -# Date: 2019-05-07 -# Version: 0.1.1 +# Date: 2019-06-26 +# Version: 0.2.0 # License: GNU # #----------------------------------------------------------------------------- @@ -85,6 +85,7 @@ else echo " genotype types HLA genes from extracted reads" echo " partial types partial HLA genes from extracted reads" echo " merge processes results into a tab-separated table" + echo " convert converts HLA nomenclature/resolution" echo " reference check or update HLA reference" echo echo "Note: run any command with --help to view required fields, options" diff --git a/scripts/align.py b/scripts/align.py index 1fd8942..9ba44c3 100644 --- a/scripts/align.py +++ b/scripts/align.py @@ -43,8 +43,8 @@ from reference import check_ref, get_exon_combinations from arcas_utilities import * -__version__ = '0.1.1' -__date__ = '2019-05-07' +__version__ = '0.2.0' +__date__ = '2019-06-26' #------------------------------------------------------------------------------- # Paths and filenames @@ -69,17 +69,12 @@ def analyze_reads(fqs, paired, reads_file): log.info('[alignment] Analyzing read length') if paired: fq1, fq2 = fqs - - command = [cat, '<', fq1, awk, '>' , reads_file] - run_command(command) - - command = [cat, '<', fq2, awk, '>>', reads_file] - run_command(command) + run_command([cat, '<', fq1, awk, '>' , reads_file]) + run_command([cat, '<', fq2, awk, '>>', reads_file]) else: fq = fqs[0] - command = [cat, '<', fq, awk, '>', reads_file] - run_command(command) + run_command([cat, '<', fq, awk, '>', reads_file]) read_lengths = np.genfromtxt(reads_file) @@ -90,7 +85,6 @@ def analyze_reads(fqs, paired, reads_file): avg = round(np.mean(read_lengths), 6) std = round(np.std(read_lengths), 6) - return num, avg, std def pseudoalign(fqs, sample, paired, reference, outdir, temp, threads): @@ -232,6 +226,7 @@ def get_count_stats(eq_idx, gene_length): return stats def alignment_summary(align_stats, partial = False): + '''Prints alignment summary to log.''' count_unique, count_multi, total, _, _ = align_stats log.info('[alignment] Processed {:.0f} reads, {:.0f} pseudoaligned ' .format(total, count_unique + count_multi)+ @@ -241,7 +236,7 @@ def alignment_summary(align_stats, partial = False): .format(count_unique)) def gene_summary(gene_stats): - # Print gene abundance information + '''Prints gene read count and relative abundance to log.''' log.info('[alignment] Observed HLA genes:') @@ -253,7 +248,7 @@ def gene_summary(gene_stats): .format(g, a*100, c, e)) def get_alignment(fqs, sample, reference, reference_info, outdir, temp, threads, partial = False): - + '''Runs pseudoalignment and processes output.''' paired = True if len(fqs) == 2 else False count_file = ''.join([temp, 'pseudoalignments.tsv']) @@ -266,6 +261,8 @@ def get_alignment(fqs, sample, reference, reference_info, outdir, temp, threads, outdir, temp, threads) + + # Process partial genotyping pseudoalignment if partial: (commithash, (gene_set, allele_idx, exon_idx, lengths, partial_exons, partial_alleles)) = reference_info @@ -287,6 +284,7 @@ def get_alignment(fqs, sample, reference, reference_info, outdir, temp, threads, align_stats, []] pickle.dump(alignment_info, file) + # Process regular pseudoalignment else: (commithash,(gene_set, allele_idx, lengths, gene_length)) = reference_info @@ -309,9 +307,14 @@ def get_alignment(fqs, sample, reference, reference_info, outdir, temp, threads, align_stats, gene_stats] pickle.dump(alignment_info, file) + with open(''.join([outdir,sample,'.genes.json']), 'w') as file: + json.dump(gene_stats, file) + return alignment_info def load_alignment(file, commithash, partial = False): + '''Loads previous pseudoalignment.''' + log.info(f'[alignment] Loading previous alignment %s', file) with open(file, 'rb') as file: diff --git a/scripts/arcas_utilities.py b/scripts/arcas_utilities.py index 5c90166..a666420 100644 --- a/scripts/arcas_utilities.py +++ b/scripts/arcas_utilities.py @@ -28,8 +28,8 @@ import uuid from subprocess import PIPE, STDOUT, run -__version__ = '0.1.1' -__date__ = '2019-05-07' +__version__ = '0.2.0' +__date__ = '2019-06-26' #------------------------------------------------------------------------------- @@ -93,6 +93,7 @@ def run_command(command, message = ''): return output def create_temp(temp): + '''Generates name for temporary folder.''' temp = check_path(temp) temp_folder = ''.join([temp,'arcas_' + str(uuid.uuid4())]) return check_path(temp_folder) diff --git a/scripts/convert.py b/scripts/convert.py index 1d8194d..566d096 100644 --- a/scripts/convert.py +++ b/scripts/convert.py @@ -36,8 +36,8 @@ #------------------------------------------------------------------------------- -__version__ = '0.2' -__date__ = '2019-04-02' +__version__ = '0.2.0' +__date__ = '2019-06-26' #------------------------------------------------------------------------------- # Paths and fileames @@ -49,6 +49,63 @@ #------------------------------------------------------------------------------- +def convert_allele(allele, resolution): + '''Checks nomenclature of input allele and returns converted allele.''' + i = len(allele.split(':')) + + # Input: P-group allele + if allele[-1] == 'P': + if resolution == 'g-group': + sys.exit('[convert] Error: p-group cannot be converted ' + + 'to g-group.') + + # Output: 1-field allele unless forced + elif type(resolution) == int: + if resolution > 1 and not args.force: + sys.exit('[convert] Error: p-group cannot be ' + + 'converted to %.0f fields.' %resolution) + allele = process_allele(allele[:-1], resolution) + + # Input: G-group allele + elif allele[-1] == 'G': + + # Output: 1-field allele unless forced + if type(resolution) == int: + if resolution > 1 and not args.force: + sys.exit('[convert] Error: g-group cannot be converted' + + 'to %.0f fields.' %resolution) + allele = process_allele(allele[:-1], resolution) + + # Output: P-group allele + elif resolution == 'p-group': + if allele[:-1] in p_group[i]: + allele = p_group[i][allele[:-1]] + + elif process_allele(allele[:-1], i - 1) in p_group[i]: + allele = p_group[i][process_allele(allele[:-1], i -1)] + + # Input: ungrouped allele + # Output: G-group allele + elif resolution == 'g-group': + if allele in g_group[i]: + allele = g_group[i][allele] + elif allele[-1] != 'N': + allele = process_allele(allele,3) + + # Input: ungrouped allele + # Output: P-group allele + elif resolution == 'p-group': + if allele in p_group[i]: + allele = p_group[i][allele] + + # Input: ungrouped allele + # Output: reduced resolution, ungrouped allele + elif type(resolution) == int: + allele = process_allele(allele, resolution) + + return allele + +#------------------------------------------------------------------------------- if __name__ == '__main__': @@ -58,8 +115,8 @@ formatter_class=RawTextHelpFormatter) parser.add_argument('file', - help='tsv containing HLA genotypes, see github for ' \ - + example file structure.\n\n', + help='tsv containing HLA genotypes, see github for ' + + 'example file structure.\n\n', type=str) parser.add_argument('-h', @@ -67,24 +124,25 @@ action = 'help', help='show this help message and exit\n\n', default=argparse.SUPPRESS) + + parser.add_argument('-r', + '--resolution', + help = 'output resolution (1,2,3) or grouping ' + + '(g-group, p-group)\n\n', + metavar='') parser.add_argument('-o', '--outfile', type=str, - help='output file\n ' \ - + 'default: ./file_basename.resolution.tsv\n\n', + help='output file\n default: ' + + './file_basename.resolution.tsv\n\n', default='', metavar='') - - parser.add_argument('--resolution', - help = 'output resolution (1,2,3,4) or grouping ' \ - + '(g-group, p-group)\n\n', - default='2', - metavar='') - parser.add_argument('--force', - help = 'force conversion for grouped alleles even if ' \ - + 'it results in loss of resolution', + parser.add_argument('-f', + '--force', + help = 'force conversion for grouped alleles even if ' + + 'it results in loss of resolution', action = 'count', default=False) @@ -110,8 +168,8 @@ resolution = args.resolution.lower() if not resolution: - sys.exit('[convert] Error: output resolution is needed ' \ - + '(1, 2, 3, g-group, p-group).') + sys.exit('[convert] Error: output resolution is needed ' + + '(1, 2, 3, g-group, p-group).') # Create outfile name @@ -133,60 +191,7 @@ if type(allele) != str: continue - i = len(allele.split(':')) - - # If input allele is in P-group nomenclature, it can only be - # converted to 1 field without loss - if allele[-1] == 'P': - if resolution == 'g-group': - sys.exit('[convert] Error: p-group cannot be converted ' \ - + 'to g-group.') - - elif type(resolution) == int and resolution > 1: - if not args.force: - sys.exit('[convert] Error: p-group cannot be ' \ - + 'converted to %.0f fields.' %resolution) - allele = process_allele(allele[:-1], resolution) - - elif type(resolution) == int: - allele = process_allele(allele, resolution) - - # If input allele is in G-group nomenclature, it can only be - # converted to 1 field without loss - elif allele[-1] == 'G': - - if type(resolution) == int and resolution > 1: - if not args.force: - sys.exit('[convert] Error: g-group cannot be converted'\ - + 'to %.0f fields.' %resolution) - allele = process_allele(allele[:-1], resolution) - - elif resolution == 'p-group': - if allele[:-1] in p_group[i]: - allele = p_group[i][allele[:-1]] - - elif process_allele(allele[:-1], i - 1) in p_group[i]: - allele = p_group[i][process_allele(allele[:-1], i -1)] - - elif type(resolution) == int: - allele = process_allele(allele, resolution) - - # If allele not in G-group and not null, reduce resolution - # to 3-fields - elif resolution == 'g-group': - if allele in g_group[i]: - allele = g_group[i][allele] - elif allele[-1] != 'N': - allele = process_allele(allele,3) - - elif resolution == 'p-group': - if allele in p_group[i]: - allele = p_group[i][allele] - - elif type(resolution) == int: - allele = process_allele(allele, resolution) - - genotypes[subject][gene] = allele + genotypes[subject][gene] = convert_allele(allele, resolution) pd.DataFrame(genotypes).T.rename_axis('subject').to_csv(outfile, sep = '\t') diff --git a/scripts/extract.py b/scripts/extract.py index 68a3daa..8b6ee37 100644 --- a/scripts/extract.py +++ b/scripts/extract.py @@ -34,8 +34,8 @@ from argparse import RawTextHelpFormatter from arcas_utilities import * -__version__ = '0.1.1' -__date__ = '2019-05-07' +__version__ = '0.2.0' +__date__ = '2019-06-26' #------------------------------------------------------------------------------- # Extract Reads @@ -218,14 +218,13 @@ def extract_reads(bam, outdir, paired, unmapped, alts, temp, threads): datDir = os.path.dirname(os.path.realpath(__file__)) + '/../dat/' + # Set up log file if args.log: log_file = args.log else: log_file = ''.join([outdir,sample,'.extract.log']) - with open(log_file, 'w'): pass - if args.verbose: handlers = [log.FileHandler(log_file), log.StreamHandler()] @@ -248,6 +247,7 @@ def extract_reads(bam, outdir, paired, unmapped, alts, temp, threads): .format( 'paired' if args.paired else 'single')) hline() + # Load names of regions outside chr6 with HLA loci with open(datDir + '/info/decoys_alts.p', 'rb') as file: alts = pickle.load(file) diff --git a/scripts/genotype.py b/scripts/genotype.py index 508212e..a6f9717 100644 --- a/scripts/genotype.py +++ b/scripts/genotype.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- #------------------------------------------------------------------------------- -# genotype.py: genotypes from extracted chromosome 6 reads. +# genotype.py: genotypes from extracted reads. #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- @@ -44,8 +44,8 @@ from arcas_utilities import * from align import * -__version__ = '0.1.1' -__date__ = '2019-05-07' +__version__ = '0.2.0' +__date__ = '2019-06-26' #------------------------------------------------------------------------------- # Paths and filenames @@ -594,6 +594,13 @@ def arg_check_threshold(parser, arg): default=0.15, metavar='') + parser.add_argument('--min_count', + type=int, + help='minimum gene read count required for genotyping ' + + '\n default: 75\n\n', + default=75, + metavar='') + parser.add_argument('-o', '--outdir', type=str, @@ -626,21 +633,18 @@ def arg_check_threshold(parser, arg): args = parser.parse_args() if len(args.file) == 0: - sys.exit('[genotype] Error: FASTQ or alignment.p file required') + sys.exit('[genotype] Error: FASTQ or alignment.p file required.') + # Set up temporary and output folders, log file sample = os.path.basename(args.file[0]).split('.')[0] - outdir = check_path(args.outdir) temp = create_temp(args.temp) - if args.log: log_file = args.log else: - log_file = ''.join([outdir,sample,'.genotype.log']) - + log_file = ''.join([outdir,sample,'.genotype.log']) with open(log_file, 'w'): pass - if args.verbose: handlers = [log.FileHandler(log_file), log.StreamHandler()] @@ -661,14 +665,15 @@ def arg_check_threshold(parser, arg): log.info(f'[log] Sample: %s', sample) log.info(f'[log] Input file(s): %s', f'\n\t\t '.join(args.file)) + # Load HLA frequencies prior = pd.read_csv(hla_freq, delimiter='\t') prior = prior.set_index('allele').to_dict('index') - # checks if HLA reference exists + # Checks if HLA reference exists check_path(rootDir + 'dat/ref') check_ref() - # loads reference information + # Loads reference information with open(hla_p, 'rb') as file: reference_info = pickle.load(file) (commithash,(gene_set, allele_idx, @@ -685,33 +690,38 @@ def arg_check_threshold(parser, arg): commithash, eq_idx, allele_eq, paired, align_stats, gene_stats = alignment_info + # Set up EM parameters if not args.drop_iterations: if paired: args.drop_iterations = 20 else: args.drop_iterations = 4 em_results = dict() genotypes = dict() - + hline() log.info('[genotype] Genotyping parameters:') log.info(f'\t\tpopulation: %s', args.population) + log.info(f'\t\tminimum count: %s', args.min_count) log.info(f'\t\tmax iterations: %s', args.max_iterations) log.info(f'\t\ttolerance: %s', args.tolerance) log.info(f'\t\tdrop iterations: %s', args.drop_iterations) log.info(f'\t\tdrop threshold: %s', args.drop_threshold) log.info(f'\t\tzygosity threshold: %s', args.zygosity_threshold) + # For each HLA locus, perform EM then scoring for gene in args.genes: hline() log.info(f'[genotype] Genotyping HLA-{gene}') - if gene not in gene_stats or gene_stats[gene][0] == 0: - log.info(f'[genotype] No reads aligned to HLA-{gene}') + # Skips loci with not enough reads to genotype + if gene not in gene_stats or gene_stats[gene][0] < args.min_count: + log.info(f'[genotype] Not enough reads aligned to HLA-{gene} to genotype.') continue gene_count, eq_count, abundance = gene_stats[gene] log.info(f'[genotype] {gene_count:.0f} reads aligned to HLA-{gene} '+ f'in {eq_count} classes') + em, genotype = genotype_gene(gene, gene_count, eq_idx[gene], diff --git a/scripts/merge.py b/scripts/merge.py index f3a7394..b7836c5 100644 --- a/scripts/merge.py +++ b/scripts/merge.py @@ -32,8 +32,8 @@ from arcas_utilities import check_path -__version__ = '0.1.1' -__date__ = '2019-05-07' +__version__ = '0.2.0' +__date__ = '2019-06-26' #------------------------------------------------------------------------------- diff --git a/scripts/partial.py b/scripts/partial.py index 9821673..345d459 100644 --- a/scripts/partial.py +++ b/scripts/partial.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- #------------------------------------------------------------------------------- -# genotype.py: genotypes from extracted chromosome 6 reads. +# partial.py: genotypes partial alleles from extracted reads. #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- @@ -43,8 +43,8 @@ from align import * from genotype import expectation_maximization -__version__ = '0.1.1' -__date__ = '2019-05-07' +__version__ = '0.2.0' +__date__ = '2019-06-26' #------------------------------------------------------------------------------- # Paths and filenames @@ -65,13 +65,16 @@ def filter_eqs(complete_genotypes, allele_idx, eq_idx, partial_alleles): at least one predicted complete allele. ''' + # All alleles returned from arcasHLA genotype all_predicted = {allele for alleles in complete_genotypes.values() for allele in alleles} + # Find the indices of sequences derived from these alleles wanted_indices = {index for index, alleles in allele_idx.items() if alleles and (set(alleles) & (partial_alleles | set(all_predicted)))} + # For each eq group, select only those that include the previous indices filtered_eqs = dict() for group, eq_list in eq_idx.items(): filtered_eqs[group] = dict() @@ -89,6 +92,7 @@ def filter_eqs(complete_genotypes, allele_idx, eq_idx, partial_alleles): filtered.append([indices,count]) filtered_eqs[group][gene] = filtered + # Find indices given an allele allele_eq = {group:defaultdict(set) for group in filtered_eqs.keys()} for group, eq_list in filtered_eqs.items(): for gene in eq_list: @@ -427,19 +431,16 @@ def arg_check_threshold(parser, arg): if len(args.file) == 0: sys.exit('[genotype] Error: FASTQ or partial_alignment.p file required') + # Set up directories and log file sample = os.path.basename(args.file[0]).split('.')[0] - outdir = check_path(args.outdir) temp = create_temp(args.temp) - if args.log: log_file = args.log else: - log_file = ''.join([outdir,sample,'.partial_genotype.log']) - + log_file = ''.join([outdir,sample,'.partial_genotype.log']) with open(log_file, 'w'): pass - if args.verbose: handlers = [log.FileHandler(log_file), log.StreamHandler()] @@ -463,10 +464,10 @@ def arg_check_threshold(parser, arg): prior = pd.read_csv(hla_freq, delimiter='\t') prior = prior.set_index('allele').to_dict('index') - # checks if HLA reference exists + # Checks if HLA reference exists check_ref() - # loads reference information + # Loads reference information with open(partial_p, 'rb') as file: reference_info = pickle.load(file) (commithash, (gene_set, allele_idx, exon_idx, @@ -475,7 +476,7 @@ def arg_check_threshold(parser, arg): log.info(f'[log] Reference: %s', commithash) hline() - # runs transcript assembly if intermediate json not provided + # Runs transcript assembly if intermediate json not provided if args.file[0].endswith('.partial_alignment.p'): alignment_info = load_alignment(args.file[0], commithash, True) else: @@ -483,17 +484,20 @@ def arg_check_threshold(parser, arg): reference_info, outdir, temp, args.threads, True) commithash, eq_idx, _, paired, align_stats, _ = alignment_info - + + # Load alleles from arcasHLA genotype with open(args.genotype,'r') as file: complete_genotypes = json.load(file) genes = set(args.genes) & set(complete_genotypes.keys()) + # Filter out alleles not returned by arcasHLA genotype eq_idx, allele_eq = filter_eqs(complete_genotypes, allele_idx, eq_idx, partial_alleles) partial_results = dict() + # For each HLA locus, check for possible partial alleles for gene in sorted(genes): hline() log.info(f'[genotype] Partial genotyping HLA-{gene}') diff --git a/scripts/reference.py b/scripts/reference.py index 45b107d..2f5f039 100644 --- a/scripts/reference.py +++ b/scripts/reference.py @@ -43,8 +43,8 @@ from arcas_utilities import * -__version__ = '0.1.1' -__date__ = '2019-05-07' +__version__ = '0.2.0' +__date__ = '2019-06-26' #------------------------------------------------------------------------------- # Paths and fileames @@ -89,7 +89,6 @@ def fetch_hla_dat(): command = ['git', 'lfs clone', IMGTHLA_git, IMGTHLA] run_command(command, '[reference] Cloning IMGT/HLA database:') - #run_command(['git', '-C', IMGTHLA, 'lfs', 'fetch']) def checkout_version(commithash, verbose = True): '''Checks out a specific IMGTHLA github version given a commithash.''' @@ -102,7 +101,6 @@ def checkout_version(commithash, verbose = True): run_command(command, '[reference] Checking out IMGT/HLA:') else: run_command(command) - #run_command(['git', '-C', IMGTHLA, 'lfs', 'fetch']) def hla_dat_version(print_version = False): '''Returns commithash of downloaded IMGTHLA database.''' @@ -134,6 +132,7 @@ def process_hla_dat(): with open(hla_dat, 'r', encoding='UTF-8') as file: lines = file.read().splitlines() + # Check if hla.dat failed to download if len(lines) < 10: sys.exit('[reference] Error: dat/IMGTHLA/hla.dat empty or corrupted.') @@ -543,8 +542,8 @@ def build_convert(reset=False): elif args.version: if args.version not in versions: sys.exit('[reference] Error: invalid version.') - #checkout_version(versions[args.version]) - #build_fasta() + checkout_version(versions[args.version]) + build_fasta() build_convert() elif args.commit: