added LOH options to quant module

RabadanLab · Mar 4, 2022 · 157fabb · 157fabb
1 parent a2036a2
commit 157fabb
Showing 1 changed file with 70 additions and 3 deletions.
diff --git a/scripts/quant.py b/scripts/quant.py
@@ -116,6 +116,11 @@ def arg_check_files(parser, arg):
                         help='keep intermediate files\n\n',
                         default=False)
 
+    parser.add_argument('--single',
+                        action='store_true',
+                        help='Include flag if single-end reads. Default is paired-end.\n\n',
+                        default=False)
+
     parser.add_argument('-l',
                         '--avg', 
                         type=int,
@@ -130,11 +135,22 @@ def arg_check_files(parser, arg):
                              'for single-end reads\n  default: 20\n\n',
                         default=20)
 
-    parser.add_argument('--single',
+    parser.add_argument('--LOH',
                         action='store_true',
-                        help='Include flag if single-end reads. Default is paired-end.\n\n',
+                        help='Include flag for estimated loss of heterozygosity. ' +
+                             'Must provide purity and ploidy estimates.\n\n',
                         default=False)
 
+    parser.add_argument('--purity',
+                        type=float,
+                        help='Estimated purity of sample\n  default: 1.0\n\n',
+                        default=1.0)
+
+    parser.add_argument('--ploidy',
+                        type=int,
+                        help='Estimated ploidy of sample\n  default: 2.0\n\n',
+                        default=2.0)
+
     parser.add_argument('-t',
                         '--threads', 
                         type = str,
@@ -165,6 +181,7 @@ def arg_check_files(parser, arg):
     gene_results_json = outdir + sample + '.quant.genes.json'
     allele_results_tsv = outdir + sample + '.quant.alleles.tsv'
     gene_results_tsv = outdir + sample + '.quant.genes.tsv'
+    loh_results_tsv = outdir + sample + '.quant.loh.tsv'
 
 
     with open(indv_p, 'rb') as file:
@@ -225,7 +242,7 @@ def arg_check_files(parser, arg):
             tpm[gene] += kallisto_results.loc[int(idx)]['tpm']
 
     gene_results = {gene:defaultdict(int) for gene in genes}
-    
+
     allele_results = {gene:defaultdict(float) for gene in genes}
 
     total_hla_count = 0
@@ -269,4 +286,54 @@ def arg_check_files(parser, arg):
         json.dump(gene_results, file)
 
     if not args.keep_files: run_command(['rm -rf', temp])
+
+    # LOH functionality
+    if(args.LOH):
+        corrections_columns = []
+
+        for gene in genes:
+            corrections_columns.append(gene+"_CN_1")
+            corrections_columns.append(gene+"_CN_2")
+            corrections_columns.append(gene+"_LOSS")
+            corrections_columns.append(gene+"_lost")
+
+        corrections_df=pd.DataFrame(columns = corrections_columns)
+
+        for gene in genes:
+            baf1 = allele_results[gene]['allele1_count'] / (allele_results[gene]['allele1_count'] + \
+                                                        allele_results[gene]['allele2_count'])
+            baf2 = 1-baf1
+
+            correction1 = (2*baf1*(1 + args.purity*(args.ploidy - 2)/2) + args.purity - 1)/(args.purity)
+            correction2 = (2*baf2*(1 + args.purity*(args.ploidy - 2)/2) + args.purity - 1)/(args.purity)
+
+            if correction1 < correction2:
+                minor = correction1
+                major = correction2
+            else:
+                minor = correction2
+                major = correction1
+
+            corrections_df.at[0,gene+"_CN_1"] = correction1
+            corrections_df.at[0,gene+"_CN_2"] = correction2
+
+            if (correction1 < 0.5) or (correction2 < 0.5):
+                corrections_df.at[0,gene+"_LOSS"] = True
+
+                if (correction1 < 0.5) and (correction2 < 0.5):
+                    corrections_df.at[0,gene+"_lost"] = ",".join(allele_results[gene][['allele1', \
+                                                            'allele2']].tolist())
+
+                elif (correction1 < 0.5) :
+                    corrections_df.at[0,gene+"_lost"] = allele_results[gene]['allele1']
+
+                else:
+                    corrections_df.at[0,gene+"_lost"] = allele_results[gene]['allele2']
+
+            else:
+                corrections_df.at[0,gene+"_LOSS"] = False
+                corrections_df.at[0,gene+"_lost"] = "none"
+
+    corrections_df.to_csv(loh_results_tsv,sep='\t',index=False)
+
 #-----------------------------------------------------------------------------