davidaknowles · mdshw5 · Nov 16, 2018 · Nov 16, 2018 · Nov 16, 2018 · Nov 17, 2018
diff --git a/clustering/check_files.py b/clustering/check_files.py
@@ -29,7 +29,7 @@
     sys.stderr.write("%d junctions\n"%libs[lib])
 
 failed_junc = []
-threshold=max([len(x) for x in libChroms.values()])/1
+threshold=max([len(x) for x in list(libChroms.values())])/1
 for lib in libChroms:
     if len(libChroms[lib]) < threshold:
         failed_junc.append("rm "+lib)

diff --git a/clustering/get_cluster_gene.py b/clustering/get_cluster_gene.py
@@ -36,10 +36,10 @@ def get_feature(fname, feature = "exon"):
 
 ss2gene = get_feature(sys.argv[1], "exon")
 
-W = file("%s.clu2gene.txt"%sys.argv[2].split("_perind")[0],'w')
+W = open("%s.clu2gene.txt"%sys.argv[2].split("_perind")[0],'w')
 for ln in gzip.open(sys.argv[2]):
     if "chrom" in ln: continue
-    
+
     if len(ln.split()[0].split(":")) == 5:
         chrom, A, B, clu, strand = ln.split()[0].split(":")
     else:
@@ -52,7 +52,7 @@ def get_feature(fname, feature = "exon"):
     if (chrom, int(B)) in ss2gene:
         gs.append(ss2gene[(chrom, int(B))])
 
-    
+
     if len(gs) > 0:
         W.write("%s %s %s %s %s %.2f %.2f %.2f %.2f\n"%(clu,chrom, A,B, gs[0], mean, median, minAS, maxAS))
     else:

diff --git a/clustering/leafcutter_cluster.py b/clustering/leafcutter_cluster.py
@@ -7,7 +7,7 @@
 import shutil
 
 def main(options,libl):
-    
+
     pool_junc_reads(libl, options)
     refine_clusters(options)
     sort_junctions(libl, options)
@@ -23,11 +23,11 @@ def pool_junc_reads(flist, options):
     useStrand = options.strand
 
     outFile = "%s/%s_pooled"%(rundir,outPrefix)
-    
+
     chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
     by_chrom = {}
     for libl in flist:
-        
+
         lib = libl.strip()
         if not os.path.isfile(lib):
             continue
@@ -38,24 +38,24 @@ def pool_junc_reads(flist, options):
         if lib[-3:] == ".gz": F = gzip.open(lib)
         else: F = open(lib)
         for ln in F:
-            
+
             lnsplit=ln.split()
-            if len(lnsplit)<6: 
+            if len(lnsplit)<6:
                 sys.stderr.write("Error in %s \n" % lib)
                 continue
             chrom, A, B, dot, counts, strand = lnsplit
-            
+
             if not useStrand:
                 strand = "NA"
             if checkchrom and (chrom not in chromLst): continue
             A, B = int(A), int(B)+1
             if B-A > int(maxIntronLen): continue
             try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom, strand)][(A,B)]
-            except: 
+            except:
                 try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
                 except: by_chrom[(chrom, strand)] = {(A,B):int(counts)}
 
-    fout = file(outFile, 'w')
+    fout = open(outFile, 'w')
     Ncluster = 0
     sys.stderr.write("Parsing...\n")
     for chrom in by_chrom:
@@ -64,7 +64,7 @@ def pool_junc_reads(flist, options):
         sys.stderr.write("%s:%s.."%chrom)
         clu = cluster_intervals(read_ks)[0]
         for cl in clu:
-            if len(cl) > 1: # if cluster has more than one intron  
+            if len(cl) > 1: # if cluster has more than one intron
                 buf = '%s:%s '%chrom
                 for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
                     buf += "%d:%d" % interval + ":%d"%count+ " "
@@ -76,7 +76,7 @@ def pool_junc_reads(flist, options):
 
 def sort_junctions(libl, options):
 
-    chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] 
+    chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
     outPrefix = options.outprefix
     rundir = options.rundir
     refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
@@ -112,7 +112,7 @@ def sort_junctions(libl, options):
             merges[libN] = []
         merges[libN].append(lib)
 
-    fout_runlibs = file(runName+"_sortedlibs",'w')
+    fout_runlibs = open(runName+"_sortedlibs",'w')
 
     for libN in merges:
         libName = "%s/%s"%(rundir,libN.split('/')[-1])
@@ -121,24 +121,24 @@ def sort_junctions(libl, options):
 
         fout_runlibs.write(foutName+'\n')
 
-        if options.verbose:   
+        if options.verbose:
             sys.stderr.write("Sorting %s..\n"%libN)
         if len(merges[libN]) > 1:
-            if options.verbose:   
+            if options.verbose:
                 sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
         else:
             pass
         fout = gzip.open(foutName,'w')
 
-        fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
+        fout.write("chrom {junc}\n".format(junc=libN.split("/")[-1].split(".junc")[0]).encode())
 
         for lib in merges[libN]:
             if lib[-3:] == ".gz": F = gzip.open(lib)
             else: F = open(lib)
             for ln in F:
 
                 lnsplit=ln.split()
-                if len(lnsplit)<6: 
+                if len(lnsplit)<6:
                     sys.stderr.write("Error in %s \n" % lib)
                     continue
                 chrom, start, end, dot, count, strand = ln.split()
@@ -167,23 +167,23 @@ def sort_junctions(libl, options):
                 elif (start,end) in by_chrom[chrom]:
                     tot += by_chrom[chrom][(start,end)]
             for exon in ks:
-            
+
                 chrom, start, end = exon
                 start, end = int(start), int(end)
                 chromID, strand = chrom
                 if chrom not in by_chrom:
                     buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
-                elif (start,end) in by_chrom[chrom]:                
+                elif (start,end) in by_chrom[chrom]:
                     buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
                 else:
                     buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
-        
-            fout.write("".join(buf))
+
+            fout.write("".join(buf).encode())
         fout.close()
     fout_runlibs.close()
 
 def refine_clusters(options):
-    
+
     outPrefix = options.outprefix
     rundir = options.rundir
     minratio = float(options.mincluratio)
@@ -192,7 +192,7 @@ def refine_clusters(options):
     inFile = "%s/%s_pooled"%(rundir,outPrefix)
     outFile = "%s/%s_refined"%(rundir,outPrefix)
 
-    fout = file(outFile,'w')
+    fout = open(outFile,'w')
     Ncl = 0
     for ln in open(inFile):
         clu = []
@@ -206,7 +206,7 @@ def refine_clusters(options):
         #print "CLU",clu
         #print "linked",refine_linked(clu)
         #print '\n\n'
-    
+
         for cl in refine_linked(clu):
             rc = refine_cluster(cl,minratio, minreads)
             if len(rc) > 0:
@@ -220,45 +220,45 @@ def refine_clusters(options):
     fout.close()
 
 
-def merge_junctions(options):    
+def merge_junctions(options):
     ''' function to merge junctions '''
 
     outPrefix = options.outprefix
     rundir = options.rundir
     fnameout = "%s/%s"%(rundir,outPrefix)
     flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
-    
+
     lsts = []
     for ln in open(flist):
         lsts.append(ln.strip())
     if options.verbose:
         sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
-    
+
     # Change 300 if max open file is < 300
     N = min([300, max([100, int(len(lsts)**(0.5))])])
 
     tmpfiles = []
-    while len(lsts) > 1:    
+    while len(lsts) > 1:
         clst = []
-        
-        for i in range(0,(len(lsts)/N)+1): 
+
+        for i in range(0,(len(lsts)//N)+1):
             lst = lsts[N*i:N*(i+1)]
             if len(lst) > 0:
                 clst.append(lst)
         lsts = []
-    
+
         for lst in clst:
             if len(lst) == 0: continue
             tmpfile = tempfile.mktemp()
             os.mkdir(tmpfile)
             foutname = tmpfile+"/tmpmerge.gz"
             fout = gzip.open(foutname,'w')
-            
+
             merge_files(lst, fout, options)
             lsts.append(foutname)
             tmpfiles.append(foutname)
             fout.close()
-    
+
     shutil.move(lsts[0], fnameout+"_perind.counts.gz")
 
 def merge_files(fnames, fout, options):
@@ -274,12 +274,12 @@ def merge_files(fnames, fout, options):
     N = 0
     while not finished:
         N += 1
-        if N % 50000 == 0: 
+        if N % 50000 == 0:
             sys.stderr.write(".")
         buf = []
         for f in fopen:
             ln = f.readline().split()
-            if len(ln) == 0: 
+            if len(ln) == 0:
                 finished = True
                 break
             chrom = ln[0]
@@ -320,10 +320,10 @@ def cluster_intervals(E):
         i += 1
 
     if len(cluster) > 0:
-        
+
         Eclusters.append(cluster)
 
-    
+
     return Eclusters, E
 
 def overlaps(A,B):
@@ -343,7 +343,7 @@ def refine_linked(clusters):
     newClusters = []
     while len(unassigned) > 0:
         finished = False
-    
+
         while not finished:
             finished = True
             torm = []
@@ -370,7 +370,7 @@ def refine_linked(clusters):
 def refine_cluster(clu, cutoff, readcutoff):
     ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
     remove and recluster '''
-    
+
     remove = []
     dic = {}
     intervals = []
@@ -387,15 +387,15 @@ def refine_cluster(clu, cutoff, readcutoff):
         else:
             reCLU = True
     if len(intervals) == 0: return []
-    
+
     # This makes sure that after trimming, the clusters are still good
     Atmp, B = cluster_intervals(intervals)
     A = []
     for cl in Atmp:
         for c in refine_linked([(x,0) for x in cl]):
             if len(c) > 0:
                 A.append([x[0] for x in c])
-    
+
     if len(A) == 1:
         rc = [(x, dic[x]) for x in A[0]]
         if len(rc) > 1:
@@ -421,15 +421,15 @@ def get_numers(options):
     input_file=gzip.open(fname, 'rb')
     fout = gzip.open(fnameout,'w')
     first_line=True
-    
+
     for l in input_file:
         if first_line:
             fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
             first_line=False
         else:
             l=l.strip()
             words=l.split(" ")
-            
+
             fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
 
     input_file.close()
@@ -453,7 +453,7 @@ def get_numers(options):
 
     parser.add_option("-r", "--rundir", dest="rundir", default='./',
                   help="write to directory (default ./)")
-    
+
     parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
                   help="maximum intron length in bp (default 100,000bp)")
 
@@ -473,14 +473,14 @@ def get_numers(options):
     if options.juncfiles == None:
         sys.stderr.write("Error: no junction file provided...\n")
         exit(0)
-    
+
     # Get the junction file list
     libl = []
     for junc in open(options.juncfiles):
         junc = junc.strip()
         try:
             open(junc)
-        except: 
+        except:
             sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
             exit(0)
         libl.append(junc)