diff --git a/clustering/check_files.py b/clustering/check_files.py
index 626181d..86f62ba 100755
--- a/clustering/check_files.py
+++ b/clustering/check_files.py
@@ -29,7 +29,7 @@
     sys.stderr.write("%d junctions\n"%libs[lib])
 
 failed_junc = []
-threshold=max([len(x) for x in libChroms.values()])/1
+threshold=max([len(x) for x in list(libChroms.values())])//1
 for lib in libChroms:
     if len(libChroms[lib]) < threshold:
         failed_junc.append("rm "+lib)
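Porting note: Python 2's `/` truncates on ints, so the original `/1` was a no-op there; under Python 3 it would return a float, which is why the hunk above keeps integer semantics with `//1`. A minimal sketch of the difference, with hypothetical stand-in data for libChroms:

# Hypothetical per-library chromosome sets standing in for libChroms.
libChroms = {"libA.junc": {"chr1", "chr2", "chr3"}, "libB.junc": {"chr1"}}

threshold = max([len(x) for x in list(libChroms.values())]) // 1  # stays an int on py2 and py3
failed_junc = ["rm " + lib for lib in libChroms if len(libChroms[lib]) < threshold]
print(threshold, failed_junc)  # 3 ['rm libB.junc']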
diff --git a/clustering/get_cluster_gene.py b/clustering/get_cluster_gene.py
index 376671a..46abf74 100755
--- a/clustering/get_cluster_gene.py
+++ b/clustering/get_cluster_gene.py
@@ -36,10 +36,10 @@ def get_feature(fname, feature = "exon"):
 
 ss2gene = get_feature(sys.argv[1], "exon")
 
-W = file("%s.clu2gene.txt"%sys.argv[2].split("_perind")[0],'w')
+W = open("%s.clu2gene.txt"%sys.argv[2].split("_perind")[0],'w')
 
 for ln in gzip.open(sys.argv[2]):
     if "chrom" in ln: continue
-    
+
     if len(ln.split()[0].split(":")) == 5:
         chrom, A, B, clu, strand = ln.split()[0].split(":")
     else:
@@ -52,7 +52,7 @@ def get_feature(fname, feature = "exon"):
     if (chrom, int(B)) in ss2gene:
         gs.append(ss2gene[(chrom, int(B))])
-    
+
     if len(gs) > 0:
         W.write("%s %s %s %s %s %.2f %.2f %.2f %.2f\n"%(clu,chrom, A,B, gs[0], mean, median, minAS, maxAS))
     else:
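Porting note: `file()` → `open()` is handled above, but `gzip.open(sys.argv[2])` still defaults to binary mode under Python 3, where lines come back as bytes and `"chrom" in ln` raises TypeError. A self-contained sketch of the text-mode idiom the loop needs (the file name here is made up):

import gzip

# Build a tiny stand-in for the perind counts file read by this script.
with gzip.open("toy_perind.counts.gz", "wt") as w:
    w.write("chrom sample1\nchr1:100:200:clu_1_NA 5/10\n")

# 'rt' yields str lines; the default binary mode would yield bytes and
# break the "chrom" membership test used above.
with gzip.open("toy_perind.counts.gz", "rt") as f:
    for ln in f:
        if "chrom" in ln: continue
        print(ln.split()[0].split(":"))  # ['chr1', '100', '200', 'clu_1_NA']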
diff --git a/clustering/leafcutter_cluster.py b/clustering/leafcutter_cluster.py
index a4a2ae7..c348934 100755
--- a/clustering/leafcutter_cluster.py
+++ b/clustering/leafcutter_cluster.py
@@ -7,7 +7,7 @@
 import shutil
 
 def main(options,libl):
-    
+
     pool_junc_reads(libl, options)
     refine_clusters(options)
     sort_junctions(libl, options)
@@ -23,11 +23,11 @@ def pool_junc_reads(flist, options):
     useStrand = options.strand
 
     outFile = "%s/%s_pooled"%(rundir,outPrefix)
-    
+
     chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
     by_chrom = {}
     for libl in flist:
-        
+
         lib = libl.strip()
         if not os.path.isfile(lib): continue
@@ -38,24 +38,24 @@ def pool_junc_reads(flist, options):
         if lib[-3:] == ".gz": F = gzip.open(lib)
         else: F = open(lib)
         for ln in F:
-            
+
             lnsplit=ln.split()
-            if len(lnsplit)<6: 
+            if len(lnsplit)<6:
                 sys.stderr.write("Error in %s \n" % lib)
                 continue
             chrom, A, B, dot, counts, strand = lnsplit
-            
+
             if not useStrand: strand = "NA"
             if checkchrom and (chrom not in chromLst): continue
             A, B = int(A), int(B)+1
             if B-A > int(maxIntronLen): continue
             try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom, strand)][(A,B)]
-            except: 
+            except:
                 try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
                 except: by_chrom[(chrom, strand)] = {(A,B):int(counts)}
 
-    fout = file(outFile, 'w')
+    fout = open(outFile, 'w')
     Ncluster = 0
     sys.stderr.write("Parsing...\n")
     for chrom in by_chrom:
@@ -64,7 +64,7 @@ def pool_junc_reads(flist, options):
         sys.stderr.write("%s:%s.."%chrom)
         clu = cluster_intervals(read_ks)[0]
         for cl in clu:
-            if len(cl) > 1: # if cluster has more than one intron 
+            if len(cl) > 1: # if cluster has more than one intron
                 buf = '%s:%s '%chrom
                 for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
                     buf += "%d:%d" % interval + ":%d"%count+ " "
@@ -76,7 +76,7 @@
 
 def sort_junctions(libl, options):
 
-    chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] 
+    chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
     outPrefix = options.outprefix
     rundir = options.rundir
     refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
@@ -112,7 +112,7 @@
             merges[libN] = []
         merges[libN].append(lib)
 
-    fout_runlibs = file(runName+"_sortedlibs",'w')
+    fout_runlibs = open(runName+"_sortedlibs",'w')
 
     for libN in merges:
         libName = "%s/%s"%(rundir,libN.split('/')[-1])
@@ -121,16 +121,16 @@
         fout_runlibs.write(foutName+'\n')
 
-        if options.verbose: 
+        if options.verbose:
             sys.stderr.write("Sorting %s..\n"%libN)
         if len(merges[libN]) > 1:
-            if options.verbose: 
+            if options.verbose:
                 sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
         else: pass
 
         fout = gzip.open(foutName,'w')
-        fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
+        fout.write("chrom {junc}\n".format(junc=libN.split("/")[-1].split(".junc")[0]).encode())
         for lib in merges[libN]:
             if lib[-3:] == ".gz": F = gzip.open(lib)
@@ -138,7 +138,7 @@
             for ln in F:
                 lnsplit=ln.split()
-                if len(lnsplit)<6: 
+                if len(lnsplit)<6:
                     sys.stderr.write("Error in %s \n" % lib)
                     continue
                 chrom, start, end, dot, count, strand = ln.split()
@@ -167,23 +167,23 @@
             elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)]
             for exon in ks:
-                
+
                 chrom, start, end = exon
                 start, end = int(start), int(end)
                 chromID, strand = chrom
                 if chrom not in by_chrom:
                     buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
-                elif (start,end) in by_chrom[chrom]: 
+                elif (start,end) in by_chrom[chrom]:
                     buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
                 else:
                     buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
-            
-            fout.write("".join(buf))
+
+            fout.write("".join(buf).encode())
         fout.close()
     fout_runlibs.close()
 
 def refine_clusters(options):
-    
+
     outPrefix = options.outprefix
     rundir = options.rundir
     minratio = float(options.mincluratio)
@@ -192,7 +192,7 @@
     inFile = "%s/%s_pooled"%(rundir,outPrefix)
     outFile = "%s/%s_refined"%(rundir,outPrefix)
 
-    fout = file(outFile,'w')
+    fout = open(outFile,'w')
     Ncl = 0
     for ln in open(inFile):
         clu = []
@@ -206,7 +206,7 @@
         #print "CLU",clu
         #print "linked",refine_linked(clu)
         #print '\n\n'
-        
+
         for cl in refine_linked(clu):
             rc = refine_cluster(cl,minratio, minreads)
             if len(rc) > 0:
@@ -220,45 +220,45 @@
     fout.close()
 
-def merge_junctions(options): 
+def merge_junctions(options):
     ''' function to merge junctions '''
 
     outPrefix = options.outprefix
     rundir = options.rundir
     fnameout = "%s/%s"%(rundir,outPrefix)
     flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
-    
+
     lsts = []
     for ln in open(flist):
         lsts.append(ln.strip())
     if options.verbose:
         sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
-    
+
     # Change 300 if max open file is < 300
     N = min([300, max([100, int(len(lsts)**(0.5))])])
 
     tmpfiles = []
-    while len(lsts) > 1: 
+    while len(lsts) > 1:
         clst = []
-        
-        for i in range(0,(len(lsts)/N)+1): 
+
+        for i in range(0,(len(lsts)//N)+1):
            lst = lsts[N*i:N*(i+1)]
            if len(lst) > 0:
                clst.append(lst)
         lsts = []
-        
+
         for lst in clst:
             if len(lst) == 0: continue
             tmpfile = tempfile.mktemp()
             os.mkdir(tmpfile)
             foutname = tmpfile+"/tmpmerge.gz"
             fout = gzip.open(foutname,'w')
-            
+
             merge_files(lst, fout, options)
             lsts.append(foutname)
             tmpfiles.append(foutname)
             fout.close()
-    
+
     shutil.move(lsts[0], fnameout+"_perind.counts.gz")
 
 def merge_files(fnames, fout, options):
@@ -274,12 +274,12 @@ def merge_files(fnames, fout, options):
     N = 0
     while not finished:
         N += 1
-        if N % 50000 == 0: 
+        if N % 50000 == 0:
             sys.stderr.write(".")
         buf = []
         for f in fopen:
             ln = f.readline().split()
-            if len(ln) == 0: 
+            if len(ln) == 0:
                 finished = True
                 break
             chrom = ln[0]
@@ -320,10 +320,10 @@ def cluster_intervals(E):
         i += 1
 
     if len(cluster) > 0:
-        
+
         Eclusters.append(cluster)
-    
+
     return Eclusters, E
 
 def overlaps(A,B):
@@ -343,7 +343,7 @@ def refine_linked(clusters):
     newClusters = []
     while len(unassigned) > 0:
         finished = False
-        
+
         while not finished:
             finished = True
             torm = []
@@ -370,7 +370,7 @@ def refine_linked(clusters):
 
 def refine_cluster(clu, cutoff, readcutoff):
     ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster '''
-    
+
     remove = []
     dic = {}
     intervals = []
@@ -387,7 +387,7 @@ def refine_cluster(clu, cutoff, readcutoff):
         else:
             reCLU = True
     if len(intervals) == 0: return []
-    
+
     # This makes sure that after trimming, the clusters are still good
     Atmp, B = cluster_intervals(intervals)
     A = []
@@ -395,7 +395,7 @@ def refine_cluster(clu, cutoff, readcutoff):
         for c in refine_linked([(x,0) for x in cl]):
             if len(c) > 0: A.append([x[0] for x in c])
-    
+
     if len(A) == 1:
         rc = [(x, dic[x]) for x in A[0]]
         if len(rc) > 1:
@@ -421,7 +421,7 @@ def get_numers(options):
     input_file=gzip.open(fname, 'rb')
     fout = gzip.open(fnameout,'w')
     first_line=True
-    
+
     for l in input_file:
         if first_line:
             fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
@@ -429,7 +429,7 @@ def get_numers(options):
         else:
             l=l.strip()
             words=l.split(" ")
-            
+
             fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
 
     input_file.close()
@@ -453,7 +453,7 @@
     parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)")
-    
+
     parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)")
@@ -473,14 +473,14 @@
     if options.juncfiles == None:
         sys.stderr.write("Error: no junction file provided...\n")
         exit(0)
-    
+
     # Get the junction file list
     libl = []
     for junc in open(options.juncfiles):
         junc = junc.strip()
         try:
             open(junc)
-        except: 
+        except:
             sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
             exit(0)
         libl.append(junc)
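Porting note: the same bytes-vs-str caveat applies to `gzip.open(lib)` in pool_junc_reads/sort_junctions and to get_numers, which still writes str to a gzip handle opened in binary mode; only the two writes given `.encode()` above are covered. The other Python 3 trap in this file is `range()` rejecting floats, which is why merge_junctions now batches with `len(lsts)//N`. A small sketch of that batching strategy (names illustrative, not the script's own):

def batches(paths, N):
    # Mirror lsts[N*i:N*(i+1)]: at most N files per merge pass, and
    # len(paths)//N keeps the loop bound an int so range() accepts it.
    groups = [paths[N*i:N*(i+1)] for i in range((len(paths)//N)+1)]
    return [g for g in groups if g]

print(batches(["lib%d.sorted.gz" % i for i in range(7)], 3))  # three groups: 3 + 3 + 1 files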
diff --git a/example_data/run_ds.py b/example_data/run_ds.py
index 988afa0..2d6a90c 100644
--- a/example_data/run_ds.py
+++ b/example_data/run_ds.py
@@ -25,12 +25,12 @@ def run(cmd, max_minutes = 6000):
         r += file_stdout.read()
         e += file_stderr.read()
-    
+
     file_stdin.close()
     #lines = file_stdout.read()
     lines_stderr = file_stderr.read()
     exit_code = file_stdout.close()
-    
+
     file_stdout.close()
     file_stderr.close()
     return (r, e, exit_code)
@@ -61,7 +61,7 @@ def run(cmd, max_minutes = 6000):
     parser.add_option("-d", "--leafdir", dest="leafd", default='./', help="LeafCutter directory")
-    
+
     parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)")
@@ -80,7 +80,7 @@ def run(cmd, max_minutes = 6000):
     parser.add_option("-B", "--B", dest="bamB", help="bam files from condition B.")
-    
+
     (options, args) = parser.parse_args()
@@ -103,7 +103,7 @@ def run(cmd, max_minutes = 6000):
     # check samtools
     sys.stderr.write("processing bam files...\n")
 
-    fout = file("%s/junction_files.txt"%options.tmpdir,'w')
+    fout = open("%s/junction_files.txt"%options.tmpdir,'w')
     for bam in bamA+bamB:
         bedfile = "%s/%s.bed"%(options.tmpdir,bam.split('/')[-1])
         juncfile = "%s/%s.junc"%(options.tmpdir,bam.split('/')[-1])
@@ -113,26 +113,25 @@ def run(cmd, max_minutes = 6000):
         else:
             sys.stderr.write("%s exists..skipping\n"%juncfile)
             continue
-        print run("samtools view %s | python %s/scripts/filter_cs.py | %s/scripts/sam2bed.pl --use-RNA-strand - %s"%(bam, options.leafd, options.leafd,bedfile))[1]
-        print run("%s/scripts/bed2junc.pl %s %s; rm %s"%(options.leafd,bedfile,juncfile, bedfile))[1]
-        
+        print(run("samtools view %s | python %s/scripts/filter_cs.py | %s/scripts/sam2bed.pl --use-RNA-strand - %s"%(bam, options.leafd, options.leafd,bedfile))[1])
+        print(run("%s/scripts/bed2junc.pl %s %s; rm %s"%(options.leafd,bedfile,juncfile, bedfile))[1])
+
     fout.close()
-    
-    print run("python %s/clustering/leafcutter_cluster.py -j %s/junction_files.txt -m %s -o %s -l %s -r %s -p %s"%(options.leafd,options.tmpdir,options.minclureads, options.outprefix,str(options.maxintronlen), options.tmpdir,str(options.mincluratio)))[1]
+
+    print(run("python %s/clustering/leafcutter_cluster.py -j %s/junction_files.txt -m %s -o %s -l %s -r %s -p %s"%(options.leafd,options.tmpdir,options.minclureads, options.outprefix,str(options.maxintronlen), options.tmpdir,str(options.mincluratio)))[1])
 
     if options.annotation != None:
-        print run("python %s/clustering/get_cluster_gene.py %s %s/%s_perind.counts.gz"%(options.leafd,options.annotation, options.tmpdir,options.outprefix))[1]
-
-
-
-
-    fout = file("%s/ds_test"%(options.tmpdir),'w')
+        print(run("python %s/clustering/get_cluster_gene.py %s %s/%s_perind.counts.gz"%(options.leafd,options.annotation, options.tmpdir,options.outprefix))[1])
+
+
+
+
+    fout = open("%s/ds_test"%(options.tmpdir),'w')
     for bam in bamA:
         fout.write("%s group1\n"%bam.split("/")[-1])
     for bam in bamB:
         fout.write("%s group2\n"%bam.split("/")[-1])
     fout.close()
-    
-    print run("Rscript %s/scripts/leafcutter_ds.R --num_threads 1 -i 3 %s/%s_perind_numers.counts.gz %s/ds_test"%(options.leafd,options.tmpdir,options.outprefix,options.tmpdir))[1]
+    print(run("Rscript %s/scripts/leafcutter_ds.R --num_threads 1 -i 3 %s/%s_perind_numers.counts.gz %s/ds_test"%(options.leafd,options.tmpdir,options.outprefix,options.tmpdir))[1])
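Porting note: converting `print` statements to `print()` calls is only part of modernizing this script; the run() helper's file_stdin/file_stdout/file_stderr handles look like os.popen3 usage (the call itself sits outside these hunks), and os.popen3 no longer exists in Python 3. If that is indeed what run() wraps, a subprocess-based sketch with the same (r, e, exit_code) shape might be:

import subprocess

def run(cmd, max_minutes=6000):
    # shell=True mirrors the shell pipelines these scripts build;
    # text=True returns str rather than bytes.
    p = subprocess.run(cmd, shell=True, capture_output=True, text=True,
                       timeout=max_minutes*60)
    return (p.stdout, p.stderr, p.returncode)

print(run("echo hello")[1])  # same shape as the scripts' print(run(...)[1])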
diff --git a/example_data/run_sQTL.py b/example_data/run_sQTL.py
index d31886a..88ed82b 100644
--- a/example_data/run_sQTL.py
+++ b/example_data/run_sQTL.py
@@ -23,12 +23,12 @@ def run(cmd, max_minutes = 6000):
         e += file_stderr.read()
         r += file_stdout.read()
         e += file_stderr.read()
-    
+
     file_stdin.close()
     #lines = file_stdout.read()
     lines_stderr = file_stderr.read()
     exit_code = file_stdout.close()
-    
+
     file_stdout.close()
     file_stderr.close()
     return (r, e, exit_code)
@@ -49,7 +49,7 @@ def run(cmd, max_minutes = 6000):
     parser.add_option("-d", "--leafdir", dest="leafd", default='./', help="Top-level LeafCutter directory")
-    
+
     parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="Maximum intron length in bp (default 100,000bp)")
@@ -64,7 +64,7 @@ def run(cmd, max_minutes = 6000):
     parser.add_option("-b", "--bams", dest="bam", help="Text file listing bam files to quantify")
-    
+
     (options, args) = parser.parse_args()
 
     try: open(options.leafd+"/clustering/leafcutter_cluster.py")
@@ -85,7 +85,7 @@ def run(cmd, max_minutes = 6000):
     # (should check if samtools are installed)
     sys.stderr.write("processing bam files...\n")
 
-    fout = file("%s/junction_files.txt"%options.tmpdir,'w')
+    fout = open("%s/junction_files.txt"%options.tmpdir,'w')
     for bam in bams:
         bam = bam.strip()
         bedfile = "%s/%s.bed"%(options.tmpdir,bam.split('/')[-1])
@@ -96,19 +96,19 @@ def run(cmd, max_minutes = 6000):
         else:
             sys.stderr.write("%s exists..skipping\n"%juncfile)
             continue
-        print run("samtools view %s | python %s/scripts/filter_cs.py | %s/scripts/sam2bed.pl --use-RNA-strand - %s"%(bam, options.leafd, options.leafd,bedfile))[1]
-        print run("%s/scripts/bed2junc.pl %s %s; rm %s"%(options.leafd,bedfile,juncfile, bedfile))[1]
-        
+        print(run("samtools view %s | python %s/scripts/filter_cs.py | %s/scripts/sam2bed.pl --use-RNA-strand - %s"%(bam, options.leafd, options.leafd,bedfile))[1])
+        print(run("%s/scripts/bed2junc.pl %s %s; rm %s"%(options.leafd,bedfile,juncfile, bedfile))[1])
+
     fout.close()
-    
-    print run("python %s/clustering/leafcutter_cluster.py -j %s/junction_files.txt -m %s -o %s -l %s -r %s -p %s"%(options.leafd,options.tmpdir,options.minclureads, options.outprefix,str(options.maxintronlen), options.tmpdir,str(options.mincluratio)))[1]
+
+    print(run("python %s/clustering/leafcutter_cluster.py -j %s/junction_files.txt -m %s -o %s -l %s -r %s -p %s"%(options.leafd,options.tmpdir,options.minclureads, options.outprefix,str(options.maxintronlen), options.tmpdir,str(options.mincluratio)))[1])
 
     if options.annotation != None:
-        print run("python %s/clustering/get_cluster_gene.py %s %s/%s_perind.counts.gz"%(options.leafd,options.annotation, options.tmpdir,options.outprefix))[1]
+        print(run("python %s/clustering/get_cluster_gene.py %s %s/%s_perind.counts.gz"%(options.leafd,options.annotation, options.tmpdir,options.outprefix))[1])
         pass
-    
-    print run("python %s/scripts/prepare_phenotype_table.py %s/%s_perind.counts.gz"%(options.leafd,options.tmpdir,options.outprefix))
+
+    print(run("python %s/scripts/prepare_phenotype_table.py %s/%s_perind.counts.gz"%(options.leafd,options.tmpdir,options.outprefix)))
 
     sys.stdout.write("\n*******fastQTL instructions (also see http://fastqtl.sourceforge.net/) *******\n")
     sys.stdout.write("\n(1) Prepare phenotypes: Use `sh %s/%s_perind.counts.gz_prepare.sh' to create index for fastQTL (requires tabix and bgzip).\n"%(options.tmpdir,options.outprefix))
diff --git a/leafviz/prepare_results.R b/leafviz/prepare_results.R
index e5e1684..0b7c1c6 100755
--- a/leafviz/prepare_results.R
+++ b/leafviz/prepare_results.R
@@ -405,6 +405,8 @@
 all.clusters$N <- results$N[ match( all.clusters$clusterID, results$clusterID)]
 all.clusters$verdict <- unlist(classification.list)[ match(all.clusters$clusterID, names(classification.list))]
 # prepare for PCA
+print(colnames(counts))
+print(meta$sample)
 counts <- counts[,meta$sample]
 print( "converting counts to ratios")
 # create per cluster ratios from counts
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: - + for ln in open(lib): lnsplit=ln.split() - if len(lnsplit)<6: + if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, count, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: - print ln + print(ln) continue - + if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 @@ -156,12 +156,12 @@ def sort_junctions(libl, options): if chrom not in by_chrom: by_chrom[chrom] = {} intron = (A, B) - + if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) - + for clu in cluExons: buf = [] ks = cluExons[clu] @@ -176,7 +176,7 @@ def sort_junctions(libl, options): elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: - + chrom, start, end = exon start, end = int(start), int(end) chrom = tuple(chrom.split(":")) @@ -187,13 +187,13 @@ def sort_junctions(libl, options): buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) - + fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): - + outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) @@ -202,7 +202,7 @@ def refine_clusters(options): inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) - fout = file(outFile,'w') + fout = open(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] @@ -216,7 +216,7 @@ def refine_clusters(options): #print "CLU",clu #print "linked",refine_linked(clu) #print '\n\n' - + for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: @@ -230,45 +230,45 @@ def refine_clusters(options): fout.close() -def merge_junctions(options): +def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) - + lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) - + # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] - while len(lsts) > 1: + while len(lsts) > 1: clst = [] - - for i in range(0,(len(lsts)/N)+1): + + for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] - + for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') - + merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() - + shutil.move(lsts[0], fnameout+"_perind.counts.gz") def merge_files(fnames, fout, options): @@ -284,12 +284,12 @@ def merge_files(fnames, fout, options): N = 0 while not finished: N += 1 - if N % 50000 == 0: + if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() - if len(ln) == 0: + if len(ln) == 0: finished = True break chrom = ln[0] @@ -328,10 +328,10 @@ def cluster_intervals(E): i += 1 if len(cluster) > 0: - + Eclusters.append(cluster) - + return Eclusters, E def overlaps(A,B): @@ -351,7 +351,7 @@ def refine_linked(clusters): newClusters = [] while len(unassigned) > 0: finished = 
-        
+
         while not finished:
             finished = True
             torm = []
@@ -378,7 +378,7 @@ def refine_linked(clusters):
 
 def refine_cluster(clu, cutoff, readcutoff):
     ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster '''
-    
+
     remove = []
     dic = {}
     intervals = []
@@ -395,7 +395,7 @@ def refine_cluster(clu, cutoff, readcutoff):
         else:
             reCLU = True
     if len(intervals) == 0: return []
-    
+
     # This makes sure that after trimming, the clusters are still good
     Atmp, B = cluster_intervals(intervals)
     A = []
@@ -403,7 +403,7 @@ def refine_cluster(clu, cutoff, readcutoff):
         for c in refine_linked([(x,0) for x in cl]):
             if len(c) > 0: A.append([x[0] for x in c])
-    
+
     if len(A) == 1:
         rc = [(x, dic[x]) for x in A[0]]
         if len(rc) > 1:
@@ -429,7 +429,7 @@ def get_numers(options):
     input_file=gzip.open(fname, 'rb')
     fout = gzip.open(fnameout,'w')
     first_line=True
-    
+
     for l in input_file:
         if first_line:
             fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
@@ -437,7 +437,7 @@ def get_numers(options):
         else:
             l=l.strip()
             words=l.split(" ")
-            
+
             fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
 
     input_file.close()
@@ -461,7 +461,7 @@
     parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)")
-    
+
     parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)")
@@ -482,14 +482,14 @@
     if options.juncfiles == None:
         sys.stderr.write("Error: no junction file provided...\n")
         exit(0)
-    
+
     # Get the junction file list
     libl = []
     for junc in open(options.juncfiles):
         junc = junc.strip()
         try:
             open(junc)
-        except: 
+        except:
             sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
             exit(0)
         libl.append(junc)
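Porting note: this regtools variant originally missed two fixes its clustering twin received, so they are aligned above: `len(lsts)/N` becomes `len(lsts)//N` (a float bound makes range() raise TypeError), and the two sort_junctions writes gain `.encode()`, on the assumption that foutName is still opened with `gzip.open(foutName,'w')` as in clustering/leafcutter_cluster.py. The underlying rule, in isolation:

import gzip

# gzip.open(..., 'w') is binary under Python 3: str payloads must be
# encoded, or the handle opened in text mode ('wt') instead.
with gzip.open("demo.gz", "w") as fout:
    fout.write("chrom sample1\n".encode())

with gzip.open("demo.gz", "rt") as fin:
    print(fin.read(), end="")  # chrom sample1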
diff --git a/scripts/prepare_phenotype_table.py b/scripts/prepare_phenotype_table.py
index ad0c9ad..844abd1 100755
--- a/scripts/prepare_phenotype_table.py
+++ b/scripts/prepare_phenotype_table.py
@@ -7,7 +7,7 @@
 import pickle
 from optparse import OptionParser
-    
+
 from sklearn.decomposition import PCA
 from sklearn import preprocessing
 from sklearn import linear_model
@@ -36,17 +36,17 @@ def stream_table(f, ss = ''):
         yield attr
 
 def main(ratio_file, pcs=50):
-    
+
     dic_pop, fout = {}, {}
     try: open(ratio_file)
-    except: 
+    except:
         sys.stderr.write("Can't find %s..exiting\n"%(ratio_file))
         return
     sys.stderr.write("Starting...\n")
     for i in range(1,23):
-        fout[i] = file(ratio_file+".phen_chr%d"%i,'w')
-    fout_ave = file(ratio_file+".ave",'w')
+        fout[i] = open(ratio_file+".phen_chr%d"%i,'w')
+    fout_ave = open(ratio_file+".ave",'w')
     valRows, valRowsnn, geneRows = [], [], []
     finished = False
     header = gzip.open(ratio_file).readline().split()[1:]
@@ -67,7 +67,7 @@ def main(ratio_file, pcs=50):
         for sample in header:
             try: count = dic[sample]
-            except: print chrom, len(dic)
+            except: print(chrom, len(dic))
             num, denom = count.split('/')
             if float(denom) < 1:
                 count = "NA"
@@ -76,7 +76,7 @@ def main(ratio_file, pcs=50):
             else:
                 # add a 0.5 pseudocount
                 count = (float(num)+0.5)/((float(denom))+0.5)
-                tmpvalRow.append(count) 
+                tmpvalRow.append(count)
                 aveReads.append(count)
@@ -95,46 +95,46 @@ def main(ratio_file, pcs=50):
         if np.std(valRow) < 0.005: continue
         chr_, s, e, clu = chrom.split(":")
-        if len(valRow) > 0: 
+        if len(valRow) > 0:
             chrom_int = int(chr_)
             fout[chrom_int].write("\t".join([chr_,s,e,chrom]+[str(x) for x in valRow])+'\n')
             fout_ave.write(" ".join(["%s"%chrom]+[str(min(aveReads)), str(max(aveReads)), str(np.mean(aveReads))])+'\n')
 
             # scale normalize
-            valRowsnn.append(valRow) 
+            valRowsnn.append(valRow)
             valRow = preprocessing.scale(valRow)
 
             valRows.append(valRow)
             geneRows.append("\t".join([chr_,s,e,chrom]))
             if len(geneRows) % 1000 == 0:
                 sys.stderr.write("Parsed %s introns...\n"%len(geneRows))
-    
+
     for i in fout:
         fout[i].close()
 
     # qqnorms on the columns
     matrix = np.array(valRows)
-    for i in xrange(len(matrix[0,:])):
+    for i in range(len(matrix[0,:])):
         matrix[:,i] = qqnorm(matrix[:,i])
-    
+
     # write the corrected tables
     fout = {}
     for i in range(1,23):
         fn="%s.qqnorm_chr%d"%(ratio_file,i)
-        print("Outputting: " + fn)
-        fout[i] = file(fn,'w')
+        print(("Outputting: " + fn))
+        fout[i] = open(fn,'w')
         fout[i].write("\t".join(['#Chr','start','end','ID'] + header)+'\n')
 
     lst = []
-    for i in xrange(len(matrix)):
+    for i in range(len(matrix)):
         chrom, s = geneRows[i].split()[:2]
-        
+
         lst.append((int(chrom.replace("chr","")), int(s), "\t".join([geneRows[i]] + [str(x) for x in matrix[i]])+'\n'))
     lst.sort()
     for ln in lst:
         fout[ln[0]].write(ln[2])
-    
-    fout_run = file("%s_prepare.sh"%ratio_file,'w')
+
+    fout_run = open("%s_prepare.sh"%ratio_file,'w')
     for i in fout:
         fout[i].close()
@@ -147,14 +147,14 @@ def main(ratio_file, pcs=50):
     if pcs>0:
         #matrix = np.transpose(matrix) # important bug fix (removed as of Jan 1 2018)
         pcs = min([len(header), pcs])
-        pca = PCA(n_components=pcs) 
-        pca.fit(matrix) 
+        pca = PCA(n_components=pcs)
+        pca.fit(matrix)
         pca_fn=ratio_file+".PCs"
-        print("Outputting PCs: " + pca_fn)
-        pcafile = file(pca_fn,'w')
+        print(("Outputting PCs: " + pca_fn))
+        pcafile = open(pca_fn,'w')
         pcafile.write("\t".join(['id']+header)+'\n')
         pcacomp = list(pca.components_)
-        
+
         for i in range(len(pcacomp)):
             pcafile.write("\t".join([str(i+1)]+[str(x) for x in pcacomp[i]])+'\n')
@@ -169,4 +169,3 @@ def main(ratio_file, pcs=50):
         sys.stderr.write("Error: no ratio file provided... (e.g. python leafcutter/scripts/prepare_phenotype_table.py input_perind.counts.gz\n")
         exit(0)
     main(args[0], int(options.npcs) )
-
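Porting note: `xrange` → `range` and `file()` → `open()` complete this file; the doubled parentheses in `print((...))` are a harmless 2to3 artifact, and `header = gzip.open(ratio_file).readline()...` still returns bytes under Python 3, so the same text-mode ('rt') fix noted earlier applies here too. The per-column loop applies a rank-based normal transform; the script's qqnorm() is defined outside these hunks, but the standard recipe it presumably follows looks like this (sketch):

import numpy as np
from scipy.stats import norm

def qqnorm(x):
    # Map ranks to standard-normal quantiles (tie handling omitted for brevity).
    ranks = np.argsort(np.argsort(x)) + 1.0
    return norm.ppf(ranks / (len(x) + 1.0))

matrix = np.random.rand(6, 3)
for i in range(len(matrix[0, :])):  # range(): xrange is gone in Python 3
    matrix[:, i] = qqnorm(matrix[:, i])
print(np.round(matrix, 2))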