diff --git a/.gitignore b/.gitignore index 26bbb14..adc504b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ GRCh37.primary_assembly.genome.fa.gz GRCh37.primary_assembly.genome.fa.gz.fxi +pangolin/__pycache__ +tests/__pycache__ diff --git a/README.md b/README.md index 912ddb9..c23b351 100755 --- a/README.md +++ b/README.md @@ -7,19 +7,17 @@ Pangolin can be run on Google Colab, which provides free acess to GPUs and other See below for information on usage and local installation. ### Installation -* Prerequisites: Python 3.6 or higher and conda, which can both be installed using Miniconda: https://docs.conda.io/en/latest/miniconda.html -* Install PyTorch: https://pytorch.org/get-started/locally/ - * If a supported GPU is available, installation with GPU support is recommended (choose an option under "Compute Platform") -* Install other dependencies: - ``` - conda install -c conda-forge pyvcf - pip install gffutils biopython pandas pyfastx - ``` +* Prerequisites: Python 3.8 or higher +* Poetry: See https://python-poetry.org/docs/#installation * Install Pangolin: ``` - git clone https://github.com/tkzeng/Pangolin.git + git clone https://github.com/invitae/Pangolin.git cd Pangolin - pip install . + poetry install + ``` +* Activate env + ``` + poetry shell ``` ### Usage (command-line) @@ -52,13 +50,13 @@ See below for information on usage and local installation. ``` See full options below: ``` - usage: pangolin [-h] [-c COLUMN_IDS] [-m {False,True}] [-s SCORE_CUTOFF] [-d DISTANCE] variant_file reference_file annotation_file output_file + usage: pangolin [-h] [-c COLUMN_IDS] [-m {False,True}] [-s SCORE_CUTOFF] [-d DISTANCE] [-b BATCH_SIZE] [-v] variant_file reference_file annotation_file output_file positional arguments: variant_file VCF or CSV file with a header (see COLUMN_IDS option). reference_file FASTA file containing a reference genome sequence. annotation_file gffutils database file. Can be generated using create_db.py. - output_file Prefix for output file. Will be a VCF/CSV if variant_file is VCF/CSV. + output_file Name of output file. optional arguments: -h, --help show this help message and exit @@ -70,12 +68,44 @@ See below for information on usage and local installation. Output all sites with absolute predicted change in score >= cutoff, instead of only the maximum loss/gain sites. -d DISTANCE, --distance DISTANCE Number of bases on either side of the variant for which splice scores should be calculated. (Default: 50) + -b BATCH_SIZE, --batch_size BATCH_SIZE + Number of variants to batch together (Default: 0). Use this to improve GPU optimization + -v, --verbose Enable additional debugging output + --enable_gtf_cache Enable caching of GTF database into memory ``` ### Usage (custom) See `scripts/custom_usage.py` +### Batching Support + +Invitae added batching support in April 2023 to get better GPU optimization. Variants are read in batches and then distributed into collections by tensor sizes and then run through the GPU in larger batches. +After batches are run, data is put back together in the original order and written to disk. You can control the batching via the `-b` parameter documented above. + +![Batching](docs/Pangolin_Batching_Indexing.png) + +### GTF DB Caching + +If you are running a larger batch of variants, you can gain additional performance by caching the gtf database into memory. +You can enable this behavior with `--enable_gtf_cache`. 
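+For example, a hypothetical invocation that combines batching with the GTF cache (the `-b` value and all file names below are placeholders, not files shipped with this repo):
+
+```
+pangolin -b 128 --enable_gtf_cache variants.vcf GRCh37.primary_assembly.genome.fa gencode.db scored.vcf
+```
+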
With this enabled, it'll dump the SQLite database into memory using +interval trees for the gene information for quick lookups without hitting the disk. + +## Testing + +There are unit tests available that run some small scale sets of predictions using data on chromosome 19, see details in +the tests about how the data was generated. + +``` +poetry run pytest +``` + +Testing with coverage + +``` +poetry run coverage run --source=pangolin -m pytest && poetry run coverage report -m +``` + ### Citation If you use Pangolin, please cite: diff --git a/docs/Pangolin_Batching_Indexing.png b/docs/Pangolin_Batching_Indexing.png new file mode 100644 index 0000000..adb0c34 Binary files /dev/null and b/docs/Pangolin_Batching_Indexing.png differ diff --git a/docs/Pangolin_Batching_Overview.png b/docs/Pangolin_Batching_Overview.png new file mode 100644 index 0000000..368be76 Binary files /dev/null and b/docs/Pangolin_Batching_Overview.png differ diff --git a/pangolin/.fuse_hidden0000252700000002 b/pangolin/.fuse_hidden0000252700000002 deleted file mode 100644 index 6c2d773..0000000 --- a/pangolin/.fuse_hidden0000252700000002 +++ /dev/null @@ -1,257 +0,0 @@ -import argparse -from pkg_resources import resource_filename -from pangolin.model import * -import vcf -import gffutils -import pandas as pd -import pyfastx -# import time -# startTime = time.time() - -IN_MAP = np.asarray([[0, 0, 0, 0], - [1, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 1, 0], - [0, 0, 0, 1]]) - - -def one_hot_encode(seq, strand): - seq = seq.upper().replace('A', '1').replace('C', '2') - seq = seq.replace('G', '3').replace('T', '4').replace('N', '0') - if strand == '+': - seq = np.asarray(list(map(int, list(seq)))) - elif strand == '-': - seq = np.asarray(list(map(int, list(seq[::-1])))) - seq = (5 - seq) % 5 # Reverse complement - return IN_MAP[seq.astype('int8')] - - -def compute_score(ref_seq, alt_seq, strand, d, models): - ref_seq = one_hot_encode(ref_seq, strand).T - ref_seq = torch.from_numpy(np.expand_dims(ref_seq, axis=0)).float() - alt_seq = one_hot_encode(alt_seq, strand).T - alt_seq = torch.from_numpy(np.expand_dims(alt_seq, axis=0)).float() - - if torch.cuda.is_available(): - ref_seq = ref_seq.to(torch.device("cuda")) - alt_seq = alt_seq.to(torch.device("cuda")) - - pangolin = [] - for j in range(4): - score = [] - for model in models[3*j:3*j+3]: - with torch.no_grad(): - ref = model(ref_seq)[0][[1,4,7,10][j],:].cpu().numpy() - alt = model(alt_seq)[0][[1,4,7,10][j],:].cpu().numpy() - if strand == '-': - ref = ref[::-1] - alt = alt[::-1] - l = 2*d+1 - ndiff = np.abs(len(ref)-len(alt)) - if len(ref)>len(alt): - alt = np.concatenate([alt[0:l//2+1],np.zeros(ndiff),alt[l//2+1:]]) - elif len(ref) pos or gene[4] < pos: - continue - gene_id = gene["gene_id"][0] - exons = [] - for exon in gtf.children(gene, featuretype="exon"): - exons.extend([exon[3], exon[4]]) - if gene[6] == '+': - genes_pos[gene_id] = exons - elif gene[6] == '-': - genes_neg[gene_id] = exons - - return (genes_pos, genes_neg) - - -def process_variant(lnum, chr, pos, ref, alt, gtf, models, args): - d = args.distance - cutoff = args.score_cutoff - - if len(set("ACGT").intersection(set(ref))) == 0 or len(set("ACGT").intersection(set(alt))) == 0 \ - or (len(ref) != 1 and len(alt) != 1 and len(ref) != len(alt)): - print("[Line %s]" % lnum, "WARNING, skipping variant: Variant format not supported.") - return -1 - elif len(ref) > 2*d: - print("[Line %s]" % lnum, "WARNING, skipping variant: Deletion too large") - return -1 - - fasta = pyfastx.Fasta(args.reference_file) - # try to 
make vcf chromosomes compatible with reference chromosomes - if chr not in fasta.keys() and "chr"+chr in fasta.keys(): - chr = "chr"+chr - elif chr not in fasta.keys() and chr[3:] in fasta.keys(): - chr = chr[3:] - - try: - seq = fasta[chr][pos-5001-d:pos+len(ref)+4999+d].seq - except Exception as e: - print(e) - print("[Line %s]" % lnum, "WARNING, skipping variant: Could not get sequence, possibly because the variant is too close to chromosome ends. " - "See error message above.") - return -1 - - if seq[5000+d:5000+d+len(ref)] != ref: - print("[Line %s]" % lnum, "WARNING, skipping variant: Mismatch between FASTA (ref base: %s) and variant file (ref base: %s)." - % (seq[5000+d:5000+d+len(ref)], ref)) - return -1 - - ref_seq = seq - alt_seq = seq[:5000+d] + alt + seq[5000+d+len(ref):] - - # get genes that intersect variant - genes_pos, genes_neg = get_genes(chr, pos, gtf) - if len(genes_pos)+len(genes_neg)==0: - print("[Line %s]" % lnum, "WARNING, skipping variant: Variant not contained in a gene body. Do GTF/FASTA chromosome names match?") - return -1 - - # get splice scores - loss_pos, gain_pos = None, None - if len(genes_pos) > 0: - loss_pos, gain_pos = compute_score(ref_seq, alt_seq, '+', d, models) - loss_neg, gain_neg = None, None - if len(genes_neg) > 0: - loss_neg, gain_neg = compute_score(ref_seq, alt_seq, '-', d, models) - - scores = "" - for (genes, loss, gain) in \ - ((genes_pos,loss_pos,gain_pos),(genes_neg,loss_neg,gain_neg)): - for gene, positions in genes.items(): - warnings = "Warnings:" - - if args.mask == "True" and len(positions) != 0: - positions = np.array(positions) - positions = positions - (pos - d) - - positions_filt = positions[(positions>=0) & (positions=cutoff)[0] - for p, s in zip(np.concatenate([g-d,l-d]), np.concatenate([gain[g],loss[l]])): - scores += "%s:%s|" % (p, round(s,2)) - - else: - scores = scores+gene+'|' - l, g = np.argmin(loss), np.argmax(gain), - scores += "%s:%s|%s:%s|" % (g-d, round(gain[g],2), l-d, round(loss[l],2)) - - scores += warnings - - return scores.strip('|') - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("variant_file", help="VCF or CSV file with a header (see COLUMN_IDS option).") - parser.add_argument("reference_file", help="FASTA file containing a reference genome sequence.") - parser.add_argument("annotation_file", help="gffutils database file. Can be generated using create_db.py.") - parser.add_argument("output_file", help="Prefix for output file. Will be a VCF/CSV if variant_file is VCF/CSV.") - parser.add_argument("-c", "--column_ids", default="CHROM,POS,REF,ALT", help="(If variant_file is a CSV) Column IDs for: chromosome, variant position, reference bases, and alternative bases. " - "Separate IDs by commas. (Default: CHROM,POS,REF,ALT)") - parser.add_argument("-m", "--mask", default="True", choices=["False","True"], help="If True, splice gains (increases in score) at annotated splice sites and splice losses (decreases in score) at unannotated splice sites will be set to 0. (Default: True)") - parser.add_argument("-s", "--score_cutoff", type=float, help="Output all sites with absolute predicted change in score >= cutoff, instead of only the maximum loss/gain sites.") - parser.add_argument("-d", "--distance", type=int, default=50, help="Number of bases on either side of the variant for which splice scores should be calculated. 
(Default: 50)") - #parser.add_argument("--score_exons", default="False", choices=["False","True"], help="Output changes in score for both splice sites of annotated exons, as long as one splice site is within the considered range (specified by -d). Output will be: gene|site1_pos:score|site2_pos:score|...") - args = parser.parse_args() - - variants = args.variant_file - gtf = args.annotation_file - try: - gtf = gffutils.FeatureDB(gtf) - except: - print("ERROR, annotation_file could not be opened. Is it a gffutils database file?") - exit() - - if torch.cuda.is_available(): - print("Using GPU") - else: - print("Using CPU") - - models = [] - for i in [0,2,4,6]: - for j in range(1,4): - model = Pangolin(L, W, AR) - if torch.cuda.is_available(): - model.cuda() - weights = torch.load(resource_filename(__name__,"models/final.%s.%s.3.v2" % (j, i))) - else: - weights = torch.load(resource_filename(__name__,"models/final.%s.%s.3.v2" % (j, i)), map_location=torch.device('cpu')) - model.load_state_dict(weights) - model.eval() - models.append(model) - - if variants.endswith(".vcf"): - lnum = 0 - # count the number of header lines - for line in open(variants, 'r'): - lnum += 1 - if line[0] != '#': - break - - variants = vcf.Reader(filename=variants) - variants.infos["Pangolin"] = vcf.parser._Info( - "Pangolin",'.',"String","Pangolin splice scores. " - "Format: gene|pos:score_change|pos:score_change|...",'.','.') - fout = vcf.Writer(open(args.output_file+".vcf", 'w'), variants) - - for i, variant in enumerate(variants): - scores = process_variant(lnum+i, str(variant.CHROM), int(variant.POS), variant.REF, str(variant.ALT[0]), gtf, models, args) - if scores != -1: - variant.INFO["Pangolin"] = scores - fout.write_record(variant) - fout.flush() - - fout.close() - - elif variants.endswith(".csv"): - col_ids = args.column_ids.split(',') - variants = pd.read_csv(variants, header=0) - fout = open(args.output_file+".csv", 'w') - fout.write(','.join(variants.columns)+',Pangolin\n') - fout.flush() - - for lnum, variant in variants.iterrows(): - chr, pos, ref, alt = variant[col_ids] - ref, alt = ref.upper(), alt.upper() - scores = process_variant(lnum+1, str(chr), int(pos), ref, alt, gtf, models, args) - if scores == -1: - fout.write(','.join(variant.to_csv(header=False, index=False).split('\n'))+'\n') - else: - fout.write(','.join(variant.to_csv(header=False, index=False).split('\n'))+scores+'\n') - fout.flush() - - fout.close() - - else: - print("ERROR, variant_file needs to be a CSV or VCF.") - - # executionTime = (time.time() - startTime) - # print('Execution time in seconds: ' + str(executionTime)) - -if __name__ == '__main__': - main() diff --git a/pangolin/batch.py b/pangolin/batch.py new file mode 100644 index 0000000..e2f4d60 --- /dev/null +++ b/pangolin/batch.py @@ -0,0 +1,304 @@ +import logging +import time + +import numpy as np +import pyfastx +import torch +from typing import List, Dict, Tuple + +from pangolin.data_models import ( + Variant, + PreppedVariant, + BatchLookupIndex, + AppConfig, + SequenceType, +) +from pangolin.genes import GeneAnnotator +from pangolin.utils import combine_scores, prepare_variant + +logger = logging.getLogger(__name__) + + +class PredictionBatch: + def __init__(self, models: List, app_config: AppConfig): + self.app_config = app_config + self.models = models + self.gene_annotator = GeneAnnotator( + self.app_config.annotation_file, + use_cache=self.app_config.enable_gtf_cache, + ) + self.fasta = pyfastx.Fasta(self.app_config.reference_file) + + self.batches = {} + 
self.variants: List[Variant] = [] + self.prepared_records: List[PreppedVariant] = [] + + self.batch_count = 0 + self.total_records = 0 + + self.prep_total_time = None + self.batch_start_time = None + + # Flag to know when the batch was run + self.did_run_predictions = False + + logger.debug(f"Batch init with batch size: {self.app_config.batch_size}") + + def batch_variant(self, prepped_variant: PreppedVariant) -> List[BatchLookupIndex]: + # Skip batching this variant if it wasn't encoded for validation reasons + if not prepped_variant.encodings: + return [] + + encoded_ref_pos = ( + prepped_variant.encodings.encoded_ref_pos + if prepped_variant.encodings + else "" + ) + encoded_alt_pos = ( + prepped_variant.encodings.encoded_alt_pos + if prepped_variant.encodings + else "" + ) + encoded_ref_neg = ( + prepped_variant.encodings.encoded_ref_neg + if prepped_variant.encodings + else "" + ) + encoded_alt_neg = ( + prepped_variant.encodings.encoded_alt_neg + if prepped_variant.encodings + else "" + ) + + batch_lookup_indexes = [] + + for var_type, encoded_seq in zip( + ( + SequenceType.POS_REF, + SequenceType.POS_ALT, + SequenceType.NEG_REF, + SequenceType.NEG_ALT, + ), + (encoded_ref_pos, encoded_alt_pos, encoded_ref_neg, encoded_alt_neg), + ): + if len(encoded_seq) == 0: + # Add BatchLookupIndex with zeros so when the batch collects the outputs + # it knows that there is no prediction for this record + batch_lookup_indexes.append(BatchLookupIndex(var_type, 0, 0)) + continue + + # Iterate over the encoded sequence and drop into the correct batch by size and + # create an index to use to pull out the result after batch is processed + # for row in encoded_seq: + # Extract the size of the sequence that was encoded to build a batch from + tensor_size = encoded_seq.shape[2] + + # Create batch for this size + if tensor_size not in self.batches: + self.batches[tensor_size] = [] + + # Add encoded record to batch + self.batches[tensor_size].append(encoded_seq) + + # Get the index of the record we just added in the batch + cur_batch_record_ix = len(self.batches[tensor_size]) - 1 + + # Store a reference so we can pull out the prediction for this item from the batches + batch_lookup_indexes.append( + BatchLookupIndex(var_type, tensor_size, cur_batch_record_ix) + ) + + return batch_lookup_indexes + + def prep_all_variants(self) -> None: + prep_time = time.time() + total_seq_time = 0 + total_encode_time = 0 + total_gene_time = 0 + for variant in self.variants: + prepared_record, timing = prepare_variant( + variant, + self.gene_annotator, + self.fasta, + self.app_config.distance, + ) + if prepared_record.skip_message: + logger.debug(prepared_record.skip_message) + total_seq_time += timing.seq_time + total_encode_time += timing.encode_time + total_gene_time += timing.gene_time + self.prepared_records.append(prepared_record) + self.prep_total_time = time.time() - prep_time + logger.debug(f"Total seq time: {total_seq_time:.5f}s") + logger.debug(f"Total gene time: {total_gene_time:.5f}s") + logger.debug(f"Total encode time: {total_encode_time:.5f}s") + logger.debug(f"Prep variant time: {self.prep_total_time:.5f}s") + + # Put the variants into buckets + for prepped_variant in self.prepared_records: + prepped_variant.locations = self.batch_variant(prepped_variant) + + def add_variant(self, variant: Variant) -> None: + self.total_records += 1 + self.variants.append(variant) + self.did_run_predictions = False + + # Once we fill the batch, process the records + if len(self.variants) >= self.app_config.batch_size: + 
logger.debug(f"Finished collected variants in batch: {len(self.variants)}") + self.run_batch() + self.did_run_predictions = True + + def run_batch(self) -> None: + self.batch_start_time = time.time() + self.prep_all_variants() + self._process_batch() + + def finish(self) -> None: + logger.debug("Finish") + + if len(self.variants) == 0: + logger.debug("No variants left to process") + return + + # Run remaining variants + self.run_batch() + + def run_predictions(self, batch) -> List: + batch_preds = [] + if torch.cuda.is_available(): + batch = batch.to(torch.device("cuda")) + for j in range(4): + for i, model in enumerate(self.models[3 * j : 3 * j + 3]): + with torch.no_grad(): + preds = model(batch) + batch_preds.append(preds) + return batch_preds + + def _process_batch(self) -> None: + start = time.time() + total_batch_predictions = 0 + self.batch_count += 1 + logger.debug(f"Starting process_batch ({self.batch_count})") + + batch_sizes = [ + "{}:{}".format(tensor_size, len(batch)) + for tensor_size, batch in self.batches.items() + ] + logger.debug("Batch Sizes: {}".format(batch_sizes)) + + batch_preds = {} + for tensor_size, batch in self.batches.items(): + # Convert list of encodings into a proper sized numpy matrix + prediction_batch = np.concatenate(batch, axis=0) + torched = torch.from_numpy(prediction_batch).float() + batch_preds[tensor_size] = self.run_predictions(torched) + + for prepped_record in self.prepared_records: + ( + prepped_record.loss_pos, + prepped_record.gain_pos, + ) = self._get_score_from_batch(prepped_record, batch_preds, "+") + ( + prepped_record.loss_neg, + prepped_record.gain_neg, + ) = self._get_score_from_batch(prepped_record, batch_preds, "-") + prepped_record.score = self.calculate_score(prepped_record) + total_batch_predictions += 1 + + duration = time.time() - start + logger.debug(f"Batch time: {duration:0.2f}s") + batch_duration = time.time() - self.batch_start_time + preds_per_sec = total_batch_predictions / batch_duration + preds_per_hour = preds_per_sec * 60 * 60 + logger.info( + f"Finished batch {self.batch_count}: Total Time {batch_duration:0.2f}s, Prep Time: {self.prep_total_time:0.2f}s, Preds/Hour: {preds_per_hour:0.0f}, Records: {self.total_records}" + ) + + def _get_score_from_batch( + self, prepped_record: PreppedVariant, batch_preds: Dict[int, List], strand: str + ) -> Tuple: + if len(prepped_record.locations) == 0: + return None, None + + # Get the lookup locations of the ref and alt values + ref_location = ( + prepped_record.locations[SequenceType.POS_REF.value] + if strand == "+" + else prepped_record.locations[SequenceType.NEG_REF.value] + ) + alt_location = ( + prepped_record.locations[SequenceType.POS_ALT.value] + if strand == "+" + else prepped_record.locations[SequenceType.NEG_ALT.value] + ) + + if ref_location.tensor_size == 0 and alt_location.tensor_size == 0: + return None, None + + ix = 0 + pangolin = [] + for j in range(4): + scores = [] + for _ in self.models[3 * j : 3 * j + 3]: + # Pull out predictions from the batch + ref_prediction = batch_preds[ref_location.tensor_size][ix] + alt_prediction = batch_preds[alt_location.tensor_size][ix] + + # Bring data back to CPU + ref = ( + ref_prediction[ref_location.batch_index][[1, 4, 7, 10][j], :] + .cpu() + .numpy() + ) + alt = ( + alt_prediction[alt_location.batch_index][[1, 4, 7, 10][j], :] + .cpu() + .numpy() + ) + if strand == "-": + ref = ref[::-1] + alt = alt[::-1] + l = 2 * self.app_config.distance + 1 + ndiff = np.abs(len(ref) - len(alt)) + if len(ref) > len(alt): + alt = 
np.concatenate( + [alt[0 : l // 2 + 1], np.zeros(ndiff), alt[l // 2 + 1 :]] + ) + elif len(ref) < len(alt): + alt = np.concatenate( + [ + alt[0 : l // 2], + np.max(alt[l // 2 : l // 2 + ndiff + 1], keepdims=True), + alt[l // 2 + ndiff + 1 :], + ] + ) + score = alt - ref + scores.append(score) + ix += 1 + pangolin.append(np.mean(scores, axis=0)) + + pangolin = np.array(pangolin) + loss = pangolin[np.argmin(pangolin, axis=0), np.arange(pangolin.shape[1])] + gain = pangolin[np.argmax(pangolin, axis=0), np.arange(pangolin.shape[1])] + return loss, gain + + def calculate_score(self, variant: PreppedVariant) -> str: + if len(variant.locations) == 0: + return "" + scores = combine_scores( + variant.variant.pos, + variant.genes_pos, + variant.loss_pos, + variant.gain_pos, + variant.genes_neg, + variant.loss_neg, + variant.gain_neg, + self.app_config, + ) + return scores + + def clear_batch(self) -> None: + self.batches.clear() + del self.variants[:] + del self.prepared_records[:] diff --git a/pangolin/data_models.py b/pangolin/data_models.py new file mode 100644 index 0000000..67d25f6 --- /dev/null +++ b/pangolin/data_models.py @@ -0,0 +1,94 @@ +import dataclasses +from enum import Enum +from typing import Optional, List + +from torch._C._te import Tensor + + +class SequenceType(Enum): + POS_REF = 0 + POS_ALT = 1 + NEG_REF = 2 + NEG_ALT = 3 + + +@dataclasses.dataclass +class TimingDetails: + seq_time: float = 0 + encode_time: float = 0 + gene_time: float = 0 + + +@dataclasses.dataclass +class BatchLookupIndex: + sequence_type: SequenceType + tensor_size: int + batch_index: int + + +@dataclasses.dataclass +class AppConfig: + variant_file: str + output_file: str + reference_file: str + annotation_file: str + batch_size: int + distance: int + mask: str + score_exons: str + column_ids: str + score_cutoff: Optional[float] + enable_gtf_cache: bool + + @classmethod + def from_args(cls, args) -> "AppConfig": + return cls( + variant_file=args.variant_file, + output_file=args.output_file, + reference_file=args.reference_file, + annotation_file=args.annotation_file, + batch_size=args.batch_size, + distance=args.distance, + score_cutoff=args.score_cutoff, + mask=args.mask, + score_exons=args.score_exons, + column_ids=args.column_ids, + enable_gtf_cache=args.enable_gtf_cache, + ) + + +@dataclasses.dataclass +class Variant: + lnum: int + chr: str + pos: int + ref: str + alt: str + id: Optional[int] = None + + +@dataclasses.dataclass +class VariantEncodings: + encoded_ref_pos: Tensor + encoded_alt_pos: Tensor + encoded_ref_neg: Tensor + encoded_alt_neg: Tensor + + +@dataclasses.dataclass +class PreppedVariant: + variant: Variant + score: str = "" + skip_message: str = "" + locations: Optional[List[BatchLookupIndex]] = None + encodings: Optional[VariantEncodings] = None + genes_pos: Optional[List] = None + genes_neg: Optional[List] = None + loss_pos: Optional[List] = None + gain_pos: Optional[List] = None + loss_neg: Optional[List] = None + gain_neg: Optional[List] = None + + @classmethod + def with_skip_message(cls, variant: Variant, skip_message: str) -> "PreppedVariant": + return cls(variant=variant, skip_message=skip_message, locations=[]) diff --git a/pangolin/genes.py b/pangolin/genes.py new file mode 100644 index 0000000..bd8e01b --- /dev/null +++ b/pangolin/genes.py @@ -0,0 +1,70 @@ +import logging +import time +from typing import Dict, Tuple + +import gffutils +from intervaltree import IntervalTree + +logger = logging.getLogger(__name__) + + +class GeneAnnotator: + def __init__(self, annotation_file: 
str, use_cache: bool = True): + self.use_cache = use_cache + self.gtf = None + self.trees = None + + if use_cache: + self.trees = self._load_data(annotation_file) + else: + self.gtf = gffutils.FeatureDB(annotation_file) + + def _load_data(self, annotation_file: str) -> Dict[str, IntervalTree]: + load_time = time.time() + gtf = gffutils.FeatureDB(annotation_file) + trees = {} + for gene in gtf.features_of_type("gene"): + if gene.seqid not in trees: + trees[gene.seqid] = IntervalTree() + exons = [] + for exon in gtf.children(gene, featuretype="exon"): + exons.extend([exon[3], exon[4]]) + trees[gene.seqid][gene.start : gene.stop] = (gene.id, gene.strand, exons) + logger.debug(f"Load cached db: {time.time() - load_time:.5f}s") + return trees + + def get_genes(self, chrom, pos) -> Tuple[Dict, Dict]: + if self.use_cache: + return self.get_cached_genes(chrom, pos) + return self.get_db_genes(chrom, pos) + + def get_cached_genes(self, chrom: str, pos: int) -> Tuple[Dict, Dict]: + genes = self.trees[chrom][pos - 1] + + genes_pos, genes_neg = {}, {} + for gene in genes: + gene_id, strand, exons = gene.data + if strand == "+": + genes_pos[gene_id] = exons + elif strand == "-": + genes_neg[gene_id] = exons + + return genes_pos, genes_neg + + def get_db_genes(self, chrom: str, pos: int) -> Tuple[Dict, Dict]: + genes = self.gtf.region((chrom, pos - 1, pos - 1), featuretype="gene") + genes_pos, genes_neg = {}, {} + + for gene in genes: + if gene[3] > pos or gene[4] < pos: + continue + gene_id = gene["gene_id"][0] + exons = [] + for exon in self.gtf.children(gene, featuretype="exon"): + exons.extend([exon[3], exon[4]]) + if gene[6] == "+": + genes_pos[gene_id] = exons + elif gene[6] == "-": + genes_neg[gene_id] = exons + + return (genes_pos, genes_neg) diff --git a/pangolin/legacy.py b/pangolin/legacy.py new file mode 100644 index 0000000..e4a9698 --- /dev/null +++ b/pangolin/legacy.py @@ -0,0 +1,94 @@ +import pyfastx + +from pangolin.data_models import AppConfig +from pangolin.utils import compute_score, combine_scores + + +def get_genes(chrom, pos, gtf): + genes = gtf.region((chrom, pos - 1, pos - 1), featuretype="gene") + genes_pos, genes_neg = {}, {} + + for gene in genes: + if gene[3] > pos or gene[4] < pos: + continue + gene_id = gene["gene_id"][0] + exons = [] + for exon in gtf.children(gene, featuretype="exon"): + exons.extend([exon[3], exon[4]]) + if gene[6] == "+": + genes_pos[gene_id] = exons + elif gene[6] == "-": + genes_neg[gene_id] = exons + + return (genes_pos, genes_neg) + + +def process_variant_legacy( + lnum, chr, pos, ref, alt, gtf, models, app_config: AppConfig +): + d = app_config.distance + + if ( + len(set("ACGT").intersection(set(ref))) == 0 + or len(set("ACGT").intersection(set(alt))) == 0 + or (len(ref) != 1 and len(alt) != 1 and len(ref) != len(alt)) + ): + print( + "[Line %s]" % lnum, + "WARNING, skipping variant: Variant format not supported.", + ) + return -1 + elif len(ref) > 2 * d: + print("[Line %s]" % lnum, "WARNING, skipping variant: Deletion too large") + return -1 + + fasta = pyfastx.Fasta(app_config.reference_file) + # try to make vcf chromosomes compatible with reference chromosomes + if chr not in fasta.keys() and "chr" + chr in fasta.keys(): + chr = "chr" + chr + elif chr not in fasta.keys() and chr[3:] in fasta.keys(): + chr = chr[3:] + + try: + seq = fasta[chr][pos - 5001 - d : pos + len(ref) + 4999 + d].seq + except Exception as e: + print(e) + print( + "[Line %s]" % lnum, + "WARNING, skipping variant: Could not get sequence, possibly because the variant is too 
close to chromosome ends. " + "See error message above.", + ) + return -1 + + if seq[5000 + d : 5000 + d + len(ref)].upper() != ref: + print( + "[Line %s]" % lnum, + "WARNING, skipping variant: Mismatch between FASTA (ref base: %s) and variant file (ref base: %s)." + % (seq[5000 + d : 5000 + d + len(ref)], ref), + ) + return -1 + + ref_seq = seq + alt_seq = seq[: 5000 + d] + alt + seq[5000 + d + len(ref) :] + + # get genes that intersect variant + genes_pos, genes_neg = get_genes(chr, pos, gtf) + if len(genes_pos) + len(genes_neg) == 0: + print( + "[Line %s]" % lnum, + "WARNING, skipping variant: Variant not contained in a gene body. Do GTF/FASTA chromosome names match?", + ) + return -1 + + # get splice scores + loss_pos, gain_pos = None, None + if len(genes_pos) > 0: + loss_pos, gain_pos = compute_score(ref_seq, alt_seq, "+", d, models) + loss_neg, gain_neg = None, None + if len(genes_neg) > 0: + loss_neg, gain_neg = compute_score(ref_seq, alt_seq, "-", d, models) + + scores = combine_scores( + pos, genes_pos, loss_pos, gain_pos, genes_neg, loss_neg, gain_neg, app_config + ) + return scores diff --git a/pangolin/model.py b/pangolin/model.py index 11dfb43..8f8f304 100755 --- a/pangolin/model.py +++ b/pangolin/model.py @@ -1,16 +1,16 @@ +from typing import List + import numpy as np import torch -import torch.utils.data as data import torch.nn.functional as F import torch.nn as nn +from pkg_resources import resource_filename L = 32 # convolution window size in residual units -W = np.asarray([11, 11, 11, 11, 11, 11, 11, 11, - 21, 21, 21, 21, 41, 41, 41, 41]) +W = np.asarray([11, 11, 11, 11, 11, 11, 11, 11, 21, 21, 21, 21, 41, 41, 41, 41]) # atrous rate in residual units -AR = np.asarray([1, 1, 1, 1, 4, 4, 4, 4, - 10, 10, 10, 10, 25, 25, 25, 25]) +AR = np.asarray([1, 1, 1, 1, 4, 4, 4, 4, 10, 10, 10, 10, 25, 25, 25, 25]) class ResBlock(nn.Module): @@ -47,7 +47,7 @@ def __init__(self, L, W, AR): self.resblocks, self.convs = nn.ModuleList(), nn.ModuleList() for i in range(len(W)): self.resblocks.append(ResBlock(L, W[i], AR[i])) - if (((i + 1) % 4 == 0) or ((i + 1) == len(W))): + if ((i + 1) % 4 == 0) or ((i + 1) == len(W)): self.convs.append(nn.Conv1d(L, L, 1)) self.conv_last1 = nn.Conv1d(L, 2, 1) self.conv_last2 = nn.Conv1d(L, 1, 1) @@ -64,7 +64,7 @@ def forward(self, x): j = 0 for i in range(len(W)): conv = self.resblocks[i](conv) - if (((i + 1) % 4 == 0) or ((i + 1) == len(W))): + if ((i + 1) % 4 == 0) or ((i + 1) == len(W)): dense = self.convs[j](conv) j += 1 skip = skip + dense @@ -81,3 +81,22 @@ def forward(self, x): return torch.cat([out1, out2, out3, out4, out5, out6, out7, out8], 1) +def load_models() -> List: + models = [] + for i in [0, 2, 4, 6]: + for j in range(1, 4): + model = Pangolin(L, W, AR) + if torch.cuda.is_available(): + model.cuda() + weights = torch.load( + resource_filename(__name__, "models/final.%s.%s.3.v2" % (j, i)) + ) + else: + weights = torch.load( + resource_filename(__name__, "models/final.%s.%s.3.v2" % (j, i)), + map_location=torch.device("cpu"), + ) + model.load_state_dict(weights) + model.eval() + models.append(model) + return models diff --git a/pangolin/pangolin.py b/pangolin/pangolin.py index 00a541d..0317ecb 100755 --- a/pangolin/pangolin.py +++ b/pangolin/pangolin.py @@ -1,278 +1,112 @@ import argparse -from pkg_resources import resource_filename -from pangolin.model import * -import vcf -import gffutils -import pandas as pd -import pyfastx -# import time -# startTime = time.time() +import logging +from dataclasses import asdict -IN_MAP = np.asarray([[0, 
0, 0, 0], - [1, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 1, 0], - [0, 0, 0, 1]]) +import torch +from pangolin.data_models import AppConfig -def one_hot_encode(seq, strand): - seq = seq.upper().replace('A', '1').replace('C', '2') - seq = seq.replace('G', '3').replace('T', '4').replace('N', '0') - if strand == '+': - seq = np.asarray(list(map(int, list(seq)))) - elif strand == '-': - seq = np.asarray(list(map(int, list(seq[::-1])))) - seq = (5 - seq) % 5 # Reverse complement - return IN_MAP[seq.astype('int8')] +import time +from pangolin.processors import process_variants_file -def compute_score(ref_seq, alt_seq, strand, d, models): - ref_seq = one_hot_encode(ref_seq, strand).T - ref_seq = torch.from_numpy(np.expand_dims(ref_seq, axis=0)).float() - alt_seq = one_hot_encode(alt_seq, strand).T - alt_seq = torch.from_numpy(np.expand_dims(alt_seq, axis=0)).float() - if torch.cuda.is_available(): - ref_seq = ref_seq.to(torch.device("cuda")) - alt_seq = alt_seq.to(torch.device("cuda")) - - pangolin = [] - for j in range(4): - score = [] - for model in models[3*j:3*j+3]: - with torch.no_grad(): - ref = model(ref_seq)[0][[1,4,7,10][j],:].cpu().numpy() - alt = model(alt_seq)[0][[1,4,7,10][j],:].cpu().numpy() - if strand == '-': - ref = ref[::-1] - alt = alt[::-1] - l = 2*d+1 - ndiff = np.abs(len(ref)-len(alt)) - if len(ref)>len(alt): - alt = np.concatenate([alt[0:l//2+1],np.zeros(ndiff),alt[l//2+1:]]) - elif len(ref) pos or gene[4] < pos: - continue - gene_id = gene["gene_id"][0] - exons = [] - for exon in gtf.children(gene, featuretype="exon"): - exons.extend([exon[3], exon[4]]) - if gene[6] == '+': - genes_pos[gene_id] = exons - elif gene[6] == '-': - genes_neg[gene_id] = exons - - return (genes_pos, genes_neg) - - -def process_variant(lnum, chr, pos, ref, alt, gtf, models, args): - d = args.distance - cutoff = args.score_cutoff - - if len(set("ACGT").intersection(set(ref))) == 0 or len(set("ACGT").intersection(set(alt))) == 0 \ - or (len(ref) != 1 and len(alt) != 1 and len(ref) != len(alt)): - print("[Line %s]" % lnum, "WARNING, skipping variant: Variant format not supported.") - return -1 - elif len(ref) > 2*d: - print("[Line %s]" % lnum, "WARNING, skipping variant: Deletion too large") - return -1 - - fasta = pyfastx.Fasta(args.reference_file) - # try to make vcf chromosomes compatible with reference chromosomes - if chr not in fasta.keys() and "chr"+chr in fasta.keys(): - chr = "chr"+chr - elif chr not in fasta.keys() and chr[3:] in fasta.keys(): - chr = chr[3:] - - try: - seq = fasta[chr][pos-5001-d:pos+len(ref)+4999+d].seq - except Exception as e: - print(e) - print("[Line %s]" % lnum, "WARNING, skipping variant: Could not get sequence, possibly because the variant is too close to chromosome ends. " - "See error message above.") - return -1 - - if seq[5000+d:5000+d+len(ref)] != ref: - print("[Line %s]" % lnum, "WARNING, skipping variant: Mismatch between FASTA (ref base: %s) and variant file (ref base: %s)." - % (seq[5000+d:5000+d+len(ref)], ref)) - return -1 - - ref_seq = seq - alt_seq = seq[:5000+d] + alt + seq[5000+d+len(ref):] - - # get genes that intersect variant - genes_pos, genes_neg = get_genes(chr, pos, gtf) - if len(genes_pos)+len(genes_neg)==0: - print("[Line %s]" % lnum, "WARNING, skipping variant: Variant not contained in a gene body. 
Do GTF/FASTA chromosome names match?") - return -1 - - # get splice scores - loss_pos, gain_pos = None, None - if len(genes_pos) > 0: - loss_pos, gain_pos = compute_score(ref_seq, alt_seq, '+', d, models) - loss_neg, gain_neg = None, None - if len(genes_neg) > 0: - loss_neg, gain_neg = compute_score(ref_seq, alt_seq, '-', d, models) +logger = logging.getLogger(__name__) - scores = "" - for (genes, loss, gain) in \ - ((genes_pos,loss_pos,gain_pos),(genes_neg,loss_neg,gain_neg)): - for gene, positions in genes.items(): - warnings = "Warnings:" - positions = np.array(positions) - positions = positions - (pos - d) - - if args.mask == "True" and len(positions) != 0: - positions_filt = positions[(positions>=0) & (positions=len(loss): - s1 = "NA" - else: - s1 = [loss[p1],gain[p1]] - s1 = round(s1[np.argmax(np.abs(s1))],2) - if p2<0 or p2>=len(loss): - s2 = "NA" - else: - s2 = [loss[p2],gain[p2]] - s2 = round(s2[np.argmax(np.abs(s2))],2) - if s1 == "NA" and s2 == "NA": - continue - scores1 += "%s:%s|" % (p1-d, s1) - scores2 += "%s:%s|" % (p2-d, s2) - scores = scores+scores1+scores2 - - elif cutoff != None: - scores = scores+gene+'|' - l, g = np.where(loss<=-cutoff)[0], np.where(gain>=cutoff)[0] - for p, s in zip(np.concatenate([g-d,l-d]), np.concatenate([gain[g],loss[l]])): - scores += "%s:%s|" % (p, round(s,2)) - - else: - scores = scores+gene+'|' - l, g = np.argmin(loss), np.argmax(gain), - scores += "%s:%s|%s:%s|" % (g-d, round(gain[g],2), l-d, round(loss[l],2)) - - scores += warnings - - return scores.strip('|') def main(): parser = argparse.ArgumentParser() - parser.add_argument("variant_file", help="VCF or CSV file with a header (see COLUMN_IDS option).") - parser.add_argument("reference_file", help="FASTA file containing a reference genome sequence.") - parser.add_argument("annotation_file", help="gffutils database file. Can be generated using create_db.py.") - parser.add_argument("output_file", help="Prefix for output file. Will be a VCF/CSV if variant_file is VCF/CSV.") - parser.add_argument("-c", "--column_ids", default="CHROM,POS,REF,ALT", help="(If variant_file is a CSV) Column IDs for: chromosome, variant position, reference bases, and alternative bases. " - "Separate IDs by commas. (Default: CHROM,POS,REF,ALT)") - parser.add_argument("-m", "--mask", default="True", choices=["False","True"], help="If True, splice gains (increases in score) at annotated splice sites and splice losses (decreases in score) at unannotated splice sites will be set to 0. (Default: True)") - parser.add_argument("-s", "--score_cutoff", type=float, help="Output all sites with absolute predicted change in score >= cutoff, instead of only the maximum loss/gain sites.") - parser.add_argument("-d", "--distance", type=int, default=50, help="Number of bases on either side of the variant for which splice scores should be calculated. (Default: 50)") - parser.add_argument("--score_exons", default="False", choices=["False","True"], help="Output changes in score for both splice sites of annotated exons, as long as one splice site is within the considered range (specified by -d). Output will be: gene|site1_pos:score|site2_pos:score|...") + parser.add_argument( + "variant_file", help="VCF or CSV file with a header (see COLUMN_IDS option)." + ) + parser.add_argument( + "reference_file", help="FASTA file containing a reference genome sequence." + ) + parser.add_argument( + "annotation_file", + help="gffutils database file. 
Can be generated using create_db.py.", + ) + parser.add_argument("output_file", help="Name of output file") + parser.add_argument( + "-c", + "--column_ids", + default="CHROM,POS,REF,ALT", + help="(If variant_file is a CSV) Column IDs for: chromosome, variant position, reference bases, and alternative bases. " + "Separate IDs by commas. (Default: CHROM,POS,REF,ALT)", + ) + parser.add_argument( + "-m", + "--mask", + default="True", + choices=["False", "True"], + help="If True, splice gains (increases in score) at annotated splice sites and splice losses (decreases in score) at unannotated splice sites will be set to 0. (Default: True)", + ) + parser.add_argument( + "-s", + "--score_cutoff", + type=float, + help="Output all sites with absolute predicted change in score >= cutoff, instead of only the maximum loss/gain sites.", + ) + parser.add_argument( + "-d", + "--distance", + type=int, + default=50, + help="Number of bases on either side of the variant for which splice scores should be calculated. (Default: 50)", + ) + parser.add_argument( + "-b", + "--batch_size", + type=int, + default=0, + help="Number of variants to batch together", + ) + parser.add_argument( + "-v", + "--verbose", + default=False, + action="store_true", + help="Enable additional debugging output", + ) + parser.add_argument( + "--enable_gtf_cache", + default=False, + action="store_true", + help="Enable GTF db in memory caching, useful for large batches", + ) + parser.add_argument( + "--score_exons", + default="False", + choices=["False", "True"], + help="Output changes in score for both splice sites of annotated exons, as long as one splice site is within the considered range (specified by -d). Output will be: gene|site1_pos:score|site2_pos:score|...", + ) args = parser.parse_args() - variants = args.variant_file - gtf = args.annotation_file - try: - gtf = gffutils.FeatureDB(gtf) - except: - print("ERROR, annotation_file could not be opened. Is it a gffutils database file?") - exit() + log_level = logging.INFO + if args.verbose: + log_level = logging.DEBUG - if torch.cuda.is_available(): - print("Using GPU") - else: - print("Using CPU") - - models = [] - for i in [0,2,4,6]: - for j in range(1,4): - model = Pangolin(L, W, AR) - if torch.cuda.is_available(): - model.cuda() - weights = torch.load(resource_filename(__name__,"models/final.%s.%s.3.v2" % (j, i))) - else: - weights = torch.load(resource_filename(__name__,"models/final.%s.%s.3.v2" % (j, i)), map_location=torch.device('cpu')) - model.load_state_dict(weights) - model.eval() - models.append(model) - - if variants.endswith(".vcf"): - lnum = 0 - # count the number of header lines - for line in open(variants, 'r'): - lnum += 1 - if line[0] != '#': - break - - variants = vcf.Reader(filename=variants) - variants.infos["Pangolin"] = vcf.parser._Info( - "Pangolin",'.',"String","Pangolin splice scores. 
" - "Format: gene|pos:score_change|pos:score_change|...",'.','.') - fout = vcf.Writer(open(args.output_file+".vcf", 'w'), variants) + logging.basicConfig( + format="%(processName)s %(threadName)s %(asctime)s %(levelname)s %(name)s: - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=log_level, + ) - for i, variant in enumerate(variants): - scores = process_variant(lnum+i, str(variant.CHROM), int(variant.POS), variant.REF, str(variant.ALT[0]), gtf, models, args) - if scores != -1: - variant.INFO["Pangolin"] = scores - fout.write_record(variant) - fout.flush() + start_time = time.time() - fout.close() + if torch.cuda.is_available(): + logger.info("Using GPU") + else: + logger.info("Using CPU") - elif variants.endswith(".csv"): - col_ids = args.column_ids.split(',') - variants = pd.read_csv(variants, header=0) - fout = open(args.output_file+".csv", 'w') - fout.write(','.join(variants.columns)+',Pangolin\n') - fout.flush() + app_config = AppConfig.from_args(args) - for lnum, variant in variants.iterrows(): - chr, pos, ref, alt = variant[col_ids] - ref, alt = ref.upper(), alt.upper() - scores = process_variant(lnum+1, str(chr), int(pos), ref, alt, gtf, models, args) - if scores == -1: - fout.write(','.join(variant.to_csv(header=False, index=False).split('\n'))+'\n') - else: - fout.write(','.join(variant.to_csv(header=False, index=False).split('\n'))+scores+'\n') - fout.flush() + logger.info(f"Using config : {asdict(app_config)}") - fout.close() + process_variants_file(app_config) - else: - print("ERROR, variant_file needs to be a CSV or VCF.") + print(f"Execution time in seconds: {time.time() - start_time:.2f}") - # executionTime = (time.time() - startTime) - # print('Execution time in seconds: ' + str(executionTime)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/pangolin/processors.py b/pangolin/processors.py new file mode 100644 index 0000000..cfe1650 --- /dev/null +++ b/pangolin/processors.py @@ -0,0 +1,171 @@ +import logging +from typing import Callable, List, Union + +import gffutils +import pandas as pd +import pysam +import typing + +from pysam import VariantFile + +from pangolin.batch import PredictionBatch +from pangolin.legacy import process_variant_legacy +from pangolin.model import load_models +from pangolin.data_models import Variant, AppConfig + +logger = logging.getLogger(__name__) + + +def process_variants_file(app_config: AppConfig) -> None: + models = load_models() + batch = PredictionBatch(models, app_config) + if app_config.variant_file.endswith(".vcf"): + process_vcf(batch, models, app_config) + elif app_config.variant_file.endswith(".csv"): + process_csv(batch, models, app_config) + else: + raise RuntimeError("ERROR, variant_file needs to be a CSV or VCF.") + + +def handle_batch( + batch: PredictionBatch, + original_records: List, + writer: Callable, + fout: Union[typing.TextIO, VariantFile], +) -> None: + for prepared_record, original_record in zip( + batch.prepared_records, original_records + ): + writer(original_record, prepared_record.score, fout) + + +def vcf_writer(original_record, score: str, fout: VariantFile) -> None: + if score != "": + original_record.info["Pangolin"] = score + fout.write(original_record) + + +def csv_writer(original_record, score: str, fout: typing.TextIO) -> None: + if score == "": + fout.write( + ",".join(original_record.to_csv(header=False, index=False).split("\n")) + + "\n" + ) + else: + fout.write( + ",".join(original_record.to_csv(header=False, index=False).split("\n")) + + score + + "\n" + ) + + +def 
process_vcf(batch: PredictionBatch, models: List, app_config: AppConfig): + input_vcf = pysam.VariantFile(app_config.variant_file) + header = input_vcf.header + header.add_line( + '##INFO=' + ) + fout = pysam.VariantFile(app_config.output_file, "w", header=header) + + # NOTE: Only used in non batching mode + gtf = gffutils.FeatureDB(app_config.annotation_file) + + original_records = [] + for i, variant in enumerate(input_vcf): + if app_config.batch_size > 0: + # Store original VCF row + original_records.append(variant) + # NOTE: Only single alts are supported here + if len(variant.alts) > 1: + raise RuntimeError( + f"Only single ALTs are supported for VCF predictions" + ) + v = Variant( + i, + chr=str(variant.chrom), + pos=int(variant.pos), + ref=variant.ref, + alt=variant.alts[0], + ) + batch.add_variant(v) + if batch.did_run_predictions: + handle_batch(batch, original_records, vcf_writer, fout) + original_records.clear() + batch.clear_batch() + else: + # This is the original path through the code + scores = process_variant_legacy( + i, + str(variant.chrom), + int(variant.pos), + variant.ref, + str(variant.alts[0]), + gtf, + models, + app_config, + ) + if scores != -1: + variant.info["Pangolin"] = scores + fout.write(variant) + + if app_config.batch_size > 0: + batch.finish() + handle_batch(batch, original_records, vcf_writer, fout) + + fout.close() + print(f"Wrote results to: {app_config.output_file}") + + +def process_csv(batch: PredictionBatch, models: List, app_config: AppConfig): + col_ids = app_config.column_ids.split(",") + variants = pd.read_csv(app_config.variant_file, header=0) + fout = open(app_config.output_file, "w") + fout.write(",".join(variants.columns) + ",Pangolin\n") + fout.flush() + + # NOTE: Only used in non batching mode + gtf = gffutils.FeatureDB(app_config.annotation_file) + + # Store original record here to use again when batching is completed + original_records = [] + + for lnum, variant in variants.iterrows(): + lnum = typing.cast(int, lnum) # Used to solve type hinting issues + chr, pos, ref, alt = variant[col_ids] + ref, alt = ref.upper(), alt.upper() + + # Only do the batching if the batch size is set + if app_config.batch_size > 0: + # Store original CSV record + original_records.append(variant) + v = Variant(lnum=lnum, chr=str(chr), pos=int(pos), ref=ref, alt=alt) + + batch.add_variant(v) + if batch.did_run_predictions: + handle_batch(batch, original_records, csv_writer, fout) + original_records.clear() + batch.clear_batch() + else: + scores = process_variant_legacy( + lnum + 1, str(chr), int(pos), ref, alt, gtf, models, app_config + ) + if scores == -1: + fout.write( + ",".join(variant.to_csv(header=False, index=False).split("\n")) + + "\n" + ) + else: + fout.write( + ",".join(variant.to_csv(header=False, index=False).split("\n")) + + scores + + "\n" + ) + fout.flush() + + if app_config.batch_size > 0: + batch.finish() + handle_batch(batch, original_records, csv_writer, fout) + + fout.close() + print(f"Wrote results to: {app_config.output_file}") diff --git a/pangolin/utils.py b/pangolin/utils.py new file mode 100644 index 0000000..ffc539f --- /dev/null +++ b/pangolin/utils.py @@ -0,0 +1,277 @@ +import logging +import time +from typing import Tuple + +import numpy as np +from pyfaidx import Fasta +import torch + +from pangolin.batch import Variant, PreppedVariant +from pangolin.data_models import VariantEncodings, AppConfig, TimingDetails +from pangolin.genes import GeneAnnotator + +logger = logging.getLogger(__name__) + + +IN_MAP = np.asarray( + [[0, 0, 
0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]] +) + + +def compute_score(ref_seq, alt_seq, strand, d, models): + ref_seq = one_hot_encode(ref_seq, strand).T + ref_seq = torch.from_numpy(np.expand_dims(ref_seq, axis=0)).float() + alt_seq = one_hot_encode(alt_seq, strand).T + alt_seq = torch.from_numpy(np.expand_dims(alt_seq, axis=0)).float() + + if torch.cuda.is_available(): + ref_seq = ref_seq.to(torch.device("cuda")) + alt_seq = alt_seq.to(torch.device("cuda")) + + pangolin = [] + for j in range(4): + score = [] + for model in models[3 * j : 3 * j + 3]: + with torch.no_grad(): + ref = model(ref_seq)[0][[1, 4, 7, 10][j], :].cpu().numpy() + alt = model(alt_seq)[0][[1, 4, 7, 10][j], :].cpu().numpy() + if strand == "-": + ref = ref[::-1] + alt = alt[::-1] + l = 2 * d + 1 + ndiff = np.abs(len(ref) - len(alt)) + if len(ref) > len(alt): + alt = np.concatenate( + [alt[0 : l // 2 + 1], np.zeros(ndiff), alt[l // 2 + 1 :]] + ) + elif len(ref) < len(alt): + alt = np.concatenate( + [ + alt[0 : l // 2], + np.max(alt[l // 2 : l // 2 + ndiff + 1], keepdims=True), + alt[l // 2 + ndiff + 1 :], + ] + ) + score.append(alt - ref) + pangolin.append(np.mean(score, axis=0)) + + pangolin = np.array(pangolin) + loss = pangolin[np.argmin(pangolin, axis=0), np.arange(pangolin.shape[1])] + gain = pangolin[np.argmax(pangolin, axis=0), np.arange(pangolin.shape[1])] + return loss, gain + + +def combine_scores( + variant_pos, + genes_pos, + loss_pos, + gain_pos, + genes_neg, + loss_neg, + gain_neg, + app_config: AppConfig, +) -> str: + all_gene_scores = [] + + for genes, loss, gain in ( + (genes_pos, loss_pos, gain_pos), + (genes_neg, loss_neg, gain_neg), + ): + for gene, positions in genes.items(): + warnings = "Warnings:" + positions = np.array(positions) + positions = positions - (variant_pos - app_config.distance) + + if app_config.mask == "True" and len(positions) != 0: + positions_filt = positions[(positions >= 0) & (positions < len(loss))] + # set splice gain at annotated sites to 0 + gain[positions_filt] = np.minimum(gain[positions_filt], 0) + # set splice loss at unannotated sites to 0 + not_positions = ~np.isin(np.arange(len(loss)), positions_filt) + loss[not_positions] = np.maximum(loss[not_positions], 0) + + elif app_config.mask == "True": + warnings += "NoAnnotatedSitesToMaskForThisGene" + loss[:] = np.maximum(loss[:], 0) + + if app_config.score_exons == "True": + scores1 = gene + "_sites1|" + scores2 = gene + "_sites2|" + + for i in range(len(positions) // 2): + p1, p2 = positions[2 * i], positions[2 * i + 1] + if p1 < 0 or p1 >= len(loss): + s1 = "NA" + else: + s1 = [loss[p1], gain[p1]] + s1 = round(s1[np.argmax(np.abs(s1))], 2) + if p2 < 0 or p2 >= len(loss): + s2 = "NA" + else: + s2 = [loss[p2], gain[p2]] + s2 = round(s2[np.argmax(np.abs(s2))], 2) + if s1 == "NA" and s2 == "NA": + continue + scores1 += "%s:%s|" % (p1 - app_config.distance, s1) + scores2 += "%s:%s|" % (p2 - app_config.distance, s2) + score = scores1 + scores2 + + elif app_config.score_cutoff != None: + score = gene + "|" + l, g = ( + np.where(loss <= -app_config.score_cutoff)[0], + np.where(gain >= app_config.score_cutoff)[0], + ) + for p, s in zip( + np.concatenate([g - app_config.distance, l - app_config.distance]), + np.concatenate([gain[g], loss[l]]), + ): + score += "%s:%s|" % (p, round(s, 2)) + + else: + score = gene + "|" + l, g = ( + np.argmin(loss), + np.argmax(gain), + ) + score += "%s:%s|%s:%s|" % ( + g - app_config.distance, + round(gain[g], 2), + l - app_config.distance, + round(loss[l], 2), + ) + + score += 
warnings + all_gene_scores.append(score.strip("|")) + + return "||".join(all_gene_scores) + + +def one_hot_encode(seq, strand): + seq = seq.upper().replace("A", "1").replace("C", "2") + seq = seq.replace("G", "3").replace("T", "4").replace("N", "0") + if strand == "+": + seq = np.asarray(list(map(int, seq))) + elif strand == "-": + seq = np.asarray(list(map(int, seq[::-1]))) + seq = (5 - seq) % 5 # Reverse complement + return IN_MAP[seq.astype("int8")] + + +def encode_seqs(ref_seq, alt_seq, strand): + ref_seq = one_hot_encode(ref_seq, strand).T + ref_seq = torch.from_numpy(np.expand_dims(ref_seq, axis=0)).float() + alt_seq = one_hot_encode(alt_seq, strand).T + alt_seq = torch.from_numpy(np.expand_dims(alt_seq, axis=0)).float() + return ref_seq, alt_seq + + +def prepare_variant( + variant: Variant, gene_annotator: GeneAnnotator, fasta: Fasta, distance: int +) -> Tuple[PreppedVariant, TimingDetails]: + chr = variant.chr + pos = variant.pos + ref = variant.ref + alt = variant.alt + + empty_timing = TimingDetails() + + skip_message = "" + seq_time = time.time() + if ( + len(set("ACGT").intersection(set(ref))) == 0 + or len(set("ACGT").intersection(set(alt))) == 0 + or (len(ref) != 1 and len(alt) != 1 and len(ref) != len(alt)) + ): + skip_message = "Variant format not supported." + elif len(ref) > 2 * distance: + skip_message = "Deletion too large" + + if skip_message: + return ( + PreppedVariant.with_skip_message( + variant=variant, skip_message=skip_message + ), + empty_timing, + ) + + # try to make vcf chromosomes compatible with reference chromosomes + fasta_keys = fasta.keys() + if chr not in fasta_keys and "chr" + chr in fasta_keys: + variant.chr = "chr" + chr + elif chr not in fasta_keys and chr[3:] in fasta_keys: + variant.chr = chr[3:] + + seq = "" + try: + seq = fasta[chr][pos - 5001 - distance : pos + len(ref) + 4999 + distance].seq + except Exception as e: + logger.exception(e) + skip_message = ( + "Could not get sequence, possibly because the variant is too close to chromosome ends. " + "See error message above." + ) + if skip_message: + return ( + PreppedVariant.with_skip_message( + variant=variant, skip_message=skip_message + ), + empty_timing, + ) + + if seq[5000 + distance : 5000 + distance + len(ref)].upper() != ref: + ref_base = seq[5000 + distance : 5000 + distance + len(ref)] + skip_message = f"Mismatch between FASTA (ref base: {ref_base}) and variant file (ref base: {ref})." + return ( + PreppedVariant.with_skip_message( + variant=variant, skip_message=skip_message + ), + empty_timing, + ) + + ref_seq = seq + alt_seq = seq[: 5000 + distance] + alt + seq[5000 + distance + len(ref) :] + total_seq_time = time.time() - seq_time + + gene_time = time.time() + genes_pos, genes_neg = gene_annotator.get_genes(chr, pos) + if len(genes_pos) + len(genes_neg) == 0: + skip_message = ( + "Variant not contained in a gene body. Do GTF/FASTA chromosome names match?" 
+ ) + return ( + PreppedVariant.with_skip_message( + variant=variant, skip_message=skip_message + ), + empty_timing, + ) + total_gene_time = time.time() - gene_time + + encode_time = time.time() + encoded_ref_pos, encoded_alt_pos, encoded_ref_neg, encoded_alt_neg = "", "", "", "" + if len(genes_pos) > 0: + encoded_ref_pos, encoded_alt_pos = encode_seqs(ref_seq, alt_seq, "+") + if len(genes_neg) > 0: + encoded_ref_neg, encoded_alt_neg = encode_seqs(ref_seq, alt_seq, "-") + total_encode_time = time.time() - encode_time + + prep_timing = TimingDetails( + seq_time=total_seq_time, + gene_time=total_gene_time, + encode_time=total_encode_time, + ) + + return ( + PreppedVariant( + variant=variant, + genes_pos=genes_pos, + genes_neg=genes_neg, + encodings=VariantEncodings( + encoded_ref_neg=encoded_ref_neg, + encoded_ref_pos=encoded_ref_pos, + encoded_alt_pos=encoded_alt_pos, + encoded_alt_neg=encoded_alt_neg, + ), + ), + prep_timing, + ) diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..ec0b44c --- /dev/null +++ b/poetry.lock @@ -0,0 +1,863 @@ +[[package]] +name = "argcomplete" +version = "2.1.1" +description = "Bash tab completion for argparse" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +lint = ["flake8", "mypy"] +test = ["coverage", "flake8", "mypy", "pexpect", "wheel"] + +[[package]] +name = "argh" +version = "0.28.1" +description = "An unobtrusive argparse wrapper with natural syntax" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.extras] +completion = ["argcomplete (>=2.0)"] +docs = ["readthedocs-sphinx-search (==0.2.0)", "sphinx (>=6.1)", "sphinx-pyproject (==0.1.0)", "sphinx_rtd_theme (>=1.2.0)"] +linters = ["pre-commit (>=3.0.4)"] +test = ["iocapture (>=0.1.2)", "pytest (>=7.2)", "pytest-cov (>=4.0)", "tox (>=4.4)"] + +[[package]] +name = "attrs" +version = "22.2.0" +description = "Classes Without Boilerplate" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] +dev = ["attrs[docs,tests]"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] +tests = ["attrs[tests-no-zope]", "zope.interface"] +tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] + +[[package]] +name = "biopython" +version = "1.81" +description = "Freely available tools for computational molecular biology." +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +numpy = "*" + +[[package]] +name = "black" +version = "23.3.0" +description = "The uncompromising code formatter." 
+category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +packaging = ">=22.0" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + +[[package]] +name = "click" +version = "8.1.3" +description = "Composable command line interface toolkit" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" + +[[package]] +name = "coverage" +version = "7.2.3" +description = "Code coverage measurement for Python" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +toml = ["tomli"] + +[[package]] +name = "exceptiongroup" +version = "1.1.1" +description = "Backport of PEP 654 (exception groups)" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "gffutils" +version = "0.11.1" +description = "Work with GFF and GTF files in a flexible database framework" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +argcomplete = ">=1.9.4" +argh = ">=0.26.2" +pyfaidx = ">=0.5.5.2" +simplejson = "*" +six = ">=1.12.0" + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "intervaltree" +version = "3.1.0" +description = "Editable interval tree data structure for Python 2 and 3" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +sortedcontainers = ">=2.0,<3.0" + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." 
+category = "dev" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "numpy" +version = "1.24.2" +description = "Fundamental package for array computing in Python" +category = "main" +optional = false +python-versions = ">=3.8" + +[[package]] +name = "nvidia-cublas-cu11" +version = "11.10.3.66" +description = "CUBLAS native runtime libraries" +category = "main" +optional = false +python-versions = ">=3" + +[package.dependencies] +setuptools = "*" +wheel = "*" + +[[package]] +name = "nvidia-cuda-nvrtc-cu11" +version = "11.7.99" +description = "NVRTC native runtime libraries" +category = "main" +optional = false +python-versions = ">=3" + +[package.dependencies] +setuptools = "*" +wheel = "*" + +[[package]] +name = "nvidia-cuda-runtime-cu11" +version = "11.7.99" +description = "CUDA Runtime native Libraries" +category = "main" +optional = false +python-versions = ">=3" + +[package.dependencies] +setuptools = "*" +wheel = "*" + +[[package]] +name = "nvidia-cudnn-cu11" +version = "8.5.0.96" +description = "cuDNN runtime libraries" +category = "main" +optional = false +python-versions = ">=3" + +[package.dependencies] +setuptools = "*" +wheel = "*" + +[[package]] +name = "packaging" +version = "23.0" +description = "Core utilities for Python packages" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "pandas" +version = "1.5.3" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = [ + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, +] +python-dateutil = ">=2.8.1" +pytz = ">=2020.1" + +[package.extras] +test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] + +[[package]] +name = "pathspec" +version = "0.11.1" +description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "platformdirs" +version = "3.3.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pyfaidx" +version = "0.7.2.1" +description = "pyfaidx: efficient pythonic random access to fasta subsequences" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +setuptools = "*" +six = "*" + +[[package]] +name = "pyfastx" +version = "0.8.4" +description = "pyfastx is a python module for fast random access to sequences from plain and gzipped FASTA/Q file" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "pysam" +version = "0.20.0" +description = "pysam" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "pytest" +version = "7.2.2" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2022.7.1" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "setuptools" +version = "67.6.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + +[[package]] +name = "simplejson" +version = "3.18.3" +description = "Simple, fast, 
extensible JSON encoder/decoder for Python" +category = "main" +optional = false +python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "torch" +version = "1.13.1" +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +category = "main" +optional = false +python-versions = ">=3.7.0" + +[package.dependencies] +nvidia-cublas-cu11 = {version = "11.10.3.66", markers = "platform_system == \"Linux\""} +nvidia-cuda-nvrtc-cu11 = {version = "11.7.99", markers = "platform_system == \"Linux\""} +nvidia-cuda-runtime-cu11 = {version = "11.7.99", markers = "platform_system == \"Linux\""} +nvidia-cudnn-cu11 = {version = "8.5.0.96", markers = "platform_system == \"Linux\""} +typing-extensions = "*" + +[package.extras] +opt-einsum = ["opt-einsum (>=3.3)"] + +[[package]] +name = "typing-extensions" +version = "4.5.0" +description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "wheel" +version = "0.38.4" +description = "A built-package format for Python" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pytest (>=3.0.0)"] + +[metadata] +lock-version = "1.1" +python-versions = "^3.8" +content-hash = "12a3bc2ba36b6f619cda0069cce9c4ded268a3e560cd847e518433f296f35c4d" + +[metadata.files] +argcomplete = [ + {file = "argcomplete-2.1.1-py3-none-any.whl", hash = "sha256:17041f55b8c45099428df6ce6d0d282b892471a78c71375d24f227e21c13f8c5"}, + {file = "argcomplete-2.1.1.tar.gz", hash = "sha256:72e08340852d32544459c0c19aad1b48aa2c3a96de8c6e5742456b4f538ca52f"}, +] +argh = [ + {file = "argh-0.28.1-py3-none-any.whl", hash = "sha256:10e7311f3ea54a78a366e5456900d8b81049f44d8d653b524eb90cf7d29a71ee"}, + {file = "argh-0.28.1.tar.gz", hash = "sha256:b2093086f0e809a3ecc24b64a2145309ee8f56d034936cd59e57c558a357329d"}, +] +attrs = [ + {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, + {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, +] +biopython = [ + {file = "biopython-1.81-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef7c79b65b0b3f3c7dc59e20a7f8ae5758d8e852cb8b9cace590dc5617e348ba"}, + {file = "biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ebfbce0d91796c7aef422ee9dffe8827e07e5abaa94545e006f1f20e965c80b"}, + {file = "biopython-1.81-cp310-cp310-win32.whl", hash = "sha256:919a2c583cabf9c96d2ae4e1245a6b0376932fb342aca302a0fc198b71ab3275"}, + {file = "biopython-1.81-cp310-cp310-win_amd64.whl", hash = "sha256:b37c0d24191e5c96ca02415a5188551980c83a0d518bbc4ffe3c9a5d1fe0ee81"}, + {file = "biopython-1.81-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7a168709694e10b338718c18d967edd5b56c237dc88642c22275796007a70000"}, + {file = "biopython-1.81-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a51d9c1d1b4b634447535da74a644fae59bc234fbbf9001e2dc6b6fbabb98019"}, + {file = "biopython-1.81-cp311-cp311-win32.whl", hash = "sha256:2f9cfaf16d55ab80d514e7aebe5710dabe4e4ff47ede851031202e33b3249da3"}, + {file = "biopython-1.81-cp311-cp311-win_amd64.whl", hash = "sha256:e41b55edcfd448630e77bf4de66a7235324a8a149621499891da6bd1d5085b9a"}, + {file = "biopython-1.81-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3b36ba1bf6395c09a365c53530c9d71f3617763fa2c1d452b3d8948368c0f1de"}, + {file = "biopython-1.81-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c5c07123ff5f44c9e6b5369df854a38afd3c0c50ef58498a0ae8f7eb799f3e8"}, + {file = "biopython-1.81-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:97cbdbed01b2512471f36c74b91658d1dfbdcbf39bc038f6ce5a41c3e60a8fc6"}, + {file = "biopython-1.81-cp37-cp37m-win32.whl", hash = "sha256:35506e39822c52d11cf09a3951e82375ca1bb9303960b4286acf02c9a6f6c4cc"}, + {file = "biopython-1.81-cp37-cp37m-win_amd64.whl", hash = "sha256:793c42a376cd63f62f8a088ce39b7dc6b5c55e4e9031d887c434de1595bfa4b8"}, + {file = "biopython-1.81-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:11d673698b3d0d6589292ea951fb62cb24ea27d273eca0d08dbbd956690f97f5"}, + {file = "biopython-1.81-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:655df416936662c0c8a06a549cb25e1560e1fea5067d850f34fb714b8a3fae6c"}, + {file = "biopython-1.81-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:762c6c43a8486b5fcd07f136a3217b87d24755618b9ea9da1f17124ff44c2ad6"}, + {file = "biopython-1.81-cp38-cp38-win32.whl", hash = "sha256:ee51bb1cd7decffd24da6b76d5e01b7e2fd818ab85cf0c180226cbb5793a3abd"}, + {file = "biopython-1.81-cp38-cp38-win_amd64.whl", hash = "sha256:ccd729249fd5f586dd4c2a3507c2ea2456825d7e615e97c07c409c850eaf4594"}, + {file = "biopython-1.81-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9ba33244f0eff830beaa7240065bdb5095d96fded6599b76bbb9ddab45cd2bbd"}, + {file = "biopython-1.81-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bb0c690c7368f255ed45236bf0f5464b476b8c083c8f634533921af78278261"}, + {file = "biopython-1.81-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65b93b513ce9dd7b2ce058720eadf42cd03f312db3409356efeb93123d1320aa"}, + {file = "biopython-1.81-cp39-cp39-win32.whl", hash = "sha256:811796f8d222aa3869a50e31e54ce62b69106b47cd8bb06934867c0d843297b5"}, + {file = "biopython-1.81-cp39-cp39-win_amd64.whl", hash = "sha256:b09efcb4733c8770f25eab5fe555a96a08f5ab9e1bc36939e08ebf2ffbf3e0f1"}, + {file = "biopython-1.81.tar.gz", hash = "sha256:2cf38112b6d8415ad39d6a611988cd11fb5f33eb09346666a87263beba9614e0"}, +] +black = [ + {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, + {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, + {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, 
+ {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, + {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, + {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, + {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, + {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, + {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, + {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, + {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, + {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, + {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, + {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, + {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, +] +click = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] +colorama = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] +coverage = [ + {file = "coverage-7.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:e58c0d41d336569d63d1b113bd573db8363bc4146f39444125b7f8060e4e04f5"}, + {file = "coverage-7.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:344e714bd0fe921fc72d97404ebbdbf9127bac0ca1ff66d7b79efc143cf7c0c4"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:974bc90d6f6c1e59ceb1516ab00cf1cdfbb2e555795d49fa9571d611f449bcb2"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0743b0035d4b0e32bc1df5de70fba3059662ace5b9a2a86a9f894cfe66569013"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d0391fb4cfc171ce40437f67eb050a340fdbd0f9f49d6353a387f1b7f9dd4fa"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a42e1eff0ca9a7cb7dc9ecda41dfc7cbc17cb1d02117214be0561bd1134772b"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:be19931a8dcbe6ab464f3339966856996b12a00f9fe53f346ab3be872d03e257"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:72fcae5bcac3333a4cf3b8f34eec99cea1187acd55af723bcbd559adfdcb5535"}, + {file = "coverage-7.2.3-cp310-cp310-win32.whl", hash = "sha256:aeae2aa38395b18106e552833f2a50c27ea0000122bde421c31d11ed7e6f9c91"}, + {file = "coverage-7.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:83957d349838a636e768251c7e9979e899a569794b44c3728eaebd11d848e58e"}, + {file = "coverage-7.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:dfd393094cd82ceb9b40df4c77976015a314b267d498268a076e940fe7be6b79"}, + {file = "coverage-7.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:182eb9ac3f2b4874a1f41b78b87db20b66da6b9cdc32737fbbf4fea0c35b23fc"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bb1e77a9a311346294621be905ea8a2c30d3ad371fc15bb72e98bfcfae532df"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca0f34363e2634deffd390a0fef1aa99168ae9ed2af01af4a1f5865e362f8623"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55416d7385774285b6e2a5feca0af9652f7f444a4fa3d29d8ab052fafef9d00d"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:06ddd9c0249a0546997fdda5a30fbcb40f23926df0a874a60a8a185bc3a87d93"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:fff5aaa6becf2c6a1699ae6a39e2e6fb0672c2d42eca8eb0cafa91cf2e9bd312"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ea53151d87c52e98133eb8ac78f1206498c015849662ca8dc246255265d9c3c4"}, + {file = "coverage-7.2.3-cp311-cp311-win32.whl", hash = "sha256:8f6c930fd70d91ddee53194e93029e3ef2aabe26725aa3c2753df057e296b925"}, + {file = "coverage-7.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:fa546d66639d69aa967bf08156eb8c9d0cd6f6de84be9e8c9819f52ad499c910"}, + {file = "coverage-7.2.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b2317d5ed777bf5a033e83d4f1389fd4ef045763141d8f10eb09a7035cee774c"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be9824c1c874b73b96288c6d3de793bf7f3a597770205068c6163ea1f326e8b9"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:2c3b2803e730dc2797a017335827e9da6da0e84c745ce0f552e66400abdfb9a1"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f69770f5ca1994cb32c38965e95f57504d3aea96b6c024624fdd5bb1aa494a1"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1127b16220f7bfb3f1049ed4a62d26d81970a723544e8252db0efde853268e21"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:aa784405f0c640940595fa0f14064d8e84aff0b0f762fa18393e2760a2cf5841"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3146b8e16fa60427e03884301bf8209221f5761ac754ee6b267642a2fd354c48"}, + {file = "coverage-7.2.3-cp37-cp37m-win32.whl", hash = "sha256:1fd78b911aea9cec3b7e1e2622c8018d51c0d2bbcf8faaf53c2497eb114911c1"}, + {file = "coverage-7.2.3-cp37-cp37m-win_amd64.whl", hash = "sha256:0f3736a5d34e091b0a611964c6262fd68ca4363df56185902528f0b75dbb9c1f"}, + {file = "coverage-7.2.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:981b4df72c93e3bc04478153df516d385317628bd9c10be699c93c26ddcca8ab"}, + {file = "coverage-7.2.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c0045f8f23a5fb30b2eb3b8a83664d8dc4fb58faddf8155d7109166adb9f2040"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f760073fcf8f3d6933178d67754f4f2d4e924e321f4bb0dcef0424ca0215eba1"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c86bd45d1659b1ae3d0ba1909326b03598affbc9ed71520e0ff8c31a993ad911"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:172db976ae6327ed4728e2507daf8a4de73c7cc89796483e0a9198fd2e47b462"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:d2a3a6146fe9319926e1d477842ca2a63fe99af5ae690b1f5c11e6af074a6b5c"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f649dd53833b495c3ebd04d6eec58479454a1784987af8afb77540d6c1767abd"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7c4ed4e9f3b123aa403ab424430b426a1992e6f4c8fd3cb56ea520446e04d152"}, + {file = "coverage-7.2.3-cp38-cp38-win32.whl", hash = "sha256:eb0edc3ce9760d2f21637766c3aa04822030e7451981ce569a1b3456b7053f22"}, + {file = "coverage-7.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:63cdeaac4ae85a179a8d6bc09b77b564c096250d759eed343a89d91bce8b6367"}, + {file = "coverage-7.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:20d1a2a76bb4eb00e4d36b9699f9b7aba93271c9c29220ad4c6a9581a0320235"}, + {file = "coverage-7.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ea748802cc0de4de92ef8244dd84ffd793bd2e7be784cd8394d557a3c751e21"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21b154aba06df42e4b96fc915512ab39595105f6c483991287021ed95776d934"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd214917cabdd6f673a29d708574e9fbdb892cb77eb426d0eae3490d95ca7859"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2e58e45fe53fab81f85474e5d4d226eeab0f27b45aa062856c89389da2f0d9"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:87ecc7c9a1a9f912e306997ffee020297ccb5ea388421fe62a2a02747e4d5539"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:387065e420aed3c71b61af7e82c7b6bc1c592f7e3c7a66e9f78dd178699da4fe"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ea3f5bc91d7d457da7d48c7a732beaf79d0c8131df3ab278e6bba6297e23c6c4"}, + {file = "coverage-7.2.3-cp39-cp39-win32.whl", hash = "sha256:ae7863a1d8db6a014b6f2ff9c1582ab1aad55a6d25bac19710a8df68921b6e30"}, + {file = "coverage-7.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:3f04becd4fcda03c0160d0da9c8f0c246bc78f2f7af0feea1ec0930e7c93fa4a"}, + {file = "coverage-7.2.3-pp37.pp38.pp39-none-any.whl", hash = "sha256:965ee3e782c7892befc25575fa171b521d33798132692df428a09efacaffe8d0"}, + {file = "coverage-7.2.3.tar.gz", hash = "sha256:d298c2815fa4891edd9abe5ad6e6cb4207104c7dd9fd13aea3fdebf6f9b91259"}, +] +exceptiongroup = [ + {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, + {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, +] +gffutils = [ + {file = "gffutils-0.11.1.tar.gz", hash = "sha256:ca7bf814d600b389bb2d5c403dd279755118cb1476c19c6f7aecb8c51a84263c"}, +] +iniconfig = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] +intervaltree = [ + {file = "intervaltree-3.1.0.tar.gz", hash = "sha256:902b1b88936918f9b2a19e0e5eb7ccb430ae45cde4f39ea4b36932920d33952d"}, +] +mypy-extensions = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] +numpy = [ + {file = "numpy-1.24.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d"}, + {file = "numpy-1.24.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5"}, + {file = "numpy-1.24.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253"}, + {file = "numpy-1.24.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978"}, + {file = "numpy-1.24.2-cp310-cp310-win32.whl", hash = "sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9"}, + {file = "numpy-1.24.2-cp310-cp310-win_amd64.whl", hash = "sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0"}, + {file = "numpy-1.24.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a"}, + {file = "numpy-1.24.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0"}, + {file = "numpy-1.24.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281"}, + {file = "numpy-1.24.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910"}, + {file = 
"numpy-1.24.2-cp311-cp311-win32.whl", hash = "sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95"}, + {file = "numpy-1.24.2-cp311-cp311-win_amd64.whl", hash = "sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04"}, + {file = "numpy-1.24.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2"}, + {file = "numpy-1.24.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5"}, + {file = "numpy-1.24.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a"}, + {file = "numpy-1.24.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96"}, + {file = "numpy-1.24.2-cp38-cp38-win32.whl", hash = "sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d"}, + {file = "numpy-1.24.2-cp38-cp38-win_amd64.whl", hash = "sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756"}, + {file = "numpy-1.24.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a"}, + {file = "numpy-1.24.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f"}, + {file = "numpy-1.24.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb"}, + {file = "numpy-1.24.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780"}, + {file = "numpy-1.24.2-cp39-cp39-win32.whl", hash = "sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468"}, + {file = "numpy-1.24.2-cp39-cp39-win_amd64.whl", hash = "sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5"}, + {file = "numpy-1.24.2-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d"}, + {file = "numpy-1.24.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa"}, + {file = "numpy-1.24.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f"}, + {file = "numpy-1.24.2.tar.gz", hash = "sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22"}, +] +nvidia-cublas-cu11 = [ + {file = "nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl", hash = "sha256:d32e4d75f94ddfb93ea0a5dda08389bcc65d8916a25cb9f37ac89edaeed3bded"}, + {file = "nvidia_cublas_cu11-11.10.3.66-py3-none-win_amd64.whl", hash = "sha256:8ac17ba6ade3ed56ab898a036f9ae0756f1e81052a317bf98f8c6d18dc3ae49e"}, +] +nvidia-cuda-nvrtc-cu11 = [ + {file = "nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl", hash = "sha256:9f1562822ea264b7e34ed5930567e89242d266448e936b85bc97a3370feabb03"}, + {file = "nvidia_cuda_nvrtc_cu11-11.7.99-py3-none-manylinux1_x86_64.whl", hash = "sha256:f7d9610d9b7c331fa0da2d1b2858a4a8315e6d49765091d28711c8946e7425e7"}, + {file = "nvidia_cuda_nvrtc_cu11-11.7.99-py3-none-win_amd64.whl", hash = "sha256:f2effeb1309bdd1b3854fc9b17eaf997808f8b25968ce0c7070945c4265d64a3"}, +] +nvidia-cuda-runtime-cu11 = [ + {file = "nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl", hash = 
"sha256:cc768314ae58d2641f07eac350f40f99dcb35719c4faff4bc458a7cd2b119e31"}, + {file = "nvidia_cuda_runtime_cu11-11.7.99-py3-none-win_amd64.whl", hash = "sha256:bc77fa59a7679310df9d5c70ab13c4e34c64ae2124dd1efd7e5474b71be125c7"}, +] +nvidia-cudnn-cu11 = [ + {file = "nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl", hash = "sha256:402f40adfc6f418f9dae9ab402e773cfed9beae52333f6d86ae3107a1b9527e7"}, + {file = "nvidia_cudnn_cu11-8.5.0.96-py3-none-manylinux1_x86_64.whl", hash = "sha256:71f8111eb830879ff2836db3cccf03bbd735df9b0d17cd93761732ac50a8a108"}, +] +packaging = [ + {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, + {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, +] +pandas = [ + {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406"}, + {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572"}, + {file = "pandas-1.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996"}, + {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354"}, + {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23"}, + {file = "pandas-1.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328"}, + {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc"}, + {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d"}, + {file = "pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc"}, + {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae"}, + {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6"}, + {file = "pandas-1.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003"}, + {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813"}, + {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31"}, + {file = "pandas-1.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792"}, + {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7"}, + {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf"}, + {file = "pandas-1.5.3-cp38-cp38-win32.whl", hash = "sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51"}, + 
{file = "pandas-1.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373"}, + {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa"}, + {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee"}, + {file = "pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a"}, + {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0"}, + {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5"}, + {file = "pandas-1.5.3-cp39-cp39-win32.whl", hash = "sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a"}, + {file = "pandas-1.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9"}, + {file = "pandas-1.5.3.tar.gz", hash = "sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1"}, +] +pathspec = [ + {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, + {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, +] +platformdirs = [ + {file = "platformdirs-3.3.0-py3-none-any.whl", hash = "sha256:ea61fd7b85554beecbbd3e9b37fb26689b227ffae38f73353cbcc1cf8bd01878"}, + {file = "platformdirs-3.3.0.tar.gz", hash = "sha256:64370d47dc3fca65b4879f89bdead8197e93e05d696d6d1816243ebae8595da5"}, +] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] +pyfaidx = [ + {file = "pyfaidx-0.7.2.1-py3-none-any.whl", hash = "sha256:eee13d35bb5f2aa65932a9ad9dd74fa695aefe6e0baafc5836cfa869a7695acc"}, + {file = "pyfaidx-0.7.2.1.tar.gz", hash = "sha256:30f0d20a9e3d53353fb20eb69b7e22e6f01a53ed4f21b3e17dd408f0be5051a0"}, +] +pyfastx = [ + {file = "pyfastx-0.8.4-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:bb67f42735e72d8b14b28590fcb0ced1a98cd9005413a0898c7cfa6c5bc1a5c6"}, + {file = "pyfastx-0.8.4-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:85c0edb900848de5a25a0fefab30af314abb8a7b05900173a96508bfd3571ddb"}, + {file = "pyfastx-0.8.4-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:3b4ae3809b71b72d59347830fa87412cf96c2f0b837a44cca1bb91d51b7eebf5"}, + {file = "pyfastx-0.8.4-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:dc4207c19c3dc3ec4ce76661904f31a78ec99756b3ef1b656f5712b0ec7432b9"}, + {file = "pyfastx-0.8.4-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:6b3c71b8acd850fd4b662728cb356d92b4789672efe2f5d2791becbc4705d27d"}, + {file = "pyfastx-0.8.4-cp35-cp35m-win32.whl", hash = "sha256:82fb7340f628cffc13751cb8ddf4539d96b1542e9b9dd21fc368b8f3b35024b7"}, + {file = "pyfastx-0.8.4-cp35-cp35m-win_amd64.whl", hash = "sha256:6bf0dfc3fd1e7af8ef0777dbd20635e6e35db832af5c461bb0e676ff02e7552f"}, + {file = "pyfastx-0.8.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:53a1b2d91c2a9c364579aff6305b413b6a9eac1cf739ffaa6182d0bdf6ec3dd6"}, + {file = "pyfastx-0.8.4-cp36-cp36m-manylinux1_i686.whl", hash = 
"sha256:5966ea5788eac13a41816eb91348c26ac83e0dde677e0a1a875b24e73f88363e"}, + {file = "pyfastx-0.8.4-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:1857354f75a5c4dbff2cdf2b885c5eb58221dc72e539320ff75814d591aa0e8b"}, + {file = "pyfastx-0.8.4-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:a12a8046e8585e8159403a95e110cec7020a79a6ceeffe6c1ad814b9365df6b9"}, + {file = "pyfastx-0.8.4-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:ca06602d642d6fce692d9cf76003a2010d0791ca9a4f1c4ad26226a378866787"}, + {file = "pyfastx-0.8.4-cp36-cp36m-win32.whl", hash = "sha256:6364813d2d32bb52c2e8967a3fbeeb063e562942a47ee7966c5d0555aa39245b"}, + {file = "pyfastx-0.8.4-cp36-cp36m-win_amd64.whl", hash = "sha256:45839975776f9217f66925cfb319d6780e1cdbb1bc91c23fff4e43411f1623a9"}, + {file = "pyfastx-0.8.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8d4085c767550ba98e4b93cb1c7fb10fa4948347ae69460fdca22d65223755d0"}, + {file = "pyfastx-0.8.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:4f7d4c41cf5597622de4e6e28b7dd242f6ed14a017134076264c2461ef0952ec"}, + {file = "pyfastx-0.8.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:ca57156dd6bb8dc03d91ba6bb71513b3cc197d15dcc626b085f384f90ef73f19"}, + {file = "pyfastx-0.8.4-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:5983dd1153485cbd0803b0c1cd2240c9156a5827f26d62d7cb34c74d269885d6"}, + {file = "pyfastx-0.8.4-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:04ac4f5fc5c514e75fdbdaf783df770187221574277a38f5f27aa4d4ee849fd0"}, + {file = "pyfastx-0.8.4-cp37-cp37m-win32.whl", hash = "sha256:e3552a6a80d91355b29fbe562126270210587899a6514374eda4336967a4586e"}, + {file = "pyfastx-0.8.4-cp37-cp37m-win_amd64.whl", hash = "sha256:3fcfde0890142c8f73872f6239bfab5f15d89276a2bb09ef18dabca0555c8038"}, + {file = "pyfastx-0.8.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bebc53c878fa6d2a15d5dd574e760c1ebc93e40c46d27872e2d5f0fb28b63dd2"}, + {file = "pyfastx-0.8.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:c1693f7448e3e0dbd4fa60e7bea80720b953044d29b3fc191a4a3534d194ff29"}, + {file = "pyfastx-0.8.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:ab071008c3a0d5a721891a2510067b475699dfd617446d0ac97a588a3a2d7471"}, + {file = "pyfastx-0.8.4-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:88eaef44817d0932e1abd32c63015cb9a7400a1be783e72b0f9746926d8f3cfb"}, + {file = "pyfastx-0.8.4-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:bd4bfbd9d1a7e796e0e2f558beddfa04f141467a82df76861b7c64d5750d62ac"}, + {file = "pyfastx-0.8.4-cp38-cp38-win32.whl", hash = "sha256:6dbde773e8f1bb7a1a7def1f505519c577edce631e59a00cbf4080c3d80ae9a7"}, + {file = "pyfastx-0.8.4-cp38-cp38-win_amd64.whl", hash = "sha256:cc06070a09a5d55c8a15ab23ec5c119dc25ed3c4fd9956da36c0d252e8746290"}, + {file = "pyfastx-0.8.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0760041447904aefd647df8ce2f4d008059324e6aa275b703c6c2ac1334b40cf"}, + {file = "pyfastx-0.8.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:51e9fffa705992a4ac558cc399405e003b5b87cbdbe06f366ad4891511777dac"}, + {file = "pyfastx-0.8.4-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:ca38b8c0f97dde4ebf8f221c52da13b4559722e32f209a886f4061597a5df21d"}, + {file = "pyfastx-0.8.4-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:4b0718ea0b64e76588cb6e8a2427559db78cef50ead70097a3cd8994571f4f7d"}, + {file = "pyfastx-0.8.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:de6027a1c04c9cf1cab90167fc1f20097b4503485e6a142fa0c473aa9d00d45a"}, + {file = "pyfastx-0.8.4-cp39-cp39-win_amd64.whl", hash = 
"sha256:1563bc974bce54f586689df185aae03aef72ae5ed79ded9340254ea94507d764"}, + {file = "pyfastx-0.8.4.tar.gz", hash = "sha256:20cee9faff140f973c59fbe98121eac2d67acf3eb7fef5fdf69a8b4942b4468c"}, +] +pysam = [ + {file = "pysam-0.20.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d919f40db3027f092bb39177aecbb49a02e2fd746bb5adfbe48eb839b2225e51"}, + {file = "pysam-0.20.0-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:7145694675a0cfe0c04abb5582c70b3f6a19d6b30e6835931016afd57d423719"}, + {file = "pysam-0.20.0-cp310-cp310-manylinux_2_24_i686.whl", hash = "sha256:fa98bd2e6bf1252dac7c275fe7c34bbc125644b781a6196bfe25cc078c6cb341"}, + {file = "pysam-0.20.0-cp310-cp310-manylinux_2_24_x86_64.whl", hash = "sha256:12c56353739f2b76266407502e06127235197030a8e11188cb80693ca46321d1"}, + {file = "pysam-0.20.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d4744e162476a62fab9458aa3d1e2e51614e0f919e4578c14e986c7e7cab377e"}, + {file = "pysam-0.20.0-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:e61c3a68fb254ffd2c34ce956277615663c5ecab7a30e6308744873984794330"}, + {file = "pysam-0.20.0-cp36-cp36m-manylinux_2_24_i686.whl", hash = "sha256:ff15d6a6ac29541d5dee30ea8233356c43a5f3a99886451fd0188b80daa0422d"}, + {file = "pysam-0.20.0-cp36-cp36m-manylinux_2_24_x86_64.whl", hash = "sha256:29d1a2c60944f1bc46b9324f9e78dd343fd6a8de039badff71df43df960e223c"}, + {file = "pysam-0.20.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7e017a3b8baeee7c0df7f666138e8bf5a73f9805c2ab6287a2a4d5351f6822d5"}, + {file = "pysam-0.20.0-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:7c57867c80af3b5c3a4ae391ec86b914bb6361bb5cd41d985cbe06a75163188c"}, + {file = "pysam-0.20.0-cp37-cp37m-manylinux_2_24_i686.whl", hash = "sha256:38f292478f32cbf6f2981021f7c2a961956e9b49141493557ace79810221c4e4"}, + {file = "pysam-0.20.0-cp37-cp37m-manylinux_2_24_x86_64.whl", hash = "sha256:3d8c86ae4413c25d047aa4e9529b2adc366ecfeb1eb3f0098c525705314a0332"}, + {file = "pysam-0.20.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3b8a07732549ff10dfad8b0db7663076b39727a558e1f6d06ab5c6819f3cae9f"}, + {file = "pysam-0.20.0-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:d958ce70865869f2aa8c8c0880ad451bafd4d5a8c94fb78a269ab913a57d9303"}, + {file = "pysam-0.20.0-cp38-cp38-manylinux_2_24_i686.whl", hash = "sha256:93f0ea6b2050ad470b5b1cdd19fae0b88afd5ae48ee6e66a0dcd054b61e9fba4"}, + {file = "pysam-0.20.0-cp38-cp38-manylinux_2_24_x86_64.whl", hash = "sha256:e14e33703bdb8ed812ab16b5c816ce68ffee2ae2a19906efdc5732c3e446791e"}, + {file = "pysam-0.20.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f5aefffd4ac1fad35b720cb7a1663be9bdb18376d0d361d33a744254a0da8e96"}, + {file = "pysam-0.20.0-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:3ed4dbf8613007daf8b127c32743206126badc35822730de767d86f9ead445ad"}, + {file = "pysam-0.20.0-cp39-cp39-manylinux_2_24_i686.whl", hash = "sha256:7ca81b7e5af5f3cdac460e0ce59a0eab11fdbe1a216ddee6f3172aa16445ae54"}, + {file = "pysam-0.20.0-cp39-cp39-manylinux_2_24_x86_64.whl", hash = "sha256:a2d8f2e15934100ce6b380659af884066d5ebffa69e36025b4029f8c9e8b3adc"}, + {file = "pysam-0.20.0.tar.gz", hash = "sha256:7cc250148ba0ffc9bdc38db6988b91e13b75db0d11c18cf1336467d1c97dd312"}, +] +pytest = [ + {file = "pytest-7.2.2-py3-none-any.whl", hash = "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e"}, + {file = "pytest-7.2.2.tar.gz", hash = "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = 
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] +pytz = [ + {file = "pytz-2022.7.1-py2.py3-none-any.whl", hash = "sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a"}, + {file = "pytz-2022.7.1.tar.gz", hash = "sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0"}, +] +setuptools = [ + {file = "setuptools-67.6.0-py3-none-any.whl", hash = "sha256:b78aaa36f6b90a074c1fa651168723acbf45d14cb1196b6f02c0fd07f17623b2"}, + {file = "setuptools-67.6.0.tar.gz", hash = "sha256:2ee892cd5f29f3373097f5a814697e397cf3ce313616df0af11231e2ad118077"}, +] +simplejson = [ + {file = "simplejson-3.18.3-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:50f4b6d52f3a2d1cffd11834a1fe7f9516f0e3f20cbe78027aa88ff990fad7d6"}, + {file = "simplejson-3.18.3-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:169c2c7446ef33439c304a6aa5b7b5a2dbc938c9c2dd882dd3f2553f9518ebf6"}, + {file = "simplejson-3.18.3-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:56f186d44a9f625b5e5d9ba4b9551e263604000a7df60cb373b3e789ca603b2a"}, + {file = "simplejson-3.18.3-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:cf7168b2046db0eceb83d8ed2ee31c0847ce18b2d8baf3e93de9560f3921a8c3"}, + {file = "simplejson-3.18.3-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:55df3dfd8777bf134e1078d2f195352432a77f23ccb90b92b08218123d56adc9"}, + {file = "simplejson-3.18.3-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:6b997739fdbc9b7030ff490fc8e5f8c144b8ec80f3605eff643983672bb8cfde"}, + {file = "simplejson-3.18.3-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:c98fddc374468158778a8afb3fd7296412a2b2fc34cebba64212ac3e018e7382"}, + {file = "simplejson-3.18.3-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:55aa983575b0aef143845f5bfbb35075475eccaebf7d4b30f4037a2fe8414666"}, + {file = "simplejson-3.18.3-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:1b79e2607ac5ba98381c2e068727acc1e4dd385a6d216914c0613f8f568a06a5"}, + {file = "simplejson-3.18.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b0352428b35da859a98770949e7353866ae65463026f1c8e4c89a6395d4b5fd7"}, + {file = "simplejson-3.18.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eb81cfef0c0039010f0212f4e5eb6909641b8a54c761584054ac97fd7bd0c21a"}, + {file = "simplejson-3.18.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e762e9d8556fa9f3a99f8a278eeba50a35b5f554b82deeb282ddbdd85816e638"}, + {file = "simplejson-3.18.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc8df5831b645e96a318ea51a66ce6e2bb869eebc3fa9a860bbf67aecd270055"}, + {file = "simplejson-3.18.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:14b35fb90083218e59df5dba733c7086655f2938f3fcabe36ad849623941d660"}, + {file = "simplejson-3.18.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f89f078114cacedb9a3392615cc099cf02a51efa7507f90e2006bf7ec38c880d"}, + {file = "simplejson-3.18.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a2960b95f3ba822d077d1afa7e1fea9799cfb2990028cf010e666f64195ecb5a"}, + {file = "simplejson-3.18.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:96ade36640734b54176c4765d00a60767bd7fae5b7a5b3574accc055ac18e34c"}, + {file = "simplejson-3.18.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:6c4c56c5abb82e22877b913186e5c0fd7d9eef0c930719e28fa451d3f11defb4"}, + {file = "simplejson-3.18.3-cp310-cp310-win32.whl", hash = "sha256:8209c40279ed9b2cd5fbe2d617a29a074e90ea97fce7c07a0128a01cb3e8afc5"}, + {file = "simplejson-3.18.3-cp310-cp310-win_amd64.whl", hash = "sha256:6a49665169c18f27a0fc10935466332ee7406ee14ced8dc0a1b4d465547299aa"}, + {file = "simplejson-3.18.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:508342d7227ed66beecfbba7a38b46e1a713faeb034216f43f03ec5c175e0622"}, + {file = "simplejson-3.18.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:070ab073ce72f1624107dfd6d095c87ac32aafe7ba54a5c5055a3dd83cb06e51"}, + {file = "simplejson-3.18.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:88f59a07873dc1f06fd9e6712dd71286f1b297a066ad2fd9110ad080d3cb011c"}, + {file = "simplejson-3.18.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5780e3929435a8d39671537174f8ce0ccafb4f6e0c748ffe139916ffbdca39d3"}, + {file = "simplejson-3.18.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2be75f4cb9951efeb2616e16f944ee4f9a09768475a3f5c40a6ac4dc5ee68dfd"}, + {file = "simplejson-3.18.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e49c84df6e71e3c23169d3df481565dd607cbee4aa1e0af15c493cccad7c745"}, + {file = "simplejson-3.18.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ab5bdf0b8d07f7fd603b2d0c1982412cd9f8ade997088ddced251f7e656c7fd4"}, + {file = "simplejson-3.18.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:141782a0a25c1792627575b37b4951583358ccc7137623aa45947f8425ee8d96"}, + {file = "simplejson-3.18.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:24823364fee93bab141621b3a2e10612e31be7ca58788bf9b2cd2b1ce37ab07d"}, + {file = "simplejson-3.18.3-cp311-cp311-win32.whl", hash = "sha256:f73bae5e315adf7bc8cb7f0a13a1e9e33bead42e8ce174be83ac9ecc2513c86a"}, + {file = "simplejson-3.18.3-cp311-cp311-win_amd64.whl", hash = "sha256:063db62a9251e61ea0c17e49c3e7bed465bfcc5359655abcb8c0bc6130a4e0d4"}, + {file = "simplejson-3.18.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:3bab9ea49ff477c926c5787f79ec47cf51c7ffb15c9d8dd0f09e728807d44f4b"}, + {file = "simplejson-3.18.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cf299fbb7d476676dfea372a3262654af98694bd1df35b060ce0fe1b68087f1"}, + {file = "simplejson-3.18.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62628ea5df8c830d00a7417d5ecd949a1b24a8d0a5063a2a77f7ec7522110a0f"}, + {file = "simplejson-3.18.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ff65b475091084e5bdb7f26e9c555956be7355b573ce494fa96f9f8e34541ac"}, + {file = "simplejson-3.18.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:2b0f6de11f5ce4b80f51bc49d08b898602e190547f8efe4e44af8ae3cda7779d"}, + {file = "simplejson-3.18.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:d990ea42ba908cb57a3df97d283aa26c1822f10a0a60e250b54ee21cd08c48d0"}, + {file = "simplejson-3.18.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:2c7ee643ee93684bf76196e2d84a2090c6df8f01737a016e869b579593827b6e"}, + {file = "simplejson-3.18.3-cp36-cp36m-win32.whl", hash = "sha256:0e7c3fae6c9540064e06a653780b4f263675cd69ca6841345029fee3e27e9bb5"}, + {file = "simplejson-3.18.3-cp36-cp36m-win_amd64.whl", hash = 
"sha256:0baf8c60efef74944ed4adb034d14bcf737731576f0e4c3c56fb875ea256af69"}, + {file = "simplejson-3.18.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:52465a5578cfc2c5e374a574df14dfb75e04c6cb6a100b7abc8bf6c89bea8f5e"}, + {file = "simplejson-3.18.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fe1173b4146641c872bafa6f9a21f3a2012f502d54fbb523a76e6320024fae9"}, + {file = "simplejson-3.18.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23fce984045804194f513a2739dcd82be350198470d5ade5058da019a48cf3f8"}, + {file = "simplejson-3.18.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad323e92cb1bd3b1db6f57c007dca964d13c52247ad844203ce381e94066601"}, + {file = "simplejson-3.18.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7c26fe63755ecc59c502ddde8e58ce8b765bf4fdd3f5858d2b7c8ab28bc2a9c8"}, + {file = "simplejson-3.18.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:502d86fbfe914263642479b87ed61af3b27b9e039df77acd2416cfccfc892e68"}, + {file = "simplejson-3.18.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:44d6c52d4f5c0c087a6e88a92bf9f94234321d21be32c6471ba39856e304bbe3"}, + {file = "simplejson-3.18.3-cp37-cp37m-win32.whl", hash = "sha256:2a1b3222bc8f6ac91b5ebe3263111c7dc4dc4b01c52f0153f5bb1f3ef3bf0023"}, + {file = "simplejson-3.18.3-cp37-cp37m-win_amd64.whl", hash = "sha256:1907d49d70c75530976119c13785db91168d2599288debaca7d25da9cd2f3747"}, + {file = "simplejson-3.18.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:04a4b9a297cccbc9e1d66fe652fbffd55b36d6579c43132e821d315957302194"}, + {file = "simplejson-3.18.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:16cc750d19852fa5ebafd55da86fa357f87991e07b4e2afb37a5975dfdde0153"}, + {file = "simplejson-3.18.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:290bbcdcbb37af3f7e43378f592ab7a9168fca640da6af63d42cdb535f96bbf2"}, + {file = "simplejson-3.18.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:694332fd6fd10fe8868c2508583220d1a1a7be9ff049dab5bd6b9aedfb9edc50"}, + {file = "simplejson-3.18.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f9f72d2b539512f382a48cc9ad6cea2d3a572e71e92c40e03d2140041eeaa233"}, + {file = "simplejson-3.18.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcd9eac304a133ee4af58e68c5ded4c5ba663d3ee4602e8613359b776a1f8c8f"}, + {file = "simplejson-3.18.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:cde5a3ff5e0bd5d6da676314dfae86c9e99bff77bca03d30223c9718a58f9e83"}, + {file = "simplejson-3.18.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:099bbd3b5b4ea83159a980348cd481a34984dee5fe1b9fac31a9137158f46960"}, + {file = "simplejson-3.18.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4de9fed1166aeedee44150fa83bc059aca6b612940281f8b5a39374781f16196"}, + {file = "simplejson-3.18.3-cp38-cp38-win32.whl", hash = "sha256:59a629240cfbc5b4f390a8578dca74ae77ab617de971862acb946822d2eb1b11"}, + {file = "simplejson-3.18.3-cp38-cp38-win_amd64.whl", hash = "sha256:5b009342e712026ffabe8a471d5b4a4ff2a038687387e74eae601574c04dae33"}, + {file = "simplejson-3.18.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6bd81d10cb3384f64242316da8a2b2f88618776bc1ef38bcc79f1afe8ad36616"}, + {file = "simplejson-3.18.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:c3b696770b504f881f271f97b94a687487ec1ef20bfbd5f20d92bbab7a85952d"}, + {file = "simplejson-3.18.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:75eb555dc349d0cbe2c95ea2be665b306c6ac6d5b64e3a3920af9b805ecdb5f7"}, + {file = "simplejson-3.18.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d774782159347d66563cd7ac18b9dd37010438a825160cde4818caa18110a746"}, + {file = "simplejson-3.18.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b2c4e8b65987f3c6529149495d28e23efe213e94dc3659176c4ab22d18a9ee4a"}, + {file = "simplejson-3.18.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8a4750e8db92109e6f1f7783a7faae4254d6d5dc28a41ff7eff7d2265f0586b"}, + {file = "simplejson-3.18.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4b8d4d958c5ab3489d1174917a7fad82da642560c39ce559a624e63deaaa36b1"}, + {file = "simplejson-3.18.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:efa70fd9b6c7b57b048ecadb909683acd535cddebc5b22f3c05ba3b369739caf"}, + {file = "simplejson-3.18.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7e73d9d6af3c29b60a92e28b3144d951110f59a3d05fc402c3f6c5248b883400"}, + {file = "simplejson-3.18.3-cp39-cp39-win32.whl", hash = "sha256:a80bd9a3db88a76a401155c64e3499376c702307c2206cb381cc2a8dd9cc4f1f"}, + {file = "simplejson-3.18.3-cp39-cp39-win_amd64.whl", hash = "sha256:c4514675f6571da8190fea52a110bca686fa844972e8b2b3bc07ace9e632ee4f"}, + {file = "simplejson-3.18.3-py3-none-any.whl", hash = "sha256:37bdef13412c0bc338db2993a38f3911d5bd2a0ba8d00b3bc66d1063edd7c33e"}, + {file = "simplejson-3.18.3.tar.gz", hash = "sha256:ebb53837c5ffcb6100646018565d3f1afed6f4b185b14b2c9cbccf874fe40157"}, +] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +sortedcontainers = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] +torch = [ + {file = "torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:fd12043868a34a8da7d490bf6db66991108b00ffbeecb034228bfcbbd4197143"}, + {file = "torch-1.13.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:d9fe785d375f2e26a5d5eba5de91f89e6a3be5d11efb497e76705fdf93fa3c2e"}, + {file = "torch-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:98124598cdff4c287dbf50f53fb455f0c1e3a88022b39648102957f3445e9b76"}, + {file = "torch-1.13.1-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:393a6273c832e047581063fb74335ff50b4c566217019cc6ace318cd79eb0566"}, + {file = "torch-1.13.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:0122806b111b949d21fa1a5f9764d1fd2fcc4a47cb7f8ff914204fd4fc752ed5"}, + {file = "torch-1.13.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:22128502fd8f5b25ac1cd849ecb64a418382ae81dd4ce2b5cebaa09ab15b0d9b"}, + {file = "torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl", hash = 
"sha256:76024be052b659ac1304ab8475ab03ea0a12124c3e7626282c9c86798ac7bc11"}, + {file = "torch-1.13.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:ea8dda84d796094eb8709df0fcd6b56dc20b58fdd6bc4e8d7109930dafc8e419"}, + {file = "torch-1.13.1-cp37-cp37m-win_amd64.whl", hash = "sha256:2ee7b81e9c457252bddd7d3da66fb1f619a5d12c24d7074de91c4ddafb832c93"}, + {file = "torch-1.13.1-cp37-none-macosx_10_9_x86_64.whl", hash = "sha256:0d9b8061048cfb78e675b9d2ea8503bfe30db43d583599ae8626b1263a0c1380"}, + {file = "torch-1.13.1-cp37-none-macosx_11_0_arm64.whl", hash = "sha256:f402ca80b66e9fbd661ed4287d7553f7f3899d9ab54bf5c67faada1555abde28"}, + {file = "torch-1.13.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:727dbf00e2cf858052364c0e2a496684b9cb5aa01dc8a8bc8bbb7c54502bdcdd"}, + {file = "torch-1.13.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:df8434b0695e9ceb8cc70650afc1310d8ba949e6db2a0525ddd9c3b2b181e5fe"}, + {file = "torch-1.13.1-cp38-cp38-win_amd64.whl", hash = "sha256:5e1e722a41f52a3f26f0c4fcec227e02c6c42f7c094f32e49d4beef7d1e213ea"}, + {file = "torch-1.13.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:33e67eea526e0bbb9151263e65417a9ef2d8fa53cbe628e87310060c9dcfa312"}, + {file = "torch-1.13.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:eeeb204d30fd40af6a2d80879b46a7efbe3cf43cdbeb8838dd4f3d126cc90b2b"}, + {file = "torch-1.13.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:50ff5e76d70074f6653d191fe4f6a42fdbe0cf942fbe2a3af0b75eaa414ac038"}, + {file = "torch-1.13.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:2c3581a3fd81eb1f0f22997cddffea569fea53bafa372b2c0471db373b26aafc"}, + {file = "torch-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:0aa46f0ac95050c604bcf9ef71da9f1172e5037fdf2ebe051962d47b123848e7"}, + {file = "torch-1.13.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:6930791efa8757cb6974af73d4996b6b50c592882a324b8fb0589c6a9ba2ddaf"}, + {file = "torch-1.13.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:e0df902a7c7dd6c795698532ee5970ce898672625635d885eade9976e5a04949"}, +] +typing-extensions = [ + {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, + {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, +] +wheel = [ + {file = "wheel-0.38.4-py3-none-any.whl", hash = "sha256:b60533f3f5d530e971d6737ca6d58681ee434818fab630c83a734bb10c083ce8"}, + {file = "wheel-0.38.4.tar.gz", hash = "sha256:965f5259b566725405b05e7cf774052044b1ed30119b5d586b2703aafe8719ac"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d443ffb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,29 @@ +[tool.poetry] +name = "pangolin" +version = "1.3.11" +description = "" +authors = ["Tony Zeng ", "Kevin Kazmierczak "] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.8" +numpy = "^1.24.2" +torch = "^1.13.1" +gffutils = "^0.11.1" +pyfastx = "0.8.4" # This is fixed due to a weird error in the latest version where it complains about missing keys +pandas = "^1.5.3" +biopython = "^1.81" +pysam = "^0.20.0" +intervaltree = "^3.1.0" + +[tool.poetry.group.dev.dependencies] +pytest = "^7.2.2" +black = "^23.3.0" +coverage = "^7.2.3" + +[tool.poetry.scripts] +pangolin = "pangolin.pangolin:main" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/scripts/create_db.py b/scripts/create_db.py index ee89984..d252575 100755 --- a/scripts/create_db.py +++ 
b/scripts/create_db.py @@ -2,8 +2,15 @@ import gffutils parser = argparse.ArgumentParser() -parser.add_argument("annotation_file", help="GTF file containing gene annotations. For example, from https://www.gencodegenes.org/") -parser.add_argument("--filter", default="Ensembl_canonical", help="Only keep GTF features with the specified tags. Format: tag1,tag2,... or None to keep all features. Default: Ensembl_canonical") +parser.add_argument( + "annotation_file", + help="GTF file containing gene annotations. For example, from https://www.gencodegenes.org/", +) +parser.add_argument( + "--filter", + default="Ensembl_canonical", + help="Only keep GTF features with the specified tags. Format: tag1,tag2,... or None to keep all features. Default: Ensembl_canonical", +) args = parser.parse_args() gtf = args.annotation_file @@ -14,20 +21,27 @@ else: exit("ERROR, annotation_file should be a GTF file.") + def filter(feat): - if feat.featuretype not in ["gene","transcript","exon"]: + if feat.featuretype not in ["gene", "transcript", "exon"]: return False - elif args.filter != "None" and feat.featuretype in ["transcript","exon"]: + elif args.filter != "None" and feat.featuretype in ["transcript", "exon"]: present = False - for tag in args.filter.split(','): + for tag in args.filter.split(","): if "tag" in feat.attributes and tag in feat["tag"]: present = True if not present: return False return feat -db = gffutils.create_db(gtf, prefix+".db", force=True, - disable_infer_genes=True, disable_infer_transcripts=True, - transform=filter) + +db = gffutils.create_db( + gtf, + prefix + ".db", + force=True, + disable_infer_genes=True, + disable_infer_transcripts=True, + transform=filter, +) print("Database created: %s.db" % prefix) diff --git a/scripts/custom_usage.py b/scripts/custom_usage.py index 377ad47..735581d 100644 --- a/scripts/custom_usage.py +++ b/scripts/custom_usage.py @@ -15,8 +15,8 @@ # Change this to the desired sequences and strand for each sequence. If the sequence is N bases long, Pangolin will # return scores for the middle N-10000 bases (so if you are interested in the score for a single site, the input should # be: 5000 bases before the site, base at the site, 5000 bases after the site). Sequences < 10001 bases can be padded with 'N'. 
-seqs = [10001*'A'] -strands = ['-'] +seqs = [10001 * "A"] +strands = ["-"] # Load models models = [] @@ -25,32 +25,36 @@ model = Pangolin(L, W, AR) if torch.cuda.is_available(): model.cuda() - weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i))) + weights = torch.load( + resource_filename("pangolin", "models/final.%s.%s.3" % (j, i)) + ) else: - weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)), - map_location=torch.device('cpu')) + weights = torch.load( + resource_filename("pangolin", "models/final.%s.%s.3" % (j, i)), + map_location=torch.device("cpu"), + ) model.load_state_dict(weights) model.eval() models.append(model) # Get scores -IN_MAP = np.asarray([[0, 0, 0, 0], - [1, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 1, 0], - [0, 0, 0, 1]]) -INDEX_MAP = {0:1, 1:2, 2:4, 3:5, 4:7, 5:8, 6:10, 7:11} +IN_MAP = np.asarray( + [[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]] +) +INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11} + def one_hot_encode(seq, strand): - seq = seq.upper().replace('A', '1').replace('C', '2') - seq = seq.replace('G', '3').replace('T', '4').replace('N', '0') - if strand == '+': + seq = seq.upper().replace("A", "1").replace("C", "2") + seq = seq.replace("G", "3").replace("T", "4").replace("N", "0") + if strand == "+": seq = np.asarray(list(map(int, list(seq)))) - elif strand == '-': + elif strand == "-": seq = np.asarray(list(map(int, list(seq[::-1])))) seq = (5 - seq) % 5 # Reverse complement - return IN_MAP[seq.astype('int8')] + return IN_MAP[seq.astype("int8")] + for i, seq in enumerate(seqs): seq = one_hot_encode(seq, strands[i]).T @@ -62,7 +66,7 @@ def one_hot_encode(seq, strand): for j, model_num in enumerate(model_nums): score = [] # Average across 5 models - for model in models[5*j:5*j+5]: + for model in models[5 * j : 5 * j + 5]: with torch.no_grad(): - score.append(model(seq)[0][INDEX_MAP[model_num],:].cpu().numpy()) + score.append(model(seq)[0][INDEX_MAP[model_num], :].cpu().numpy()) print(np.mean(score, axis=0)) diff --git a/setup.py b/setup.py deleted file mode 100755 index 6f86643..0000000 --- a/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -import setuptools - -with open("README.md", "r", encoding="utf-8") as fh: - long_description = fh.read() - -setuptools.setup( - name="pangolin", - version="1.0.2", - author="Tony Zeng", - author_email="tkyzeng@gmail.com", - description="Pangolin", - long_description=long_description, - long_description_content_type="text/markdown", - packages=['pangolin'], - package_data={ - "pangolin": ["models/*"], - }, - entry_points={ - "console_scripts": [ - "pangolin=pangolin.pangolin:main" - ] - } -) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/expected/medium_out.csv b/tests/data/expected/medium_out.csv new file mode 100644 index 0000000..34cf08c --- /dev/null +++ b/tests/data/expected/medium_out.csv @@ -0,0 +1,100 @@ +gene,CHROM,position.dna,REF,ALT,POS,Pangolin +PEX10,chr1,2408451.0,C,T,2339890,PEX10|-3:0.02|1:-0.76|Warnings: +MTHFR,chr1,11790916.0,C,T,11850973,MTHFR|-2:0.7|-18:-0.57|Warnings: +MTHFR,chr1,11794385.0,C,T,11854442,MTHFR|-2:0.4|-27:-0.03|Warnings: +MTHFR,chr1,11794539.0,CT,C,11854596,MTHFR|-62:0.36|-1:-0.83|Warnings: +MTHFR,chr1,11794724.0,C,G,11854781,MTHFR|88:0.04|5:-0.78|Warnings: +PLOD1,chr1,11952620.0,CA,C,12012677,PLOD1|31:0.65|3:-0.83|Warnings: +PLOD1,chr1,11966985.0,A,G,12027042,PLOD1|41:0.12|2:-0.82|Warnings: 
+ATP13A2,chr1,16996381.0,C,T,17322876,ATP13A2|31:0.06|5:-0.73|Warnings: +SDHB,chr1,17028599.0,C,T,17355094,SDHB|55:0.43|1:-0.8|Warnings: +SDHB,chr1,17028737.0,C,G,17355232,SDHB|32:0.03|-1:-0.85|Warnings: +SDHB,chr1,17053947.0,C,A,17380442,SDHB|-195:0.41|1:-0.77|Warnings: +HMGCL,chr1,23817475.0,C,T,24143965,HMGCL|14:0.04|1:-0.87|Warnings: +DHDDS,chr1,26447535.0,A,G,26774026,DHDDS|-5:0.8|24:-0.57|Warnings: +MECR,chr1,29200513.0,T,TA,29527025,MECR|-103:0.37|3:-0.56|Warnings: +AK2,chr1,33014521.0,C,T,33480122,AK2|-3:0.26|1:-0.86|Warnings: +P3H1,chr1,42747254.0,C,T,43212925,P3H1|-1:0.57|18:-0.61|Warnings: +SZT2,chr1,43425950.0,G,A,43891621,SZT2|-11:0.04|-1:-0.88|Warnings: +MUTYH,chr1,45330513.0,T,A,45796185,MUTYH|-96:0.0|44:-0.76|Warnings: +MUTYH,chr1,45331180.0,G,A,45796852,MUTYH|63:0.01|-200:0.0|Warnings: +MUTYH,chr1,45331558.0,T,C,45797230,MUTYH|-14:0.86|-2:-0.89|Warnings: +MUTYH,chr1,45331656.0,G,C,45797328,MUTYH|176:0.01|-100:-0.0|Warnings: +MUTYH,chr1,45331660.0,C,A,45797332,MUTYH|-9:0.01|1:-0.63|Warnings: +MUTYH,chr1,45331809.0,C,T,45797481,MUTYH|-2:0.66|40:-0.32|Warnings: +MUTYH,chr1,45332163.0,T,G,45797835,MUTYH|72:0.11|3:-0.79|Warnings: +MUTYH,chr1,45332619.0,C,T,45798291,MUTYH|-2:0.01|68:-0.14|Warnings: +MUTYH,chr1,45332886.0,TCCTATTTCCCCTA,T,45798558,MUTYH|-2:0.01|32:-0.51|Warnings: +MUTYH,chr1,45333171.0,C,G,45798843,MUTYH|-13:0.79|-1:-0.86|Warnings: +MUTYH,chr1,45340215.0,G,C,45805887,TOE1|10:0.0|89:-0.0|Warnings:MUTYH|-97:0.0|4:-0.06|Warnings: +POMGNT1,chr1,46189457.0,C,G,46655129,POMGNT1|27:0.66|1:-0.87|Warnings: +POMGNT1,chr1,46189457.0,C,A,46655129,POMGNT1|27:0.67|1:-0.87|Warnings: +POMGNT1,chr1,46194860.0,G,A,46660532,POMGNT1|-8:0.0|-16:-0.29|Warnings: +CPT2,chr1,53202430.0,G,A,53668102,CPT2|3:0.8|-1:-0.87|Warnings: +ALG6,chr1,63402344.0,G,GT,63868015,ALG6|-91:0.02|-1:-0.85|Warnings: +ALG6,chr1,63402348.0,G,A,63868019,ALG6|-95:0.02|-5:-0.79|Warnings: +PGM1,chr1,63648295.0,G,T,64113966,PGM1|-2:0.69|-199:0.0|Warnings: +ACADM,chr1,75745788.0,G,A,76211473,ACADM|47:0.12|18:-0.06|Warnings: +ACADM,chr1,75761116.0,T,G,76226801,ACADM|1:0.81|6:-0.67|Warnings: +ABCA4,chr1,94000832.0,T,C,94466388,ABCA4|54:0.08|4:-0.59|Warnings: +ABCA4,chr1,94001046.0,C,T,94466602,ABCA4|3:0.55|-44:-0.5|Warnings: +ABCA4,chr1,94011395.0,A,G,94476951,ABCA4|9:0.02|-10:-0.08|Warnings: +ABCA4,chr1,94018366.0,G,T,94483922,ABCA4|4:0.24|-200:0.0|Warnings: +ABCA4,chr1,94018445.0,C,T,94484001,ABCA4|-3:0.39|-200:0.0|Warnings: +ABCA4,chr1,94019575.0,CACTT,C,94485131,ABCA4|5:0.24|7:-0.84|Warnings: +ABCA4,chr1,94019581.0,C,T,94485137,ABCA4|-5:0.18|1:-0.84|Warnings: +ABCA4,chr1,94025056.0,A,T,94490612,ABCA4|-2:0.52|-8:-0.74|Warnings: +ABCA4,chr1,94027381.0,G,A,94492937,ABCA4|4:0.29|-200:0.0|Warnings: +ABCA4,chr1,94027417.0,G,A,94492973,ABCA4|137:0.02|-200:0.0|Warnings: +ABCA4,chr1,94027444.0,C,T,94493000,ABCA4|110:0.2|-200:0.0|Warnings: +ABCA4,chr1,94030427.0,C,T,94495983,ABCA4|-56:0.07|1:-0.52|Warnings: +ABCA4,chr1,94030427.0,C,G,94495983,ABCA4|-56:0.04|1:-0.53|Warnings: +ABCA4,chr1,94030953.0,C,T,94496509,ABCA4|-2:0.0|43:-0.03|Warnings: +ABCA4,chr1,94030991.0,C,T,94496547,ABCA4|-26:0.01|5:-0.44|Warnings: +ABCA4,chr1,94036737.0,T,C,94502293,ABCA4|-161:0.04|3:-0.71|Warnings: +ABCA4,chr1,94044608.0,C,T,94510164,ABCA4|-8:0.02|5:-0.69|Warnings: +ABCA4,chr1,94056830.0,C,T,94522386,ABCA4|-2:0.56|-8:-0.63|Warnings: +ABCA4,chr1,94062576.0,C,T,94528132,ABCA4|-10:0.23|1:-0.76|Warnings: +ABCA4,chr1,94081224.0,C,G,94546780,ABCA4|-58:0.07|-200:0.0|Warnings: +ABCA4,chr1,94081264.0,C,T,94546820,ABCA4|-1:0.17|-199:0.0|Warnings: 
+ABCA4,chr1,94084225.0,G,A,94549781,ABCA4|-6:0.09|-200:0.0|Warnings: +ALG14,chr1,95027119.0,TCTTA,T,95492675,ALG14|29:0.01|10:-0.17|Warnings: +AGL,chr1,99864592.0,A,G,100330148,AGL|-1:0.06|-3:-0.55|Warnings: +AGL,chr1,99880047.0,G,T,100345603,AGL|-37:0.01|-1:-0.77|Warnings: +AGL,chr1,99884704.0,G,T,100350260,AGL|40:0.14|-1:-0.85|Warnings: +AGL,chr1,99916398.0,A,G,100381954,AGL|1:0.76|12:-0.78|Warnings: +AGL,chr1,99916498.0,G,T,100382054,AGL|-24:0.04|-1:-0.81|Warnings: +DBT,chr1,100207186.0,T,C,100672742,DBT|1:0.8|-200:0.0|Warnings: +DBT,chr1,100210772.0,C,A,100676328,DBT|-25:0.11|-1:-0.88|Warnings: +DBT,chr1,100218750.0,GGTAACAAGGTAA,G,100684306,DBT|19:0.84|-3:-0.85|Warnings: +COL11A1,chr1,102915630.0,C,T,103381186,COL11A1|9:0.27|1:-0.83|Warnings: +COL11A1,chr1,102997075.0,C,A,103462631,COL11A1|-7:0.01|5:-0.8|Warnings: +COL11A1,chr1,103008517.0,CT,C,103474073,COL11A1|71:0.17|-1:-0.83|Warnings: +CASQ2,chr1,115705187.0,C,G,116247808,CASQ2|-68:0.62|5:-0.69|Warnings: +CTSK,chr1,150806225.0,C,T,150778701,CTSK|-2:0.13|-1:-0.78|Warnings: +CTSK,chr1,150806226.0,T,C,150778702,CTSK|-47:0.01|-2:-0.78|Warnings: +ADAR,chr1,154588263.0,A,G,154560739,ADAR|-8:0.02|-5:-0.08|Warnings: +LMNA,chr1,156130615.0,A,G,156100406,LMNA|12:0.74|2:-0.87|Warnings: +LMNA,chr1,156130740.0,C,T,156100531,LMNA|-2:0.31|33:-0.05|Warnings: +LMNA,chr1,156130818.0,T,G,156100609,LMNA|0:0.09|-45:-0.02|Warnings: +LMNA,chr1,156134795.0,A,G,156104586,LMNA|1:0.88|10:-0.84|Warnings: +LMNA,chr1,156134933.0,G,A,156104724,LMNA|-3:0.37|42:-0.07|Warnings: +LMNA,chr1,156134977.0,T,C,156104768,LMNA|-47:0.27|-2:-0.84|Warnings: +LMNA,chr1,156135890.0,C,G,156105681,LMNA|-29:0.69|11:-0.77|Warnings: +LMNA,chr1,156136916.0,G,A,156106707,LMNA|2:0.78|5:-0.13|Warnings: +LMNA,chr1,156137642.0,T,G,156107433,LMNA|1:0.88|12:-0.86|Warnings: +LMNA,chr1,156137651.0,C,G,156107442,LMNA|7:0.36|3:-0.58|Warnings: +NTRK1,chr1,156868250.0,G,T,156838042,NTRK1|10:0.05|-1:-0.75|Warnings: +NTRK1,chr1,156873600.0,T,A,156843392,NTRK1|-104:0.16|33:-0.13|Warnings: +NTRK1,chr1,156879365.0,A,C,156849157,NTRK1|-150:0.04|-3:-0.71|Warnings: +NTRK1,chr1,156881445.0,C,A,156851237,NTRK1|2:0.85|12:-0.72|Warnings: +NTRK1,chr1,156881446.0,G,A,156851238,NTRK1|2:0.85|11:-0.72|Warnings: +SPTA1,chr1,158618068.0,G,A,158587858,SPTA1|2:0.0|-12:-0.06|Warnings: +SPTA1,chr1,158643524.0,G,A,158613314,SPTA1|-29:0.1|-99:-0.01|Warnings: +MPZ,chr1,161306847.0,C,A,161276637,MPZ|6:0.24|-139:-0.03|Warnings: +SDHC,chr1,161318491.0,C,G,161288281,SDHC|-1:0.85|-198:0.0|Warnings: +SDHC,chr1,161328417.0,G,A,161298207,SDHC|2:0.02|-21:-0.02|Warnings: +SDHC,chr1,161340592.0,A,G,161310382,SDHC|46:0.03|2:-0.85|Warnings: +SDHC,chr1,161340658.0,A,G,161310448,SDHC|109:0.06|-3:-0.1|Warnings: +SDHC,chr1,161356841.0,G,C,161326631,SDHC|-61:0.03|-1:-0.86|Warnings: +SDHC,chr1,161356841.0,G,T,161326631,SDHC|-61:0.03|-1:-0.86|Warnings: diff --git a/tests/data/expected/small_out.csv b/tests/data/expected/small_out.csv new file mode 100644 index 0000000..ac355a4 --- /dev/null +++ b/tests/data/expected/small_out.csv @@ -0,0 +1,7 @@ +gene,CHROM,position.dna,REF,ALT,POS,Pangolin +foo,chr19,1.0,A,G,1, +ELANE,chr19,855795.0,G,A,855795,ELANE|-31:0.17|-1:-0.81|Warnings: +ELANE,chr19,855799.0,G,A,855799,ELANE|-35:0.15|-5:-0.76|Warnings: +STK11,chr19,1218501.0,A,G,1218500,STK11|100:0.0|-84:-0.0|Warnings: +STK11,chr19,1228432.0,C,A,1228431,STK11|199:0.01|-1:-0.0|Warnings:||CBARP|6:0.01|-145:-0.0|Warnings: +PNKP,chr19,49861542.0,AGGGGTCAGGGGAGGAGG,A,50364799,PNKP|-184:0.02|-32:-0.12|Warnings: diff --git 
a/tests/data/expected/small_out.vcf b/tests/data/expected/small_out.vcf new file mode 100644 index 0000000..5eac2b3 --- /dev/null +++ b/tests/data/expected/small_out.vcf @@ -0,0 +1,32 @@ +##fileformat=VCFv4.2 +##FILTER= +##fileDate=20191004 +##reference=GRCh37/hg19 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr19 1228431 . C A . . Pangolin=STK11|199:0.01|-1:-0.0|Warnings:||CBARP|6:0.01|-145:-0.0|Warnings: diff --git a/tests/data/medium.csv b/tests/data/medium.csv new file mode 100644 index 0000000..0e2804a --- /dev/null +++ b/tests/data/medium.csv @@ -0,0 +1,100 @@ +gene,CHROM,position.dna,REF,ALT,POS +PEX10,chr1,2408451.0,C,T,2339890 +MTHFR,chr1,11790916.0,C,T,11850973 +MTHFR,chr1,11794385.0,C,T,11854442 +MTHFR,chr1,11794539.0,CT,C,11854596 +MTHFR,chr1,11794724.0,C,G,11854781 +PLOD1,chr1,11952620.0,CA,C,12012677 +PLOD1,chr1,11966985.0,A,G,12027042 +ATP13A2,chr1,16996381.0,C,T,17322876 +SDHB,chr1,17028599.0,C,T,17355094 +SDHB,chr1,17028737.0,C,G,17355232 +SDHB,chr1,17053947.0,C,A,17380442 +HMGCL,chr1,23817475.0,C,T,24143965 +DHDDS,chr1,26447535.0,A,G,26774026 +MECR,chr1,29200513.0,T,TA,29527025 +AK2,chr1,33014521.0,C,T,33480122 +P3H1,chr1,42747254.0,C,T,43212925 +SZT2,chr1,43425950.0,G,A,43891621 +MUTYH,chr1,45330513.0,T,A,45796185 +MUTYH,chr1,45331180.0,G,A,45796852 +MUTYH,chr1,45331558.0,T,C,45797230 +MUTYH,chr1,45331656.0,G,C,45797328 +MUTYH,chr1,45331660.0,C,A,45797332 +MUTYH,chr1,45331809.0,C,T,45797481 +MUTYH,chr1,45332163.0,T,G,45797835 +MUTYH,chr1,45332619.0,C,T,45798291 +MUTYH,chr1,45332886.0,TCCTATTTCCCCTA,T,45798558 +MUTYH,chr1,45333171.0,C,G,45798843 +MUTYH,chr1,45340215.0,G,C,45805887 +POMGNT1,chr1,46189457.0,C,G,46655129 +POMGNT1,chr1,46189457.0,C,A,46655129 +POMGNT1,chr1,46194860.0,G,A,46660532 +CPT2,chr1,53202430.0,G,A,53668102 +ALG6,chr1,63402344.0,G,GT,63868015 +ALG6,chr1,63402348.0,G,A,63868019 +PGM1,chr1,63648295.0,G,T,64113966 +ACADM,chr1,75745788.0,G,A,76211473 +ACADM,chr1,75761116.0,T,G,76226801 +ABCA4,chr1,94000832.0,T,C,94466388 +ABCA4,chr1,94001046.0,C,T,94466602 +ABCA4,chr1,94011395.0,A,G,94476951 +ABCA4,chr1,94018366.0,G,T,94483922 +ABCA4,chr1,94018445.0,C,T,94484001 +ABCA4,chr1,94019575.0,CACTT,C,94485131 +ABCA4,chr1,94019581.0,C,T,94485137 +ABCA4,chr1,94025056.0,A,T,94490612 +ABCA4,chr1,94027381.0,G,A,94492937 +ABCA4,chr1,94027417.0,G,A,94492973 +ABCA4,chr1,94027444.0,C,T,94493000 +ABCA4,chr1,94030427.0,C,T,94495983 +ABCA4,chr1,94030427.0,C,G,94495983 +ABCA4,chr1,94030953.0,C,T,94496509 +ABCA4,chr1,94030991.0,C,T,94496547 +ABCA4,chr1,94036737.0,T,C,94502293 +ABCA4,chr1,94044608.0,C,T,94510164 +ABCA4,chr1,94056830.0,C,T,94522386 +ABCA4,chr1,94062576.0,C,T,94528132 +ABCA4,chr1,94081224.0,C,G,94546780 +ABCA4,chr1,94081264.0,C,T,94546820 +ABCA4,chr1,94084225.0,G,A,94549781 +ALG14,chr1,95027119.0,TCTTA,T,95492675 +AGL,chr1,99864592.0,A,G,100330148 +AGL,chr1,99880047.0,G,T,100345603 +AGL,chr1,99884704.0,G,T,100350260 +AGL,chr1,99916398.0,A,G,100381954 +AGL,chr1,99916498.0,G,T,100382054 +DBT,chr1,100207186.0,T,C,100672742 +DBT,chr1,100210772.0,C,A,100676328 +DBT,chr1,100218750.0,GGTAACAAGGTAA,G,100684306 +COL11A1,chr1,102915630.0,C,T,103381186 +COL11A1,chr1,102997075.0,C,A,103462631 +COL11A1,chr1,103008517.0,CT,C,103474073 +CASQ2,chr1,115705187.0,C,G,116247808 
+CTSK,chr1,150806225.0,C,T,150778701 +CTSK,chr1,150806226.0,T,C,150778702 +ADAR,chr1,154588263.0,A,G,154560739 +LMNA,chr1,156130615.0,A,G,156100406 +LMNA,chr1,156130740.0,C,T,156100531 +LMNA,chr1,156130818.0,T,G,156100609 +LMNA,chr1,156134795.0,A,G,156104586 +LMNA,chr1,156134933.0,G,A,156104724 +LMNA,chr1,156134977.0,T,C,156104768 +LMNA,chr1,156135890.0,C,G,156105681 +LMNA,chr1,156136916.0,G,A,156106707 +LMNA,chr1,156137642.0,T,G,156107433 +LMNA,chr1,156137651.0,C,G,156107442 +NTRK1,chr1,156868250.0,G,T,156838042 +NTRK1,chr1,156873600.0,T,A,156843392 +NTRK1,chr1,156879365.0,A,C,156849157 +NTRK1,chr1,156881445.0,C,A,156851237 +NTRK1,chr1,156881446.0,G,A,156851238 +SPTA1,chr1,158618068.0,G,A,158587858 +SPTA1,chr1,158643524.0,G,A,158613314 +MPZ,chr1,161306847.0,C,A,161276637 +SDHC,chr1,161318491.0,C,G,161288281 +SDHC,chr1,161328417.0,G,A,161298207 +SDHC,chr1,161340592.0,A,G,161310382 +SDHC,chr1,161340658.0,A,G,161310448 +SDHC,chr1,161356841.0,G,C,161326631 +SDHC,chr1,161356841.0,G,T,161326631 \ No newline at end of file diff --git a/tests/data/reference/chr19.fa.gz b/tests/data/reference/chr19.fa.gz new file mode 100644 index 0000000..7b781fa Binary files /dev/null and b/tests/data/reference/chr19.fa.gz differ diff --git a/tests/data/reference/chr19.fa.gz.fxi b/tests/data/reference/chr19.fa.gz.fxi new file mode 100644 index 0000000..28e1739 Binary files /dev/null and b/tests/data/reference/chr19.fa.gz.fxi differ diff --git a/tests/data/reference/chr19_genes.gtf.gz b/tests/data/reference/chr19_genes.gtf.gz new file mode 100644 index 0000000..8d1537e Binary files /dev/null and b/tests/data/reference/chr19_genes.gtf.gz differ diff --git a/tests/data/reference/chr19_genes_filtered.db b/tests/data/reference/chr19_genes_filtered.db new file mode 100644 index 0000000..7684e24 Binary files /dev/null and b/tests/data/reference/chr19_genes_filtered.db differ diff --git a/tests/data/reference/chr19_genes_filtered.gtf b/tests/data/reference/chr19_genes_filtered.gtf new file mode 100644 index 0000000..1a8b14d --- /dev/null +++ b/tests/data/reference/chr19_genes_filtered.gtf @@ -0,0 +1,223 @@ +chr19 uta gene 852209 856246 . + . gene_id "ELANE"; transcript_id ""; +chr19 uta transcript 852291 856246 . + . gene_id "ELANE"; transcript_id "NM_001972.2"; protein_id "NP_001963.1"; +chr19 uta exon 852291 852395 . + . gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta exon 852876 853032 . + . gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "2"; protein_id "NP_001963.1"; +chr19 uta exon 853262 853403 . + . gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "3"; protein_id "NP_001963.1"; +chr19 uta exon 855564 855794 . + . gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "4"; protein_id "NP_001963.1"; +chr19 uta exon 855958 856246 . + . gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta CDS 852329 852395 . + 0 gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta CDS 852876 853032 . + 2 gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "2"; protein_id "NP_001963.1"; +chr19 uta CDS 853262 853403 . + 1 gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "3"; protein_id "NP_001963.1"; +chr19 uta CDS 855564 855794 . + 0 gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "4"; protein_id "NP_001963.1"; +chr19 uta CDS 855958 856161 . 
+ 0 gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta 5UTR 852291 852328 . + . gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta 3UTR 856165 856246 . + . gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta start_codon 852329 852331 . + 0 gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta stop_codon 856162 856164 . + 0 gene_id "ELANE"; transcript_id "NM_001972.2"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta transcript 852209 856246 . + . gene_id "ELANE"; transcript_id "NM_001972.3"; protein_id "NP_001963.1"; +chr19 uta exon 852209 852395 . + . gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta exon 852876 853032 . + . gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "2"; protein_id "NP_001963.1"; +chr19 uta exon 853262 853403 . + . gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "3"; protein_id "NP_001963.1"; +chr19 uta exon 855564 855794 . + . gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "4"; protein_id "NP_001963.1"; +chr19 uta exon 855958 856246 . + . gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta CDS 852329 852395 . + 0 gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta CDS 852876 853032 . + 2 gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "2"; protein_id "NP_001963.1"; +chr19 uta CDS 853262 853403 . + 1 gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "3"; protein_id "NP_001963.1"; +chr19 uta CDS 855564 855794 . + 0 gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "4"; protein_id "NP_001963.1"; +chr19 uta CDS 855958 856161 . + 0 gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta 5UTR 852209 852328 . + . gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta 3UTR 856165 856246 . + . gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta start_codon 852329 852331 . + 0 gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta stop_codon 856162 856164 . + 0 gene_id "ELANE"; transcript_id "NM_001972.3"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta transcript 852303 856243 . + . gene_id "ELANE"; transcript_id "NM_001972.4"; protein_id "NP_001963.1"; +chr19 uta exon 852303 852395 . + . gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta exon 852876 853032 . + . gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "2"; protein_id "NP_001963.1"; +chr19 uta exon 853262 853403 . + . gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "3"; protein_id "NP_001963.1"; +chr19 uta exon 855564 855794 . + . gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "4"; protein_id "NP_001963.1"; +chr19 uta exon 855958 856243 . + . gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta CDS 852329 852395 . + 0 gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta CDS 852876 853032 . + 2 gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "2"; protein_id "NP_001963.1"; +chr19 uta CDS 853262 853403 . 
+ 1 gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "3"; protein_id "NP_001963.1"; +chr19 uta CDS 855564 855794 . + 0 gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "4"; protein_id "NP_001963.1"; +chr19 uta CDS 855958 856161 . + 0 gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta 5UTR 852303 852328 . + . gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta 3UTR 856165 856243 . + . gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta start_codon 852329 852331 . + 0 gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "1"; protein_id "NP_001963.1"; +chr19 uta stop_codon 856162 856164 . + 0 gene_id "ELANE"; transcript_id "NM_001972.4"; exon_number "5"; protein_id "NP_001963.1"; +chr19 uta gene 1205777 1228434 . + . gene_id "STK11"; transcript_id ""; +chr19 uta transcript 1205798 1228434 . + . gene_id "STK11"; transcript_id "NM_000455.4"; protein_id "NP_000446.1"; +chr19 uta exon 1205798 1207202 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "1"; protein_id "NP_000446.1"; +chr19 uta exon 1218416 1218499 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "2"; protein_id "NP_000446.1"; +chr19 uta exon 1219323 1219412 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "3"; protein_id "NP_000446.1"; +chr19 uta exon 1220372 1220504 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "4"; protein_id "NP_000446.1"; +chr19 uta exon 1220580 1220716 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "5"; protein_id "NP_000446.1"; +chr19 uta exon 1221212 1221339 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "6"; protein_id "NP_000446.1"; +chr19 uta exon 1221948 1222005 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "7"; protein_id "NP_000446.1"; +chr19 uta exon 1222984 1223171 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "8"; protein_id "NP_000446.1"; +chr19 uta exon 1226453 1226662 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "9"; protein_id "NP_000446.1"; +chr19 uta exon 1227592 1228434 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "10"; protein_id "NP_000446.1"; +chr19 uta CDS 1206913 1207202 . + 0 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "1"; protein_id "NP_000446.1"; +chr19 uta CDS 1218416 1218499 . + 1 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "2"; protein_id "NP_000446.1"; +chr19 uta CDS 1219323 1219412 . + 1 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "3"; protein_id "NP_000446.1"; +chr19 uta CDS 1220372 1220504 . + 1 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "4"; protein_id "NP_000446.1"; +chr19 uta CDS 1220580 1220716 . + 0 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "5"; protein_id "NP_000446.1"; +chr19 uta CDS 1221212 1221339 . + 1 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "6"; protein_id "NP_000446.1"; +chr19 uta CDS 1221948 1222005 . + 2 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "7"; protein_id "NP_000446.1"; +chr19 uta CDS 1222984 1223171 . + 1 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "8"; protein_id "NP_000446.1"; +chr19 uta CDS 1226453 1226643 . + 2 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "9"; protein_id "NP_000446.1"; +chr19 uta 5UTR 1205798 1206912 . + . 
gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "1"; protein_id "NP_000446.1"; +chr19 uta 3UTR 1226647 1226662 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "9"; protein_id "NP_000446.1"; +chr19 uta 3UTR 1227592 1228434 . + . gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "10"; protein_id "NP_000446.1"; +chr19 uta start_codon 1206913 1206915 . + 0 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "1"; protein_id "NP_000446.1"; +chr19 uta stop_codon 1226644 1226646 . + 0 gene_id "STK11"; transcript_id "NM_000455.4"; exon_number "9"; protein_id "NP_000446.1"; +chr19 uta transcript 1205777 1228430 . + . gene_id "STK11"; transcript_id "NM_000455.5"; protein_id "NP_000446.1"; +chr19 uta exon 1205777 1207202 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "1"; protein_id "NP_000446.1"; +chr19 uta exon 1218416 1218499 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "2"; protein_id "NP_000446.1"; +chr19 uta exon 1219323 1219412 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "3"; protein_id "NP_000446.1"; +chr19 uta exon 1220372 1220504 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "4"; protein_id "NP_000446.1"; +chr19 uta exon 1220580 1220716 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "5"; protein_id "NP_000446.1"; +chr19 uta exon 1221212 1221339 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "6"; protein_id "NP_000446.1"; +chr19 uta exon 1221948 1222005 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "7"; protein_id "NP_000446.1"; +chr19 uta exon 1222984 1223171 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "8"; protein_id "NP_000446.1"; +chr19 uta exon 1226453 1226662 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "9"; protein_id "NP_000446.1"; +chr19 uta exon 1227592 1228430 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "10"; protein_id "NP_000446.1"; +chr19 uta CDS 1206913 1207202 . + 0 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "1"; protein_id "NP_000446.1"; +chr19 uta CDS 1218416 1218499 . + 1 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "2"; protein_id "NP_000446.1"; +chr19 uta CDS 1219323 1219412 . + 1 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "3"; protein_id "NP_000446.1"; +chr19 uta CDS 1220372 1220504 . + 1 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "4"; protein_id "NP_000446.1"; +chr19 uta CDS 1220580 1220716 . + 0 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "5"; protein_id "NP_000446.1"; +chr19 uta CDS 1221212 1221339 . + 1 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "6"; protein_id "NP_000446.1"; +chr19 uta CDS 1221948 1222005 . + 2 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "7"; protein_id "NP_000446.1"; +chr19 uta CDS 1222984 1223171 . + 1 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "8"; protein_id "NP_000446.1"; +chr19 uta CDS 1226453 1226643 . + 2 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "9"; protein_id "NP_000446.1"; +chr19 uta 5UTR 1205777 1206912 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "1"; protein_id "NP_000446.1"; +chr19 uta 3UTR 1226647 1226662 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "9"; protein_id "NP_000446.1"; +chr19 uta 3UTR 1227592 1228430 . + . gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "10"; protein_id "NP_000446.1"; +chr19 uta start_codon 1206913 1206915 . 
+ 0 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "1"; protein_id "NP_000446.1"; +chr19 uta stop_codon 1226644 1226646 . + 0 gene_id "STK11"; transcript_id "NM_000455.5"; exon_number "9"; protein_id "NP_000446.1"; +chr19 uta gene 50364460 50370833 . - . gene_id "PNKP"; transcript_id ""; +chr19 uta transcript 50364460 50370822 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; protein_id "NP_009185.2"; +chr19 uta exon 50370726 50370822 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "1"; protein_id "NP_009185.2"; +chr19 uta exon 50370311 50370474 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "2"; protein_id "NP_009185.2"; +chr19 uta exon 50369656 50369702 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "3"; protein_id "NP_009185.2"; +chr19 uta exon 50368384 50368683 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "4"; protein_id "NP_009185.2"; +chr19 uta exon 50367581 50367660 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "5"; protein_id "NP_009185.2"; +chr19 uta exon 50367436 50367493 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "6"; protein_id "NP_009185.2"; +chr19 uta exon 50367221 50367328 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "7"; protein_id "NP_009185.2"; +chr19 uta exon 50366946 50367017 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "8"; protein_id "NP_009185.2"; +chr19 uta exon 50365947 50365995 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "9"; protein_id "NP_009185.2"; +chr19 uta exon 50365795 50365865 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "10"; protein_id "NP_009185.2"; +chr19 uta exon 50365628 50365720 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "11"; protein_id "NP_009185.2"; +chr19 uta exon 50365442 50365538 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "12"; protein_id "NP_009185.2"; +chr19 uta exon 50365301 50365362 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "13"; protein_id "NP_009185.2"; +chr19 uta exon 50365029 50365138 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "14"; protein_id "NP_009185.2"; +chr19 uta exon 50364865 50364952 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "15"; protein_id "NP_009185.2"; +chr19 uta exon 50364706 50364767 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "16"; protein_id "NP_009185.2"; +chr19 uta exon 50364460 50364622 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "17"; protein_id "NP_009185.2"; +chr19 uta CDS 50370311 50370461 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "2"; protein_id "NP_009185.2"; +chr19 uta CDS 50369656 50369702 . - 2 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "3"; protein_id "NP_009185.2"; +chr19 uta CDS 50368384 50368683 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "4"; protein_id "NP_009185.2"; +chr19 uta CDS 50367581 50367660 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "5"; protein_id "NP_009185.2"; +chr19 uta CDS 50367436 50367493 . - 1 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "6"; protein_id "NP_009185.2"; +chr19 uta CDS 50367221 50367328 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "7"; protein_id "NP_009185.2"; +chr19 uta CDS 50366946 50367017 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "8"; protein_id "NP_009185.2"; +chr19 uta CDS 50365947 50365995 . 
- 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "9"; protein_id "NP_009185.2"; +chr19 uta CDS 50365795 50365865 . - 2 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "10"; protein_id "NP_009185.2"; +chr19 uta CDS 50365628 50365720 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "11"; protein_id "NP_009185.2"; +chr19 uta CDS 50365442 50365538 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "12"; protein_id "NP_009185.2"; +chr19 uta CDS 50365301 50365362 . - 2 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "13"; protein_id "NP_009185.2"; +chr19 uta CDS 50365029 50365138 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "14"; protein_id "NP_009185.2"; +chr19 uta CDS 50364865 50364952 . - 1 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "15"; protein_id "NP_009185.2"; +chr19 uta CDS 50364706 50364767 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "16"; protein_id "NP_009185.2"; +chr19 uta CDS 50364508 50364622 . - 1 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "17"; protein_id "NP_009185.2"; +chr19 uta 5UTR 50370726 50370822 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "1"; protein_id "NP_009185.2"; +chr19 uta 5UTR 50370462 50370474 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "2"; protein_id "NP_009185.2"; +chr19 uta 3UTR 50364460 50364504 . - . gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "17"; protein_id "NP_009185.2"; +chr19 uta start_codon 50370459 50370461 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "2"; protein_id "NP_009185.2"; +chr19 uta stop_codon 50364505 50364507 . - 0 gene_id "PNKP"; transcript_id "NM_007254.3"; exon_number "17"; protein_id "NP_009185.2"; +chr19 uta transcript 50364461 50370833 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; protein_id "NP_009185.2"; +chr19 uta exon 50370726 50370833 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "1"; protein_id "NP_009185.2"; +chr19 uta exon 50370311 50370474 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "2"; protein_id "NP_009185.2"; +chr19 uta exon 50369656 50369702 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "3"; protein_id "NP_009185.2"; +chr19 uta exon 50368384 50368683 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "4"; protein_id "NP_009185.2"; +chr19 uta exon 50367581 50367660 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "5"; protein_id "NP_009185.2"; +chr19 uta exon 50367436 50367493 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "6"; protein_id "NP_009185.2"; +chr19 uta exon 50367221 50367328 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "7"; protein_id "NP_009185.2"; +chr19 uta exon 50366946 50367017 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "8"; protein_id "NP_009185.2"; +chr19 uta exon 50365947 50365995 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "9"; protein_id "NP_009185.2"; +chr19 uta exon 50365795 50365865 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "10"; protein_id "NP_009185.2"; +chr19 uta exon 50365628 50365720 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "11"; protein_id "NP_009185.2"; +chr19 uta exon 50365442 50365538 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "12"; protein_id "NP_009185.2"; +chr19 uta exon 50365301 50365362 . - . 
gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "13"; protein_id "NP_009185.2"; +chr19 uta exon 50365029 50365138 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "14"; protein_id "NP_009185.2"; +chr19 uta exon 50364865 50364952 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "15"; protein_id "NP_009185.2"; +chr19 uta exon 50364706 50364767 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "16"; protein_id "NP_009185.2"; +chr19 uta exon 50364461 50364622 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "17"; protein_id "NP_009185.2"; +chr19 uta CDS 50370311 50370461 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "2"; protein_id "NP_009185.2"; +chr19 uta CDS 50369656 50369702 . - 2 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "3"; protein_id "NP_009185.2"; +chr19 uta CDS 50368384 50368683 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "4"; protein_id "NP_009185.2"; +chr19 uta CDS 50367581 50367660 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "5"; protein_id "NP_009185.2"; +chr19 uta CDS 50367436 50367493 . - 1 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "6"; protein_id "NP_009185.2"; +chr19 uta CDS 50367221 50367328 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "7"; protein_id "NP_009185.2"; +chr19 uta CDS 50366946 50367017 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "8"; protein_id "NP_009185.2"; +chr19 uta CDS 50365947 50365995 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "9"; protein_id "NP_009185.2"; +chr19 uta CDS 50365795 50365865 . - 2 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "10"; protein_id "NP_009185.2"; +chr19 uta CDS 50365628 50365720 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "11"; protein_id "NP_009185.2"; +chr19 uta CDS 50365442 50365538 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "12"; protein_id "NP_009185.2"; +chr19 uta CDS 50365301 50365362 . - 2 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "13"; protein_id "NP_009185.2"; +chr19 uta CDS 50365029 50365138 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "14"; protein_id "NP_009185.2"; +chr19 uta CDS 50364865 50364952 . - 1 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "15"; protein_id "NP_009185.2"; +chr19 uta CDS 50364706 50364767 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "16"; protein_id "NP_009185.2"; +chr19 uta CDS 50364508 50364622 . - 1 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "17"; protein_id "NP_009185.2"; +chr19 uta 5UTR 50370726 50370833 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "1"; protein_id "NP_009185.2"; +chr19 uta 5UTR 50370462 50370474 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "2"; protein_id "NP_009185.2"; +chr19 uta 3UTR 50364461 50364504 . - . gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "17"; protein_id "NP_009185.2"; +chr19 uta start_codon 50370459 50370461 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "2"; protein_id "NP_009185.2"; +chr19 uta stop_codon 50364505 50364507 . - 0 gene_id "PNKP"; transcript_id "NM_007254.4"; exon_number "17"; protein_id "NP_009185.2"; +chr19 uta gene 1228286 1238004 . - . gene_id "CBARP"; transcript_id ""; +chr19 uta transcript 1229947 1237990 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; protein_id "NP_689982.3"; +chr19 uta exon 1237755 1237990 . - . 
gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "1"; protein_id "NP_689982.3"; +chr19 uta exon 1235995 1236120 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "2"; protein_id "NP_689982.3"; +chr19 uta exon 1235778 1235917 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "3"; protein_id "NP_689982.3"; +chr19 uta exon 1235500 1235564 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "4"; protein_id "NP_689982.3"; +chr19 uta exon 1235000 1235144 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "5"; protein_id "NP_689982.3"; +chr19 uta exon 1234570 1234741 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "6"; protein_id "NP_689982.3"; +chr19 uta exon 1234190 1234330 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "7"; protein_id "NP_689982.3"; +chr19 uta exon 1233425 1233635 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "8"; protein_id "NP_689982.3"; +chr19 uta exon 1229947 1231274 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "9"; protein_id "NP_689982.3"; +chr19 uta CDS 1235995 1236099 . - 0 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "2"; protein_id "NP_689982.3"; +chr19 uta CDS 1235778 1235917 . - 0 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "3"; protein_id "NP_689982.3"; +chr19 uta CDS 1235500 1235564 . - 1 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "4"; protein_id "NP_689982.3"; +chr19 uta CDS 1235000 1235144 . - 2 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "5"; protein_id "NP_689982.3"; +chr19 uta CDS 1234570 1234741 . - 1 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "6"; protein_id "NP_689982.3"; +chr19 uta CDS 1234190 1234330 . - 0 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "7"; protein_id "NP_689982.3"; +chr19 uta CDS 1233425 1233635 . - 0 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "8"; protein_id "NP_689982.3"; +chr19 uta CDS 1230895 1231274 . - 2 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "9"; protein_id "NP_689982.3"; +chr19 uta 5UTR 1237755 1237990 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "1"; protein_id "NP_689982.3"; +chr19 uta 5UTR 1236100 1236120 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "2"; protein_id "NP_689982.3"; +chr19 uta 3UTR 1229947 1230891 . - . gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "9"; protein_id "NP_689982.3"; +chr19 uta start_codon 1236097 1236099 . - 0 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "2"; protein_id "NP_689982.3"; +chr19 uta stop_codon 1230892 1230894 . - 0 gene_id "CBARP"; transcript_id "NM_152769.2"; exon_number "9"; protein_id "NP_689982.3"; +chr19 uta transcript 1228286 1238004 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; protein_id "NP_689982.3"; +chr19 uta exon 1237755 1238004 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "1"; protein_id "NP_689982.3"; +chr19 uta exon 1235995 1236120 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "2"; protein_id "NP_689982.3"; +chr19 uta exon 1235778 1235917 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "3"; protein_id "NP_689982.3"; +chr19 uta exon 1235500 1235564 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "4"; protein_id "NP_689982.3"; +chr19 uta exon 1235000 1235144 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "5"; protein_id "NP_689982.3"; +chr19 uta exon 1234570 1234741 . - . 
gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "6"; protein_id "NP_689982.3"; +chr19 uta exon 1234190 1234330 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "7"; protein_id "NP_689982.3"; +chr19 uta exon 1233425 1233635 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "8"; protein_id "NP_689982.3"; +chr19 uta exon 1228286 1231274 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "9"; protein_id "NP_689982.3"; +chr19 uta CDS 1235995 1236099 . - 0 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "2"; protein_id "NP_689982.3"; +chr19 uta CDS 1235778 1235917 . - 0 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "3"; protein_id "NP_689982.3"; +chr19 uta CDS 1235500 1235564 . - 1 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "4"; protein_id "NP_689982.3"; +chr19 uta CDS 1235000 1235144 . - 2 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "5"; protein_id "NP_689982.3"; +chr19 uta CDS 1234570 1234741 . - 1 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "6"; protein_id "NP_689982.3"; +chr19 uta CDS 1234190 1234330 . - 0 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "7"; protein_id "NP_689982.3"; +chr19 uta CDS 1233425 1233635 . - 0 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "8"; protein_id "NP_689982.3"; +chr19 uta CDS 1230895 1231274 . - 2 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "9"; protein_id "NP_689982.3"; +chr19 uta 5UTR 1237755 1238004 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "1"; protein_id "NP_689982.3"; +chr19 uta 5UTR 1236100 1236120 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "2"; protein_id "NP_689982.3"; +chr19 uta 3UTR 1228286 1230891 . - . gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "9"; protein_id "NP_689982.3"; +chr19 uta start_codon 1236097 1236099 . - 0 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "2"; protein_id "NP_689982.3"; +chr19 uta stop_codon 1230892 1230894 . - 0 gene_id "CBARP"; transcript_id "NM_152769.3"; exon_number "9"; protein_id "NP_689982.3"; diff --git a/tests/data/small.csv b/tests/data/small.csv new file mode 100644 index 0000000..a02d92e --- /dev/null +++ b/tests/data/small.csv @@ -0,0 +1,7 @@ +gene,CHROM,position.dna,REF,ALT,POS +foo,chr19,1.0,A,G,1 +ELANE,chr19,855795.0,G,A,855795 +ELANE,chr19,855799.0,G,A,855799 +STK11,chr19,1218501.0,A,G,1218500 +STK11,chr19,1228432,C,A,1228431 +PNKP,chr19,49861542.0,AGGGGTCAGGGGAGGAGG,A,50364799 \ No newline at end of file diff --git a/tests/data/small.vcf b/tests/data/small.vcf new file mode 100644 index 0000000..b91d5dd --- /dev/null +++ b/tests/data/small.vcf @@ -0,0 +1,30 @@ +##fileformat=VCFv4.2 +##fileDate=20191004 +##reference=GRCh37/hg19 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr19 1228431 . C A . . . 
\ No newline at end of file diff --git a/tests/test_processors.py b/tests/test_processors.py new file mode 100644 index 0000000..83ebf8f --- /dev/null +++ b/tests/test_processors.py @@ -0,0 +1,76 @@ +import difflib +import sys +import tempfile + +import pytest + +from pangolin.data_models import AppConfig +from pangolin.processors import process_variants_file + + +def build_config(input_file: str, output_file: str, batch_size: int = 1) -> AppConfig: + # This is just a download of the chr19 reference genome + ref_file = "tests/data/reference/chr19.fa.gz" + + # The chr19_genes_filtered.gtf file used below was derived from chr19_genes.gtf.gz + # with the following set of commands. I hardcoded the explicit gene names used + # in the test files to reduce the size of the database and files + # gzcat tests/data/reference/chr19_genes.gtf.gz | grep 'PNKP\|ELANE\|STK11\|CBARP' > tests/data/reference/chr19_genes_filtered.gtf + # python scripts/create_db.py tests/data/reference/chr19_genes_filtered.gtf --filter None + gtf_file = "tests/data/reference/chr19_genes_filtered.db" + + app_config = AppConfig( + variant_file=input_file, + output_file=output_file, + reference_file=ref_file, + annotation_file=gtf_file, + batch_size=batch_size, + distance=200, + score_cutoff=None, + mask="True", + score_exons="False", + column_ids="CHROM,POS,REF,ALT", + enable_gtf_cache=True, + ) + return app_config + + +def run_pangolin(input_file, expected_file, batch_size: int = 0, suffix: str = ""): + with tempfile.NamedTemporaryFile(suffix=suffix) as fh: + output_file = fh.name + config = build_config(input_file, output_file, batch_size) + process_variants_file(config) + with open(output_file) as out_fh: + batch_file_contents = out_fh.readlines() + + with open(expected_file) as fh: + expected_file_contents = fh.readlines() + + if expected_file_contents != batch_file_contents: + sys.stdout.writelines( + difflib.unified_diff(expected_file_contents, batch_file_contents) + ) + assert expected_file_contents == batch_file_contents + + +@pytest.mark.parametrize( + "batch_size", + [ + 0, + 1, + ], +) +def test_batch_vcf(batch_size): + input_file = "tests/data/small.vcf" + expected_file = "tests/data/expected/small_out.vcf" + run_pangolin(input_file, expected_file, batch_size=batch_size, suffix=".vcf") + + +@pytest.mark.parametrize( + "batch_size", + [0, 2, 3, 5], +) +def test_batch_csv(batch_size): + input_file = "tests/data/small.csv" + expected_file = "tests/data/expected/small_out.csv" + run_pangolin(input_file, expected_file, batch_size=batch_size, suffix=".csv")