diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..409225a --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,25 @@ +name: Continuous Integration + +on: + push: + branches: + - main + pull_request: + branches: + - main + merge_group: + types: + - checks_requested + +jobs: + test: + name: Run tests + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Build image + run: docker build --target uta-test -t uta-test . + - name: Run tests + run: docker run --rm uta-test python -m unittest diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..34d9a05 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +FROM ubuntu:22.04 as uta + +# set python version and define arguments +ARG python_version="3.10" + +# list and install dependencies +ARG dependencies="python${python_version} python3-dev python3-pip rsync git postgresql-client-14 tabix" + +RUN apt-get update && apt-get install -y $dependencies && apt-get clean + +# install pysam, copy code, and run pip install +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python -m pip install --upgrade pip +RUN pip install --upgrade setuptools +RUN pip install pysam + +WORKDIR /opt/repos/uta/ +COPY pyproject.toml ./ +COPY etc ./etc +COPY misc ./misc +COPY sbin ./sbin +COPY src ./src +RUN pip install -e .[dev] + + +# UTA test image +FROM uta as uta-test +RUN DEBIAN_FRONTEND=noninteractive apt-get -yq install postgresql +COPY tests ./tests +RUN pip install -e .[test] +RUN useradd uta-tester +RUN chown -R uta-tester . +USER uta-tester diff --git a/README.md b/README.md index 70c7e92..f15f0d4 100644 --- a/README.md +++ b/README.md @@ -203,8 +203,8 @@ you will not need to install PostgreSQL or any of its dependencies. (code) version used to build the instance. $ psql -h localhost -U anonymous -d uta -c "select * from $uta_v.meta" - - key | value + + key | value ----------------+-------------------------------------------------------------------- schema_version | 1.1 created on | 2015-08-21T10:53:50.666152 @@ -213,7 +213,7 @@ you will not need to install PostgreSQL or any of its dependencies. (4 rows) 6. (Optional) To configure [hgvs](https://github.com/biocommons/hgvs) - to use this local installation, consult the + to use this local installation, consult the [hgvs documentation](https://hgvs.readthedocs.io/en/latest/installation.html#local-installation-of-uta-optional) ### Installing from database dumps @@ -253,6 +253,7 @@ the installation environment.* ## Developer Setup +### Virtual Environment To develop UTA, follow these steps. 1. Set up a virtual environment using your preferred method. @@ -272,3 +273,110 @@ To develop UTA, follow these steps. 4. To run the tests: $ python3 -m unittest + +### Docker + +1. Clone UTA and build docker image: + + $ git clone git@github.com:biocommons/uta.git + $ cd uta + $ docker build -t uta . + +2. Restore a database or load a new one using the instructions [above](#installing-from-database-dumps). + +3. Run container and tests + + $ docker run -it --rm uta bash + +4. Testing + + $ docker build --target uta-test -t uta-test . + $ docker run --rm uta-test python -m unittest + +## UTA update procedure + +Requires docker. + +### 0. 
Setup

+Make directories:
+```
+mkdir -p $(pwd)/ncbi-data
+mkdir -p $(pwd)/output/artifacts
+mkdir -p $(pwd)/output/logs
+```
+
+Set variables:
+```
+export UTA_ETL_OLD_UTA_IMAGE_TAG=uta_20210129b
+export UTA_ETL_OLD_UTA_VERSION=$UTA_ETL_OLD_UTA_IMAGE_TAG
+export UTA_ETL_NEW_UTA_VERSION=uta_20240512
+export UTA_ETL_NCBI_DIR=./ncbi-data
+export UTA_ETL_WORK_DIR=./output/artifacts
+export UTA_ETL_LOG_DIR=./output/logs
+```
+
+Build the UTA image:
+```
+docker build --target uta -t uta-update .
+```
+
+### 1. Download SeqRepo data
+```
+docker compose run seqrepo-pull
+```
+
+Note: pulling the data takes ~30 minutes and requires ~13 GB of disk space.
+Note: a container named `seqrepo` will be left behind.
+
+### 2. Extract and transform data from NCBI
+
+Download files from NCBI, extract them into intermediate files, and load them into UTA and SeqRepo.
+
+See 2A for nuclear transcripts, 2B for mitochondrial transcripts, and 2C for manually aligned (splign-manual) transcripts.
+
+#### 2A. Nuclear transcripts
+```
+docker compose run ncbi-download
+docker compose run uta-extract
+docker compose run seqrepo-load
+docker compose run uta-load
+```
+
+#### 2B. Mitochondrial transcripts
+```
+docker compose -f docker-compose.yml -f misc/mito-transcripts/docker-compose-mito-extract.yml run mito-extract
+docker compose run seqrepo-load
+docker compose run uta-load
+```
+
+#### 2C. Manual splign transcripts
+The splign-manual workflow expects an input txdata.yaml file and splign alignment files. Point the
+environment variable $UTA_SPLIGN_MANUAL_DIR at the directory containing them. These paths must exist:
+- `$UTA_SPLIGN_MANUAL_DIR/txdata.yaml`
+- `$UTA_SPLIGN_MANUAL_DIR/alignments/*.splign`
+
+[txdata.yaml](loading/data/splign-manual/txdata.yaml) defines the transcripts and their metadata; the [alignments dir](loading/data/splign-manual/alignments) contains the splign alignments.
+To run the workflow:
+```
+export UTA_SPLIGN_MANUAL_DIR=$(pwd)/loading/data/splign-manual/
+docker compose run splign-manual
+```
+
+At this point UTA has been updated and the database has been dumped to a pgd file in `$UTA_ETL_WORK_DIR`. SeqRepo has been updated in place.
+
+
+## Migrations
+UTA uses alembic to manage database migrations. To auto-generate a migration:
+```
+alembic -c etc/alembic.ini revision --autogenerate -m "description of the migration"
+```
+This creates a migration script in the versions directory under `src/alembic`.
+Review and adjust the generated upgrade and downgrade functions. To apply the migration:
+```
+alembic -c etc/alembic.ini upgrade head
+```
+To reverse a migration, use `downgrade` with the number of steps to reverse.
For example, to reverse the last: +``` +alembic -c etc/alembic.ini downgrade -1 +``` diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..84b043a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,62 @@ +# docker compose file for the UTA update procedure + +version: '3' + +services: + seqrepo-pull: + user: root + image: uta-update + command: sbin/seqrepo-pull + volumes: + - seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo + network_mode: host + ncbi-download: + image: uta-update + command: sbin/ncbi-download etc/ncbi-files.txt /ncbi-dir + volumes: + - .:/opt/repos/uta + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + working_dir: /opt/repos/uta + network_mode: host + uta-extract: + image: uta-update + command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs + volumes: + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_WORK_DIR}:/uta-extract/work + - ${UTA_ETL_LOG_DIR}:/uta-extract/logs + working_dir: /opt/repos/uta + network_mode: host + seqrepo-load: + image: uta-update + command: sbin/seqrepo-load /seqrepo-load/work /seqrepo-load/logs + volumes: + - seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo + - ${UTA_ETL_WORK_DIR}:/seqrepo-load/work + - ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs + working_dir: /opt/repos/uta + network_mode: host + uta: + container_name: uta + image: biocommons/uta:${UTA_ETL_OLD_UTA_IMAGE_TAG} + environment: + - POSTGRES_HOST_AUTH_METHOD=trust + healthcheck: + test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_OLD_UTA_IMAGE_TAG}.meta" + interval: 10s + retries: 80 + network_mode: host + uta-load: + image: uta-update + command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} ${UTA_ETL_NEW_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs + depends_on: + uta: + condition: service_healthy + volumes: + - seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo + - ${UTA_ETL_WORK_DIR}:/uta-load/work + - ${UTA_ETL_LOG_DIR}:/uta-load/logs + network_mode: host + +volumes: + seqrepo-volume: diff --git a/etc/alembic.ini b/etc/alembic.ini new file mode 100644 index 0000000..6777380 --- /dev/null +++ b/etc/alembic.ini @@ -0,0 +1,116 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = src/alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. 
+# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = postgresql://uta_admin:@localhost/uta + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/etc/global.conf b/etc/global.conf index c929b28..0d3cfcc 100644 --- a/etc/global.conf +++ b/etc/global.conf @@ -16,7 +16,7 @@ aligner = utaaa fasta_directories = aux/sequences2 aux/sequences -seqrepo = /usr/local/share/seqrepo/latest +seqrepo = /biocommons/dl.biocommons.org/seqrepo/master #data/manual #data/bic/sequences.fasta.bgz diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt new file mode 100644 index 0000000..0a70efc --- /dev/null +++ b/etc/ncbi-files.txt @@ -0,0 +1,48 @@ +# This configuration file contains the paths to the NCBI data files needed by the SeqRepo/UTA load pipelines. 
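+# Paths are relative to the NCBI FTP root (ftp.ncbi.nlm.nih.gov) and may use shell-style wildcards;
+# after download they are mirrored under the local NCBI data directory in the layout sketched below: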
+# +# ├── gene +# │ └── DATA +# │ ├── GENE_INFO +# │ │ └── Mammalia +# │ │ └── Homo_sapiens.gene_info.gz +# │ └── gene2refseq.gz +# ├── genomes +# │ └── refseq +# │ └── vertebrate_mammalian +# │ └── Homo_sapiens +# │ └── all_assembly_versions +# │ └── GCF_000001405.25_GRCh37.p13 +# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz +# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz +# └── refseq +# └── H_sapiens +# └── mRNA_Prot +# ├── human.1.protein.faa.gz +# ├── human.1.rna.fna.gz +# ├── human.1.rna.gbff.gz +# ├── ... + +## Gene Data +gene/DATA/gene2refseq.gz +gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz + +## RefSeq Data +refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz +refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz +refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz + +## Genome build and alignment data +# Build 37 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz + +# Build 38 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz + +# T2Tv2.0 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz diff --git a/etc/scripts/create-new-schema.sh b/etc/scripts/create-new-schema.sh new file mode 100755 index 0000000..d13100a --- /dev/null +++ b/etc/scripts/create-new-schema.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +if [ "$#" -lt 2 ] +then + echo "error: too few arguments, you provided $#, 1 required" + echo "usage: create-new-schema.sh " + exit 1 +fi + +set -euxo pipefail + +source_uta_v=$1 +dest_uta_v=$2 +dumps_dir=/temp/dumps +mkdir -p $dumps_dir + +# dump current version +pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \ + gzip -c > $dumps_dir/"$source_uta_v".pgd.gz + +# create new schema +gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \ + sbin/pg-dump-schema-rename "$source_uta_v" "$dest_uta_v" | \ + psql -U uta_admin -h localhost -d uta -aeE diff --git a/etc/scripts/delete-schema.sh b/etc/scripts/delete-schema.sh new file mode 100755 index 0000000..e64eb3b --- /dev/null +++ b/etc/scripts/delete-schema.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +if [ "$#" -lt 1 ] +then + echo "error: too few arguments, you provided $#, 1 required" + echo "usage: delete-schema.sh " + exit 1 +fi + +set -euxo pipefail + +source_uta_v=$1 + +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $source_uta_v CASCADE" \ No newline at end of file diff --git a/etc/uta_dev@localhost.conf b/etc/uta_dev@localhost.conf index 2d1e508..bfb5c66 100644 --- 
a/etc/uta_dev@localhost.conf +++ b/etc/uta_dev@localhost.conf @@ -1,4 +1,4 @@ [uta] hostport = localhost cluster = %(user)s@%(hostport)s -database = uta_dev +database = uta diff --git a/etc/uta_dev@uic.com b/etc/uta_dev@uic.com index e61a48b..cbf2e77 100644 --- a/etc/uta_dev@uic.com +++ b/etc/uta_dev@uic.com @@ -1,4 +1,4 @@ [uta] -hostport = uta.invitae.com +hostport = uta-int-02.cj7o8ef9mt4v.us-east-1.rds.amazonaws.com user = uta_admin -database = uta_dev +database = uta diff --git a/loading/data/splign-manual/README.md b/loading/data/splign-manual/README.md index 3a4d62b..f364281 100644 --- a/loading/data/splign-manual/README.md +++ b/loading/data/splign-manual/README.md @@ -50,7 +50,7 @@ For a given RefSeq transcript (e.g., NM_000996.3), do the following: - Click on the gene id to go to the gene page (e.g., `6165`) - N.B. Strand is inferred from the orientation of aligned exons. -1. Enter the gene and CDS info in txdata.yaml +1. Enter the gene, geneID, and CDS info in txdata.yaml 1. Get the chromosome and coordinates from the gene page - From the "Genomic Context" section, note the chromosomal diff --git a/loading/data/splign-manual/txdata.yaml b/loading/data/splign-manual/txdata.yaml index 1e7498d..64ef14c 100644 --- a/loading/data/splign-manual/txdata.yaml +++ b/loading/data/splign-manual/txdata.yaml @@ -7,6 +7,7 @@ NM_000000.0: # transcript_accession cds: # CDS start and end, 1-based inclusive hgnc: # HGNC *symbol* genomic_region: # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/259291 + gene_id: # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/259291 NM_001025190.1: @@ -14,67 +15,80 @@ NM_001025190.1: cds: 1,3162 hgnc: MSLNL genomic_region: NC_000016.9 (819428..831996, complement) + gene_id: 401827 NM_006060.6: cds: 222,1781 hgnc: IKZF1 genomic_region: NC_000007.13 (50344378..50367358) , (50444231..50472799) + gene_id: 10320 NM_000769.4: cds: 26,1498 hgnc: CYP2C19 genomic_region: NC_000010.10 (96522463..96612671) + gene_id: 1557 NM_001807.4: cds: 17,2287 hgnc: CEL genomic_region: NC_000009.11 (135936741..135947250) + gene_id: 1056 NM_002116.7: cds: 85,1182 hgnc: HLA-A genomic_region: NC_000006.11 (29910247..29913661) + gene_id: 3105 NM_002122.3: cds: 54,821 hgnc: HLA-DQA1 genomic_region: NC_000006.11 (32605169..32612152) + gene_id: 3117 NM_006060.5: cds: 269,1828 hgnc: IKZF1 genomic_region: NC_000007.13 (50344378..50367358) , (50444231..50472799) + gene_id: 10320 NM_000996.3: cds: 65,397 hgnc: RPL35A genomic_region: NC_000003.11 (197677023..197682722) + gene_id: 6165 NM_001261826.2: cds: 293,3940 hgnc: AP3D1 genomic_region: NC_000019.9 (2100987..2151556, complement) + gene_id: 8943 NM_001355436.1: cds: 144,7130 hgnc: SPTB genomic_region: NC_000014.8 (65213001..65346604, complement) + gene_id: 6710 NM_001428.4: cds: 117,1421 hgnc: ENO1 genomic_region: NC_000001.10 (8921059..8939151, complement) + gene_id: 2023 NM_032589.2: - genomic_region: NM_032589.2 was permanently suppressed because currently there is support for the transcript but not for the protein. + # NM_032589.2 was permanently suppressed because currently there is support for the transcript but not for the protein. cds: 150,425 hgnc: DSCR8 genomic_region: NC_000021.8 (39493545..39528605) + gene_id: 84677 NM_176886.1: cds: 1,900 hgnc: TAS2R45 genomic_region: NW_003571050.1 (327525..328424, complement) + gene_id: 259291 @@ -90,6 +104,7 @@ NM_002457.4: cds: 28,15897 hgnc: MUC2 genomic_region: NC_000011.9 (1074875..1104417) + gene_id: 4583 # Case 2: overall low coverage and/or identity. 
@@ -99,6 +114,7 @@ NM_001277444.1: cds: 76,3411 hgnc: NBPF9 genomic_region: NC_000001.10 (144811743..144830407) + gene_id: 400818 # Case 3: high identity alignments but with large gaps. These @@ -110,18 +126,21 @@ NM_031421.4: cds: 131,2149 hgnc: TTC25 genomic_region: NC_000017.10 (40086888..40117669) + gene_id: 83538 NM_001349168.1: # Splign alignment has 159 nt unaligned exonic sequence. This is unusable. -Reece 2020-04-08 cds: 239,4762 hgnc: DCAF1 genomic_region: NC_000003.11 (51433298..51534018, complement) + gene_id: 9730 NM_001733.5: # Splign alignment has 232 nt unaligned exonic sequence. This is unusable. -Reece 2020-04-08 cds: 220,2337 hgnc: C1R genomic_region: NC_000012.11 (7241205..7245043, complement) , (7187513..7189412, complement) + gene_id: 715 # Transcript, gene, and genomic alignment info # cds start,end (in human, 1-based coordinates) and hgnc symbol @@ -132,53 +151,64 @@ NM_001038633.3: # transcript_accession cds: 893,1684 # CDS start and end, 1-based inclusive hgnc: RSPO1 # HGNC *symbol* genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/284654 + gene_id: 284654 NM_005363.3: # transcript_accession cds: 208,1152 # CDS start and end, 1-based inclusive hgnc: MAGEA6 # HGNC *symbol* genomic_region: NC_000023.10 (151867245..151870814) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/4105 + gene_id: 4105 NM_006561.3: # transcript_accession cds: 161,1726 # CDS start and end, 1-based inclusive hgnc: CELF2 # HGNC *symbol* genomic_region: NC_000010.10 (10838851..11378674) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/10659 + gene_id: 10659 NM_001242908.1: # transcript_accession cds: 714,1505 # CDS start and end, 1-based inclusive hgnc: RSPO1 # HGNC *symbol* genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/284654 + gene_id: 284654 NM_001242909.1: # transcript_accession cds: 474,1184 # CDS start and end, 1-based inclusive hgnc: RSPO1 # HGNC *symbol* genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/259291 + gene_id: 284654 NM_001242910.1: # transcript_accession cds: 714,1316 # CDS start and end, 1-based inclusive hgnc: RSPO1 # HGNC *symbol* genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/284654 + gene_id: 284654 NM_001012709.1: # transcript_accession cds: 46,912 # CDS start and end, 1-based inclusive hgnc: KRTAP5-4 # HGNC *symbol* genomic_region: NC_000011.9 (1642188..1643368, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/387267 + gene_id: 387267 NM_001123068.1: # transcript_accession cds: 34,528 # CDS start and end, 1-based inclusive hgnc: COAS-2 # HGNC *symbol* genomic_region: NC_000001.10 (143767144..143767881, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/644591 + gene_id: 644591 NM_130797.2: # transcript_accession cds: 130,2727 # CDS start and end, 1-based inclusive hgnc: DPPX # HGNC *symbol* genomic_region: NC_000007.13 (153584419..154264025) , (154400205..154685995) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/1804 + gene_id: 1804 NM_033060.2: # transcript_accession cds: 42,425 # CDS start and end, 1-based inclusive hgnc: KRTAP4-1 # HGNC *symbol* genomic_region: NC_000017.10 (39340352..39341147, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/1804 + gene_id: 85285 NM_033060.3: # transcript_accession cds: 
58,441 # CDS start and end, 1-based inclusive hgnc: KRTAP4-1 # HGNC *symbol* genomic_region: NC_000017.10 (39340352..39341163, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/1804 + gene_id: 85285 diff --git a/misc/export_mappings.py b/misc/export_mappings.py new file mode 100644 index 0000000..ded3ddb --- /dev/null +++ b/misc/export_mappings.py @@ -0,0 +1,184 @@ +import argparse +import logging +import re +from bioutils.assemblies import make_name_ac_map +from contextlib import ExitStack +from dataclasses import dataclass, field + +import psycopg2 +import six + +import uta + +logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger("export_mappings") + + +UTA_SCHEMA_VERSION_SQL = """ +select value as schema_version +from meta +where key='schema_version'; +""" + +ASSOCIATED_ACCESSIONS_SQL = """ +select aa.tx_ac, aa.pro_ac +from associated_accessions as aa +where aa.tx_ac='{}'; +""" + +# get_tx_mapping_options +TX_MAPPING_OPTIONS_SQL = """ +select distinct tx_ac,alt_ac,alt_aln_method +from tx_exon_aln_v where tx_ac='{}' and exon_aln_id is not NULL +order by alt_ac,alt_aln_method; +""" + +# get_tx_info +TX_V1_INFO_SQL = """ +select hgnc, cds_start_i, cds_end_i, tx_ac, alt_ac, alt_aln_method +from transcript T +join exon_set ES on T.ac=ES.tx_ac +where tx_ac='{}' and alt_ac='{}' and alt_aln_method='{}'; +""" + +TX_V2_INFO_SQL = """ +select G.hgnc, T.cds_start_i, T.cds_end_i, ES.tx_ac, ES.alt_ac, ES.alt_aln_method +from gene G +join transcript T on G.gene_id=T.gene_id +join exon_set ES on T.ac=ES.tx_ac +where tx_ac='{}' and alt_ac='{}' and alt_aln_method='{}'; +""" + +EXON_SET_SQL = """ +select * +from tx_exon_aln_v +where tx_ac='{}' and alt_ac='{}' and alt_aln_method='{}' +order by alt_start_i; +""" + +TX_INDENTITY_SQL = """ +select distinct(tx_ac), alt_ac, alt_aln_method, cds_start_i, cds_end_i, lengths, hgnc +from tx_def_summary_v +where tx_ac='{}'; +""" + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Export transcript alignments for a given genome build from UTA database." 
+ ) + parser.add_argument("transcripts_file", type=str) + parser.add_argument("--genome-build", type=str, default="GRCh37.p13") + parser.add_argument("--db-url", default="postgresql://uta_admin@localhost/uta") + parser.add_argument("--schema-name", default="uta_20210129") + return parser.parse_args() + + +def _get_cursor(con, schema_name): + cur = con.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor) + cur.execute(f"set search_path = {schema_name}") + return cur + + +def _get_rows(cur, sql): + cur.execute(sql) + return cur.fetchall() + + +def _get_chr_ac_map(genome_build): + filtered_chr_ac_map = {} + + def_name_ac_map = make_name_ac_map(assy_name=genome_build, primary_only=True) + for chr_name in list(map(str, range(1, 23))) + ["X", "Y"]: + filtered_chr_ac_map[chr_name] = def_name_ac_map.get(chr_name) + + return filtered_chr_ac_map + + +def main(transcripts_file, genome_build, db_url, schema_name): + logger.info(f"connecting to {db_url}") + session = uta.connect(db_url) + + con = session.bind.pool.connect() + cur = _get_cursor(con, schema_name) + + chr_to_acc_dict = _get_chr_ac_map(genome_build=genome_build) + + schema_version = _get_rows(cur, UTA_SCHEMA_VERSION_SQL)[0].schema_version + if schema_version == "1.1": + TX_INFO_SQL = TX_V1_INFO_SQL + else: + TX_INFO_SQL = TX_V2_INFO_SQL + + # read in transcripts + transcripts = [] + with open(transcripts_file, "r") as f: + for line in f: + if line.startswith("accession"): + continue + accession, chrom = line.rstrip("\r\n").split("\t") + transcripts.append((accession, chrom)) + + # setup context managers for file writers + with ExitStack() as stack: + # associated acccessions + assocacs_fh = stack.enter_context( + open(f"{schema_name}_associated_accessions.tsv", "w") + ) + assocacs_fh.write("tx_ac\tpro_ac\n") + + # transcript info + txinfo_fh = stack.enter_context(open(f"{schema_name}_transcript_info.tsv", "w")) + txinfo_fh.write("hgnc\ttx_ac\tcds_start_i\tcds_end_i\talt_ac\talt_aln_method\n") + + # exon sets + exons_fh = stack.enter_context(open(f"{schema_name}_exon_sets.tsv", "w")) + exons_fh.write("hgnc\ttx_ac\talt_ac\talt_aln_method\talt_strand\tords\ttx_ac_se_i\talt_ac_se_i\tcigars\n") + + # transcript identity + tx_identity_fh = stack.enter_context( + open(f"{schema_name}_transcript_identity.tsv", "w") + ) + + logger.info("querying database for transcript mappings...") + for i, (tx_ac, chrom) in enumerate(transcripts): + assocacs_rows = _get_rows(cur, ASSOCIATED_ACCESSIONS_SQL.format(tx_ac)) + for row in assocacs_rows: + assocacs_fh.write(f"{row.tx_ac}\t{row.pro_ac}\n") + + alt_ac = chr_to_acc_dict.get(chrom) + for alt_aln_method in ("splign", "splign-manual"): + txinfo_rows = _get_rows( + cur, TX_INFO_SQL.format(tx_ac, alt_ac, alt_aln_method) + ) + if txinfo_rows: + for row in txinfo_rows: + txinfo_fh.write( + f"{row.hgnc}\t{row.tx_ac}\t{row.cds_start_i}\t{row.cds_end_i}\t{row.alt_ac}\t{row.alt_aln_method}\n" + ) + exons_rows = _get_rows( + cur, EXON_SET_SQL.format(tx_ac, alt_ac, alt_aln_method) + ) + if exons_rows: + hgnc = exons_rows[0].hgnc + tx_ac = exons_rows[0].tx_ac + alt_ac = exons_rows[0].alt_ac + alt_aln_method = exons_rows[0].alt_aln_method + alt_strand = exons_rows[0].alt_strand + ords, tx_ac_se_i, alt_ac_se_i, cigars = [], [], [], [] + for row in sorted(exons_rows, key=lambda x: x.ord): + ords.append(str(row.ord)) + tx_ac_se_i.append(f"{row.tx_start_i},{row.tx_end_i}") + alt_ac_se_i.append(f"{row.alt_start_i},{row.alt_end_i}") + cigars.append(row.cigar) + exons_fh.write( + 
f"{hgnc}\t{tx_ac}\t{alt_ac}\t{alt_aln_method}\t{alt_strand}\t{';'.join(ords)}\t{';'.join(tx_ac_se_i)}\t{';'.join(alt_ac_se_i)}\t{';'.join(cigars)}\n" + ) + + if i % 2500 == 0 and i > 0: + logger.info(f"processed {i} transcripts") + + +if __name__ == '__main__': + arguments = parse_args() + main(arguments.transcripts_file, arguments.genome_build, arguments.db_url, arguments.schema_name) diff --git a/misc/gene-update/backfill_gene_id.py b/misc/gene-update/backfill_gene_id.py new file mode 100644 index 0000000..350ef46 --- /dev/null +++ b/misc/gene-update/backfill_gene_id.py @@ -0,0 +1,117 @@ +import argparse +import logging + +from datetime import datetime +from sqlalchemy.orm import Session +from sqlalchemy import text + +import uta +from uta.models import Gene, Transcript +from uta.tools.file_utils import open_file + + +logger = None +n = 50000 + + +def backfill_gene(uta_session: Session, gene_update_file: str) -> None: + logger.info("Dropping gene table contents") + uta_session.execute(text("DELETE FROM uta.gene;")) + uta_session.commit() + + logger.info(f"Back filling gene table from {gene_update_file}") + now_ts = datetime.now() + i = 0 + new_genes = [] + with open_file(gene_update_file) as f: + for line in f: + if line.startswith("gene_id"): + continue + + if i % n == 0: + if i > 0: + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Processing chunk {int(i/n) + 1}") + new_genes = [] + + gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t") + # set timestamp from file string, if empty set to now. + if added == "": + added_ts = now_ts + else: + added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f") + + # clean up aliases + aliases = aliases.replace("{", "").replace("}", "") + if aliases == "-": + aliases = None + + gene = Gene( + gene_id=gene_id, + hgnc=hgnc, + maploc=maploc if maploc else None, + descr=desc if desc else None, + summary=summary if desc else None, + aliases=aliases if aliases else None, + added=added_ts, + ) + i += 1 + new_genes.append(gene) + + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Inserted {i} total genes") + + +def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None: + logger.info("Backfilling gene_id in transcript table") + tx_ac_to_gene_id = {} + + logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}") + with open_file(transcript_update_file) as f: + for line in f: + if line.startswith("origin"): + continue + _, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t") + tx_ac_to_gene_id[tx_ac] = gene_id + logger.info(f" - {len(tx_ac_to_gene_id)} mappings read") + + i = 0 + txs = [] + for tx_ac, gene_id in tx_ac_to_gene_id.items(): + if i % n == 0: + if i > 0: + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + + logger.info(f"Processing chunk {int(i/n) + 1}") + txs = [] + + tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one() + tx.gene_id = gene_id + txs.append(tx) + i += 1 + + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + uta_session.commit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables") + parser.add_argument("db_url", help="URL of the UTA database") + parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene 
table") + parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table") + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + logger = logging.getLogger("backfill_gene_id") + + session = uta.connect(args.db_url) + + backfill_gene(session, args.gene_update_file) + backfill_transcript(session, args.transcript_update_file) + session.close() diff --git a/misc/gene-update/docker-compose-gene-update.yml b/misc/gene-update/docker-compose-gene-update.yml new file mode 100644 index 0000000..2245f44 --- /dev/null +++ b/misc/gene-update/docker-compose-gene-update.yml @@ -0,0 +1,17 @@ +# docker compose file for the NCBI gene_id update and backfill procedure + +version: '3' + +services: + uta-gene-update: + image: uta-update + command: misc/gene-update/upgrade-uta-schema.sh ${UTA_ETL_NEW_UTA_VERSION} + depends_on: + uta: + condition: service_healthy + volumes: + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_WORK_DIR}:/uta-gene-update/work + - ${UTA_ETL_LOG_DIR}:/uta-gene-update/logs + working_dir: /opt/repos/uta + network_mode: host diff --git a/misc/gene-update/gene_update.tsv.gz b/misc/gene-update/gene_update.tsv.gz new file mode 100644 index 0000000..9c44bab Binary files /dev/null and b/misc/gene-update/gene_update.tsv.gz differ diff --git a/misc/gene-update/transcript_update.tsv.gz b/misc/gene-update/transcript_update.tsv.gz new file mode 100644 index 0000000..b5d5da5 Binary files /dev/null and b/misc/gene-update/transcript_update.tsv.gz differ diff --git a/misc/gene-update/upgrade-uta-schema.sh b/misc/gene-update/upgrade-uta-schema.sh new file mode 100755 index 0000000..ba66c0b --- /dev/null +++ b/misc/gene-update/upgrade-uta-schema.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# This script is used to upgrade older UTA schemas (specifically uta_20210129b) to a newer version. +# Part of this upgrade is to introduce gene_id to the gene and transcript tables. The columns are +# added with a Alembic migration. Then a data migration to back fill the new columns. Then a second +# Alembic migration to add the constraints to the columns and update primary and foreign keys. + +if [ "$#" -lt 1 ] +then + echo "error: too few arguments, you provided $#, 1 required" + echo "usage: upgrade-uta-schema.sh " + exit 1 +fi + +set -euxo pipefail + +source_uta_v="uta_20210129b" +working_uta_v="uta" +dest_uta_v=$1 +tmp_dumps_dir="/tmp/dumps" +mkdir -p $tmp_dumps_dir + +## setup working uta schema +# delete schema if exists +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;" + +# dump source version +pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \ + gzip -c > $tmp_dumps_dir/"$source_uta_v".pgd.gz + +# create new schema +gzip -cdq $tmp_dumps_dir/"$source_uta_v".pgd.gz | \ + sbin/pg-dump-schema-rename "$source_uta_v" "$working_uta_v" | \ + sbin/pg-dump-schema-rename "uta_1_1" "$working_uta_v" | \ + psql -U uta_admin -h localhost -d uta -aeE + +## upgrade working uta schema +# set initial Alembic migration so it is not ran. 
+alembic -c etc/alembic.ini stamp edadb97f6502 + +# run Alembic migration to add gene_id to gene and transcript tables +alembic -c etc/alembic.ini upgrade 595a586e6de7 + +# run data migration to back fill gene_id +python misc/gene-update/backfill_gene_id.py \ + postgresql://uta_admin:@localhost/uta \ + misc/gene-update/gene_update.tsv.gz \ + misc/gene-update/transcript_update.tsv.gz + +# run Alembic migrations to add constraints and update existing views +alembic -c etc/alembic.ini upgrade head + +## Rename schema to destination schema name +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $dest_uta_v CASCADE;" +psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO $dest_uta_v"; +pg_dump -h localhost -U uta_admin -d uta -n "$dest_uta_v" | \ + gzip -c > "/uta-gene-update/work/$dest_uta_v.pgd.gz" diff --git a/misc/generate_alignment_metrics.py b/misc/generate_alignment_metrics.py new file mode 100644 index 0000000..d0e0e7f --- /dev/null +++ b/misc/generate_alignment_metrics.py @@ -0,0 +1,300 @@ +""" +To determine the quality of an alignment in UTA this script will compute metrics that can be used to evaluate +the quality of the alignment. + +The metrics are: + seq_length: max(exon.ends_i) + exon_count: count of exons linked to transcript (from genbank file) + aligned_exon_count: count of blocks from GFF file + exon_structure_mismatch: True if exon_count != aligned_exon_count + matches: count of matching bases between chromosome and transcript within alignment bounds + mismatches: count of mismatched bases between chromosome and transcript within alignment bounds + gap_count: count of gaps (indels) between chromosome and transcript sequenceswithin alignment bounds + aln_length: total number of alignment blocks, includes counts of indel positions + identity_gapped: matches / aln_length ## not gap compressed identity calculation + identity_ungapped: matches / (matches + mismatches) + coverage: (matches + mismatches + deletions) / seq_length + +Example: + + 1 11 21 31 41 + Chromo+: 1 CCAGTGTGGC CGATACCCCA GGTTGGC-AC GCATCGTTGC CTTGGTAAGC 49 + |||||||||| |||| ||| || || || |||||||||| |||||||||| + Refseq+: 1 CCAGTGTGGC CGATGCCC-- -GT--GCTAC GCATCGTTGC CTTGGTAAGC 45 + + seq_length: 45 + exon_count: 1 + aligned_exon_count: 1 + matches: 43 + mismatches: 1 + gap_count: 3 + aln_length: 43 matches + 1 mismatch + 3bp insertion + 2bp insertion + 1bp deletion = 50 + identity_gapped: 43 / 50 = 0.86 + identity_ungapped: 43 / (43 + 1) = 0.9772 + coverage: (43 + 1 + 1) / 45 = 1.0 + +Usage: + python generate_alignment_metrics.py --db-url --schema-name +""" + +import argparse +import logging +import re +from dataclasses import dataclass, field + +import psycopg2 +import six + +import uta + + +@dataclass +class CigarAln: + op: str + length: int + + +@dataclass +class ExonAln: + tx_start_i: int + tx_end_i: int + alt_start_i: int + alt_end_i: int + cigar: str + cigar_alns: list[CigarAln] = field(default_factory=list) + + +@dataclass +class TxAln: + hgnc: str + tx_ac: str + seq_length: int + exon_count: int + alt_ac: str + alt_aln_method: str + alt_strand: int + aligned_exon_count: int + exon_alignments: list[ExonAln] = field(default_factory=list) + + @staticmethod + def metrics_header(): + return "{}\n".format("\t".join([ + "hgnc", + "tx_ac", + "seq_length", + "exon_count", + "alt_ac", + "alt_aln_method", + "alt_strand", + "aligned_exon_count", + "exon_structure_mismatch", + "matches_bps", + "mismatches_bps", + "gap_count", + "deletions_bps", + "aln_length", + "identity_gap", + 
"identity_ungap", + "coverage", + ])) + + def to_metric_output_row(self): + return "{}\n".format("\t".join(map(str, [ + self.hgnc, + self.tx_ac, + self.seq_length, + self.exon_count, + self.alt_ac, + self.alt_aln_method, + self.alt_strand, + self.aligned_exon_count, + not self.exon_count == self.aligned_exon_count, + self.matches(), + self.mismatches(), + self.gap_count(), + self.deletions(), + self.aln_length(), + self.identity_gap(), + self.identity_ungap(), + self.coverage(), + ]))) + + def matches(self): + matches = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + if cigar_aln.op == MATCH: + matches += cigar_aln.length + return matches + + def mismatches(self): + mismatches = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + if cigar_aln.op == MM: + mismatches += cigar_aln.length + return mismatches + + def deletions(self): + deletions = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + if cigar_aln.op == DEL: + deletions += cigar_aln.length + return deletions + + def gap_count(self): + gaps = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + if cigar_aln.op == DEL or cigar_aln.op == INS: + gaps += 1 + return gaps + + def aln_length(self): + length = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + length += cigar_aln.length + return length + + def identity_gap(self): + return f"{self.matches() / float(self.aln_length()):.6f}" + + def identity_ungap(self): + return f"{self.matches() / float(self.matches() + self.mismatches()):.6f}" + + def coverage(self): + return f"{(self.matches() + self.mismatches() + self.deletions()) / float(self.seq_length):.6f}" + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("alignment_metrics") + +p = re.compile("\d+[=DIX]") +MATCH = "=" +INS = "I" +DEL = "D" +MM = "X" + +TX_EXON_SET_SUMMARY_ALL_BUILD37_SQL = """ +select * +from tx_exon_set_summary_mv as mv +where mv.alt_aln_method in ('splign', 'splign-manual') and mv.tx_ac ~ 'N[MR]_*' and mv.tx_ac !~ '/' + and mv.alt_ac in ('NC_000001.10', 'NC_000002.11', 'NC_000003.11', 'NC_000004.11', 'NC_000005.9', 'NC_000006.11', + 'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10', 'NC_000011.9', 'NC_000012.11', + 'NC_000013.10', 'NC_000014.8', 'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9', + 'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10', 'NC_000023.10', 'NC_000024.9' + ); +""" + +TX_EXON_SET_SUMMARY_SQL = """ +select mv.ends_i[mv.n_exons] as tx_length, * +from tx_exon_set_summary_mv as mv +where mv.tx_ac='{tx_ac}' and mv.alt_ac='{alt_ac}' and mv.alt_aln_method='{alt_aln_method}'; +""" + +TX_EXON_ALN_SQL = """ +select v.hgnc, v.tx_ac, v.alt_ac, v.alt_aln_method, v.alt_strand, v.ord, v.tx_start_i, v.tx_end_i, + v.alt_start_i, v.alt_end_i, v.cigar +from tx_exon_aln_v as v +where tx_ac='{tx_ac}' and alt_ac='{alt_ac}' and alt_aln_method='{alt_aln_method}' +order by ord; +""" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Generate alignment metrics for transcript alignments") + parser.add_argument("output_file", type=str) + parser.add_argument("--db-url", default="postgresql://uta_admin@localhost/uta") + parser.add_argument("--schema-name", default="uta_20210129b") + return parser.parse_args() + + +def _get_cursor(con, schema_name): + cur = con.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor) + cur.execute(f"set search_path = {schema_name}") + return cur + + 
+def _get_alignment(cur, tx_ac, alt_ac, alt_aln_method, aligned_exon_count): + # get transcript exon count + cur.execute( + TX_EXON_SET_SUMMARY_SQL.format(tx_ac=tx_ac, alt_ac=tx_ac, alt_aln_method="transcript") + ) + row = cur.fetchone() + + if row is None: + logger.warn(f"no transcript alignment found for {tx_ac} {alt_ac} {alt_aln_method}") + tx_exon_count = None + else: + tx_exon_count = row.n_exons + tx_seq_length = row.tx_length + + cur.execute( + TX_EXON_ALN_SQL.format( + tx_ac=tx_ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method + ) + ) + rows = cur.fetchall() + tx_aln = None + + for row in rows: + if tx_aln is None: + tx_aln = TxAln( + hgnc=row.hgnc, + tx_ac=row.tx_ac, + seq_length=tx_seq_length, + exon_count=tx_exon_count, + alt_ac=row.alt_ac, + alt_aln_method=row.alt_aln_method, + alt_strand=row.alt_strand, + aligned_exon_count=aligned_exon_count, + ) + exon_aln = ExonAln( + tx_start_i=row.tx_start_i, + tx_end_i=row.tx_end_i, + alt_start_i=row.alt_start_i, + alt_end_i=row.alt_end_i, + cigar=row.cigar, + ) + for match in p.finditer(exon_aln.cigar): + cigar_aln = CigarAln(op=match.group()[-1], length=int(match.group()[:-1])) + exon_aln.cigar_alns.append(cigar_aln) + tx_aln.exon_alignments.append(exon_aln) + return tx_aln + + +def main(db_url, schema_name, output_file): + logger.info(f"connecting to {db_url}") + session = uta.connect(db_url) + + con = session.bind.pool.connect() + cur = _get_cursor(con, schema_name) + + # get tx_ac/alt_ac pairs + tx_alt_ac_pairs = [] + cur.execute(TX_EXON_SET_SUMMARY_ALL_BUILD37_SQL) + rows = cur.fetchall() + + for row in rows: + tx_alt_ac_pairs.append((row.tx_ac, row.alt_ac, row.alt_aln_method, row.n_exons)) + + logger.info(f"writing metrics to {output_file} for {len(tx_alt_ac_pairs)} transcript alignments") + with open(output_file, "w") as f_out: + f_out.write(TxAln.metrics_header()) + i = 0 + for tx_ac, alt_ac, alt_aln_method, aligned_exon_count in tx_alt_ac_pairs: + tx_aln = _get_alignment( + cur, tx_ac, alt_ac, alt_aln_method, aligned_exon_count + ) + f_out.write(tx_aln.to_metric_output_row()) + f_out.flush() + i += 1 + if i % 500 == 0: + logger.info(f" - {i} transcript alignments processed") + + +if __name__ == "__main__": + arguments = parse_args() + main(arguments.db_url, arguments.schema_name, arguments.output_file) diff --git a/misc/mito-transcripts/docker-compose-mito-extract.yml b/misc/mito-transcripts/docker-compose-mito-extract.yml new file mode 100644 index 0000000..c5a8a0d --- /dev/null +++ b/misc/mito-transcripts/docker-compose-mito-extract.yml @@ -0,0 +1,13 @@ +# docker compose file for the mito transcript extraction for the UTA update procedure + +version: '3' + +services: + mito-extract: + image: uta-update + command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee /mito-extract/logs/mito.log + volumes: + - ${UTA_ETL_WORK_DIR}:/mito-extract/work + - ${UTA_ETL_LOG_DIR}:/mito-extract/logs + working_dir: /opt/repos/uta + network_mode: host diff --git a/misc/refseq-historical-backfill/docker-compose-backfill.yml b/misc/refseq-historical-backfill/docker-compose-backfill.yml new file mode 100644 index 0000000..6e773b0 --- /dev/null +++ b/misc/refseq-historical-backfill/docker-compose-backfill.yml @@ -0,0 +1,14 @@ +# docker compose file for the RefSeq historical backfill procedure + +version: '3' + +services: + uta-extract-historical: + image: uta-update + command: misc/refseq-historical-backfill/uta-extract-historical /ncbi-dir /uta-extract/work /uta-extract/logs + volumes: + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + 
- ${UTA_ETL_WORK_DIR}:/uta-extract/work + - ${UTA_ETL_LOG_DIR}:/uta-extract/logs + working_dir: /opt/repos/uta + network_mode: host diff --git a/misc/refseq-historical-backfill/ncbi_extract_gbff.py b/misc/refseq-historical-backfill/ncbi_extract_gbff.py new file mode 100755 index 0000000..cab3277 --- /dev/null +++ b/misc/refseq-historical-backfill/ncbi_extract_gbff.py @@ -0,0 +1,197 @@ +""" +Extract and write all files needed by UTA load, except alt accession exonsets (aka, alignments). From a single +GBFF file we can create dna fasta, protein fasta, associated accessions, geneinfo, and txinfo files. +""" +import argparse +import gzip +import importlib_resources +import io +import logging +import logging.config +from collections import Counter +from contextlib import ExitStack +from typing import Iterable + +from Bio.Seq import Seq +import Bio.SeqIO +from Bio.SeqRecord import SeqRecord + +from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter +from uta.formats.geneinfo import GeneInfo, GeneInfoWriter +from uta.formats.txinfo import TxInfo, TxInfoWriter +from uta.parsers.seqrecord import SeqRecordFacade, SeqRecordFeatureError +from uta.tools.file_utils import open_file + + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + ap = argparse.ArgumentParser( + description=__doc__, + ) + ap.add_argument("GBFF_FILES", nargs="+") + ap.add_argument("--origin", "-o", default="NCBI") + ap.add_argument("--prefix", "-p", default="") + ap.add_argument("--output_dir", "-d", default=".", type=str) + opts = ap.parse_args() + return opts + + +def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> None: + if prefix: + prefix = f"{prefix}." 
+ + # setup context managers for file writers + with ExitStack() as stack: + # DNA fasta file + dna_fasta_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}rna.fna.gz", "wb"), encoding="utf-8" + ) + ) + + # Protein fasta file + protein_fasta_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}protein.faa.gz", "wb"), + encoding="utf-8", + ) + ) + + geneinfo_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}geneinfo.gz", "wb"), encoding="utf-8" + ) + ) + geneinfo_writer = GeneInfoWriter(geneinfo_fh) + + txinfo_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}txinfo.gz", "w"), encoding="utf-8" + ) + ) + txinfo_writer = TxInfoWriter(txinfo_fh) + + assocacs_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}assocacs.gz", "w"), encoding="utf-8" + ) + ) + assocacs_writer = GeneAccessionsWriter(assocacs_fh) + + total_genes = set() + total_skipped = set() + all_prefixes = Counter() + for gbff_fn in gbff_files: + logger.info(f"Processing {gbff_fn}") + gbff_file_handler = stack.enter_context(open_file(gbff_fn)) + i = 0 + genes = set() + skipped = set() + prefixes = Counter() + for r in Bio.SeqIO.parse(gbff_file_handler, "gb"): + srf = SeqRecordFacade(r) + + # skip transcripts where the exon structure is unknown + if not srf.exons_se_i: + skipped.add(srf.id) + continue + + prefixes.update([srf.id[:2]]) + try: + fna_record = SeqRecord( + Seq(srf.feature_seq), id=srf.id, description="" + ) + dna_fasta_fh.write(fna_record.format("fasta")) + + if srf.gene_id not in genes: + geneinfo_writer.write( + GeneInfo( + gene_id=srf.gene_id, + gene_symbol=srf.gene_symbol, + tax_id="9606", + hgnc=srf.gene_symbol, + maploc="", + aliases=srf.gene_synonyms, + type=srf.gene_type, + summary="", + descr="", + xrefs=srf.db_xrefs, + ) + ) + + txinfo_writer.write( + TxInfo( + origin=origin, + ac=srf.id, + gene_id=srf.gene_id, + gene_symbol=srf.gene_symbol, + cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i), + exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i), + codon_table=srf.codon_table, + transl_except=TxInfo.serialize_transl_except( + srf.transl_except + ), + ) + ) + + # only write cds features for protein-coding transcripts + if srf.cds_feature is not None: + pro_record = SeqRecord( + Seq(srf.cds_translation), + id=srf.cds_protein_id, + description=srf.cds_product, + ) + protein_fasta_fh.write(pro_record.format("fasta")) + + assocacs_writer.write( + GeneAccessions( + origin=origin, + gene_id=srf.gene_id, + gene_symbol=srf.gene_symbol, + tx_ac=srf.id, + pro_ac=srf.cds_protein_id, + ) + ) + + genes.add(srf.gene_id) + i += 1 + if i % 5000 == 0: + logger.info( + " - {ng} genes in {fn} ({c}); {s} transcripts skipped".format( + ng=len(genes), + fn=gbff_fn, + c=prefixes, + s=len(skipped), + ) + ) + except SeqRecordFeatureError as e: + logger.error(f"SeqRecordFeatureError processing {r.id}: {e}") + raise + except ValueError as e: + logger.error(f"ValueError processing {r.id}: {e}") + raise + + + logger.info( + "{ng} genes in {fn} ({c}); {s} transcripts skipped".format( + ng=len(genes), fn=gbff_fn, c=prefixes, s=len(skipped) + ) + ) + total_genes ^= genes + total_skipped ^= skipped + all_prefixes += prefixes + logger.info( + "{ng} genes in {nf} ({c}); {s} transcripts skipped".format( + ng=len(total_genes), nf=len(gbff_files), c=all_prefixes, s=len(total_skipped) + ) + ) + + +if __name__ == "__main__": + args = parse_args() + main(args.GBFF_FILES, args.origin, args.prefix, 
args.output_dir) diff --git a/misc/refseq-historical-backfill/uta-extract-historical b/misc/refseq-historical-backfill/uta-extract-historical new file mode 100755 index 0000000..1586cfe --- /dev/null +++ b/misc/refseq-historical-backfill/uta-extract-historical @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# Download, then extract intermediate files out of the NCBI historical alignment files. + +set -e + +ncbi_dir=$1 +working_dir=$2 +log_dir=$3 + +if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/uta-extract-historical ' + exit 1 +fi + +download_ncbi_file () { + download_path=$1 + download_dir=$2 + + download_module="${download_path%%/*}" + download_source="ftp.ncbi.nlm.nih.gov::$download_path" + download_destination="$download_dir/$download_module" + + mkdir -p $download_destination + echo "Downloading $download_source to $download_destination" + rsync --no-motd -DHPRprtv "$download_source" "$download_destination" +} + +relative_path="refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical" + +# download historical genbank file +file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz" +download_ncbi_file $file_path $ncbi_dir + +# download historical gff file +file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz" +download_ncbi_file $file_path $ncbi_dir + +# extract intermediate files from genbank file +python misc/refseq-historical-backfill/ncbi_extract_gbff.py \ + "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz" \ + --output_dir "$working_dir" 2>&1 | tee "$log_dir/ncbi-parse-historical-ggbb.log" + +# extract exonset intermediate file from gff file +python sbin/ncbi_parse_genomic_gff.py "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz" | \ + gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | tee "$log_dir/ncbi-parse-historical-gff.log" + +# filter exonset alignments by txinfo +sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \ + --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \ + tee "$log_dir/filter_exonset_transcripts.log" diff --git a/misc/splign-manual/docker-compose-splign-manual.yml b/misc/splign-manual/docker-compose-splign-manual.yml new file mode 100644 index 0000000..7ef0e87 --- /dev/null +++ b/misc/splign-manual/docker-compose-splign-manual.yml @@ -0,0 +1,16 @@ +# docker compose file for the splign-manual uta update procedure + +version: '3' + +services: + splign-manual: + image: uta-update + command: sbin/uta-splign-manual ${UTA_ETL_OLD_UTA_VERSION} /uta-splign-manual/input /uta-splign-manual/work /uta-splign-manual/logs + depends_on: + uta: + condition: service_healthy + volumes: + - ${UTA_SPLIGN_MANUAL_DIR}:/uta-splign-manual/input + - ${UTA_ETL_WORK_DIR}:/uta-splign-manual/work + - ${UTA_ETL_LOG_DIR}:/uta-splign-manual/logs + network_mode: host diff --git a/misc/splign-manual/uta-splign-manual b/misc/splign-manual/uta-splign-manual new file mode 100755 index 0000000..cb1e9ee --- /dev/null +++ b/misc/splign-manual/uta-splign-manual @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +# Process splign-manual alignments + +set -euxo pipefail + +source_uta_v=$1 +input_dir=$2 +working_dir=$3 +log_dir=$4 + +if [ -z "$source_uta_v" ] || [ -z "$input_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: misc/uta-splign-manual ' + exit 1 +fi + +# set local variables and create working directories 
+loading_uta_v="uta" +working_dir="$working_dir/splign-manual" +log_dir="$log_dir/splign-manual" +mkdir -p "$log_dir" +mkdir -p "$working_dir" + +# Generate txinfo.gz and exonset.gz files +python sbin/generate-loading-data $input_dir/alignments/*.splign --txdata $input_dir/txdata.yaml \ + --output-dir $working_dir 2>&1 | tee "$log_dir/generate-loading-data.log" + +# Generate fasta files +seqrepo --root-directory "/biocommons/dl.biocommons.org/seqrepo" \ + export $(gzip -cdq $working_dir/txinfo.gz | cut -f2 | tail +2) \ + --instance-name "master" | gzip -c > $working_dir/seqs.fa.gz 2>&1 | tee "$log_dir/seqrepo-export.log" + +# Generate seqinfo.gz file +sbin/fasta-to-seqinfo -o NCBI $working_dir/seqs.fa.gz | gzip -c > $working_dir/seqinfo.gz 2>&1 | \ + tee "$log_dir/fasta-to-seqinfo.log" + +# Load seqinfo +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-seqinfo $working_dir/seqinfo.gz 2>&1 | \ + tee "$log_dir/load-seqinfo.log" + +# Load txinfo +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo $working_dir/txinfo.gz 2>&1 | \ + tee "$log_dir/load-txinfo.log" + +# Load exonset +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset $working_dir/exonset.gz 2>&1 | \ + tee "$log_dir/load-exonset.log" + +# Align exons +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | tee "$log_dir/align-exons.log" + +### run diff +sbin/uta-diff "$source_uta_v" "$loading_uta_v" 2>&1 | tee "$log_dir/uta-diff.log" + +### psql_dump +pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$working_dir/uta.pgd.gz" diff --git a/pyproject.toml b/pyproject.toml index 7bfc17d..af4e7e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "uta" dynamic = ["version"] description = "Universal Transcript Archive" readme = "README.md" -requires-python = ">=3.5" +requires-python = ">=3.9" license = {text = "Apache-2.0"} keywords = [ @@ -39,6 +39,7 @@ classifiers = [ ] dependencies = [ + "alembic", "attrs", "biocommons.seqrepo", "biopython>=1.69", @@ -47,17 +48,24 @@ dependencies = [ "configparser", "docopt", "eutils>=0.3.2", + "importlib_resources", + "more_itertools", "nose", "prettytable", "psycopg2-binary", "pytz", "recordtype", + "retry", "sqlalchemy", - "uta-align", + "uta-align>=0.3", ] [project.optional-dependencies] -test = ["coverage", "testing.postgresql"] +test = [ + "coverage", + "parameterized", + "testing.postgresql", +] [project.urls] # Optional "Homepage" = "https://github.com/biocommons/uta" diff --git a/sbin/assoc-acs-merge b/sbin/assoc-acs-merge index 7c7ec00..1cfbff9 100755 --- a/sbin/assoc-acs-merge +++ b/sbin/assoc-acs-merge @@ -15,7 +15,6 @@ duplicates""" import csv import gzip import io -import os import sys import attr @@ -43,15 +42,15 @@ if __name__ == "__main__": aars = dict() for fn in sys.argv[1:]: ifh = csv.DictReader(anyopen(fn), delimiter="\t") - + if ofh is None: ofh = csv.DictWriter(sys.stdout, fieldnames=out_header, delimiter="\t") ofh.writeheader() - + for r in ifh: if r["tx_ac"] in ("","-") or r["pro_ac"] in ("","-"): continue - + aar = AssAccRec(tx_ac=r["tx_ac"], pro_ac=r["pro_ac"], origin=r["origin"]) k = (aar.tx_ac, aar.origin) if k in aars: diff --git a/sbin/coalesce_exonsets.py b/sbin/coalesce_exonsets.py new file mode 100755 index 0000000..2ba0df8 --- /dev/null +++ b/sbin/coalesce_exonsets.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +""" +This script coalesces exonsets from multiple input files. It builds a cache of tx_ac/alt_ac pairs. 
If a mapping is +seen in a later input file, the exonset is skipped. The output is written to stdout. +""" + +import argparse +import logging.config +import sys +from typing import Dict, List, Tuple + +import importlib_resources + +from uta.formats.exonset import ExonSetReader, ExonSetWriter +from uta.tools.file_utils import open_file + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def coalesce_exonsets(exonset_files: List[str]) -> None: + skipped = 0 + esw = ExonSetWriter(sys.stdout) + seen_ess: Dict[Tuple[str, str], str] = {} + + for exonset_fn in exonset_files: + logger.info(f" - processing exonset file {exonset_fn}") + with open_file(exonset_fn) as f: + exonsets = ExonSetReader(f) + for exonset in exonsets: + key = (exonset.tx_ac, exonset.alt_ac) + if key in seen_ess: + logger.warning(f" - exon set for transcript {exonset.tx_ac}/{exonset.alt_ac} already " + f"seen in {seen_ess[(exonset.tx_ac, exonset.alt_ac)]}. Skipping.") + skipped += 1 + else: + seen_ess[key] = exonset_fn + esw.write(exonset) + + logger.info(f"Coalesced {len(seen_ess)} exonsets from {len(exonset_files)} files, skipped {skipped} duplicates.") + return seen_ess + + +def main(): + parser = argparse.ArgumentParser(description='Coalesce exonsets.') + parser.add_argument('exonsets', nargs="+", help='Path to the exonset file') + args = parser.parse_args() + + logger.info(f"Coalescing exonsets from {len(args.exonsets)} files") + coalesce_exonsets(args.exonsets) + + +if __name__ == '__main__': + main() diff --git a/sbin/exonset-to-seqinfo b/sbin/exonset-to-seqinfo index 1ccbf95..21544f3 100755 --- a/sbin/exonset-to-seqinfo +++ b/sbin/exonset-to-seqinfo @@ -5,18 +5,17 @@ import argparse import configparser as ConfigParser import gzip +import importlib_resources import itertools import logging import logging.config -import pkg_resources import re import sys from bioutils.digests import seq_md5 from biocommons.seqrepo import SeqRepo -# from multifastadb import MultiFastaDB -from uta.formats.exonset import ExonSet, ExonSetReader +from uta.formats.exonset import ExonSetReader from uta.formats.seqinfo import SeqInfo, SeqInfoWriter @@ -32,16 +31,15 @@ def parse_args(argv): required=True) ap.add_argument("--conf", default=[ - pkg_resources.resource_filename("uta", "../etc/global.conf")] - ) + importlib_resources.files("uta").joinpath("../../etc/global.conf") + ]) opts = ap.parse_args(argv) return opts if __name__ == "__main__": - logging_conf_fn = pkg_resources.resource_filename( - "uta", "etc/logging.conf") + logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") logging.config.fileConfig(logging_conf_fn) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -50,18 +48,16 @@ if __name__ == "__main__": opts = parse_args(sys.argv[1:]) - cf = ConfigParser.SafeConfigParser() + cf = ConfigParser.ConfigParser() for conf_fn in opts.conf: - cf.readfp(open(conf_fn)) - logger.info("loaded " + conf_fn) + cf.read_file(open(conf_fn)) + logger.info("loaded " + str(conf_fn)) in_fn = opts.FILES[0] in_fh = gzip.open(in_fn, 'rt') if in_fn.endswith(".gz") else open(in_fn) esr = ExonSetReader(in_fh) logger.info("opened " + in_fn) - #fa_dirs = cf.get("sequences", "fasta_directories").strip().splitlines() - #mfdb = MultiFastaDB(fa_dirs, use_meta_index=True) sr_dir = cf.get("sequences", "seqrepo") sr = SeqRepo(root_dir=sr_dir) logger.info("Opened 
sequence directories: " + sr_dir) diff --git a/sbin/filter_exonset_transcripts.py b/sbin/filter_exonset_transcripts.py new file mode 100755 index 0000000..1d97cc4 --- /dev/null +++ b/sbin/filter_exonset_transcripts.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import argparse +import csv +import logging.config +import sys + +import importlib_resources + +from uta.formats.exonset import ExonSetReader, ExonSetWriter +from uta.formats.txinfo import TxInfoReader +from uta.tools.file_utils import open_file + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def filter_exonset(exonset_file, transcript_ids, missing_ids_file): + with open_file(exonset_file) as es_f, open(missing_ids_file, 'w') as missing_f: + exonsets = ExonSetReader(es_f) + esw = ExonSetWriter(sys.stdout) + writer_missing = csv.writer(missing_f) + missing_acs = set() + + for exonset in exonsets: + if exonset.tx_ac in transcript_ids: + esw.write(exonset) + else: + logger.debug(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.") + writer_missing.writerow([exonset.tx_ac]) + missing_acs.add(exonset.tx_ac) + logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s)") + + +def main(): + parser = argparse.ArgumentParser(description='Filter exonset data.') + parser.add_argument('--tx-info', help='Path to the transcript info file') + parser.add_argument('--exonsets', help='Path to the exonset file') + parser.add_argument('--missing-ids', help='Path to the missing transcript ids file') + args = parser.parse_args() + + with open_file(args.tx_info) as f: + tx_reader = TxInfoReader(f) + transcript_ids = {row.ac for row in tx_reader} + filter_exonset(args.exonsets, transcript_ids, args.missing_ids) + + +if __name__ == '__main__': + main() diff --git a/loading/data/splign-manual/generate-loading-data b/sbin/generate-loading-data similarity index 70% rename from loading/data/splign-manual/generate-loading-data rename to sbin/generate-loading-data index 86744ed..0b2fad3 100755 --- a/loading/data/splign-manual/generate-loading-data +++ b/sbin/generate-loading-data @@ -6,6 +6,7 @@ import argparse import csv import gzip import logging +import os import sys import yaml @@ -20,7 +21,6 @@ method = "splign-manual" txinfo_fn = "txinfo.gz" exonset_fn = "exonset.gz" -txdata_fn = "txdata.yaml" def parse_args(argv): @@ -31,6 +31,16 @@ def parse_args(argv): "FILES", nargs="*" ) + ap.add_argument( + "--txdata", + required=True, + help="Path to txdata.yaml" + ) + ap.add_argument( + "--output-dir", + required=True, + help="Path to output directory" + ) opts = ap.parse_args(argv) return opts @@ -61,15 +71,19 @@ def parse_splign(fn, txdata): try: txd = txdata[tx_ac] except KeyError: - raise KeyError(f"{tx_ac}: no cds or hgnc info in {txdata_fn}") + raise KeyError(f"{tx_ac}: no cds or gene_symbol info in txdata") + + gene_symbol = txd["hgnc"] - hgnc = txd["hgnc"] - if hgnc is None: - _logger.warn(f"No HGNC symbol in {txdata_fn} for {tx_ac}") + gene_id = txd["gene_id"] + if gene_id is None: + msg = f"No gene id in txdata for {tx_ac}" + _logger.error(msg) + raise ValueError(msg) cds = txd["cds"] if cds is None: - _logger.warning(f"No CDS info {txdata_fn} for {tx_ac}; will be non-coding transcript") + _logger.warning(f"No CDS info txdata for {tx_ac}; will be non-coding transcript") cds_se_i = None else: cds = [int(i) for i in txd["cds"].split(",")] @@ -79,9 +93,12 
@@ def parse_splign(fn, txdata): txinfo = uta.formats.txinfo.TxInfo( origin=origin, ac=tx_ac, - hgnc=hgnc, + gene_id=gene_id, + gene_symbol=gene_symbol, cds_se_i=cds_se_i, - exons_se_i=tx_exons_str) + exons_se_i=tx_exons_str, + transl_except=None, + ) exonset = uta.formats.exonset.ExonSet( tx_ac=tx_ac, alt_ac=alt_ac, @@ -97,11 +114,11 @@ if __name__ == "__main__": opts = parse_args(sys.argv[1:]) - txdata = yaml.load(open(txdata_fn), Loader=yaml.SafeLoader) + txdata = yaml.load(open(opts.txdata), Loader=yaml.SafeLoader) + + txinfo_out = uta.formats.txinfo.TxInfoWriter(gzip.open(os.path.join(opts.output_dir, txinfo_fn), "wt")) + exonset_out = uta.formats.exonset.ExonSetWriter(gzip.open(os.path.join(opts.output_dir, exonset_fn), "wt")) - txinfo_out = uta.formats.txinfo.TxInfoWriter(gzip.open(txinfo_fn, "wt")) - exonset_out = uta.formats.exonset.ExonSetWriter(gzip.open(exonset_fn, "wt")) - for fn in opts.FILES: _logger.info("# " + fn) txinfo, exonset = parse_splign(fn, txdata) diff --git a/sbin/ncbi-download b/sbin/ncbi-download new file mode 100755 index 0000000..09ad4c2 --- /dev/null +++ b/sbin/ncbi-download @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# This script downloads the files needed for a UTA+SeqRepo update into to the given directory. +# +# DESTINATION_DIR will have a directory structure matching the source. + +set -e + +FILE_PATH_CONFIG=$1 +DOWNLOAD_DIR=$2 + +if [ -z "$FILE_PATH_CONFIG" ] || [ -z "$DOWNLOAD_DIR" ] +then + echo 'Usage: sbin/ncbi-download ' + exit 1 +else + echo "Downloading files to $DOWNLOAD_DIR" +fi + +grep -v -e '^#' -e '^$' "$FILE_PATH_CONFIG" | while read -r DOWNLOAD_PATH; do + # each top-level directory in NCBI is an rsync module. + # bash parameter expansion removes all content after first slash. + DOWNLOAD_MODULE="${DOWNLOAD_PATH%%/*}" + DOWNLOAD_SRC="ftp.ncbi.nlm.nih.gov::$DOWNLOAD_PATH" + DOWNLOAD_DST="$DOWNLOAD_DIR/$DOWNLOAD_MODULE" + mkdir -p $DOWNLOAD_DST + echo "Downloading $DOWNLOAD_SRC to $DOWNLOAD_DST" + rsync --no-motd -DHPRprtv "$DOWNLOAD_SRC" "$DOWNLOAD_DST" +done diff --git a/sbin/ncbi-parse-gbff b/sbin/ncbi-parse-gbff index 45897c6..110f01a 100755 --- a/sbin/ncbi-parse-gbff +++ b/sbin/ncbi-parse-gbff @@ -9,8 +9,8 @@ See uta.formats for a description of those file formats. 
In a nutshell, this means that you'll get data like this: ncbi.txinfo.gz: -origin ac hgnc cds_se_i exons_se_i -NCBI RefSeq NM_053283.2 DCD 62,395 0,120;120,159;159,261;261,351;351,517 +origin ac gene_id gene_symbol cds_se_i exons_se_i +NCBI RefSeq NM_053283.2 117159 DCD 62,395 0,120;120,159;159,261;261,351;351,517 ncbi.exonsets.gz: tx_ac alt_ac method strand exons_se_i @@ -27,23 +27,19 @@ from __future__ import division, unicode_literals import argparse from collections import Counter import gzip +import importlib_resources import io -import itertools import logging import logging.config -import os -import pprint -import pkg_resources import re import sys import Bio.SeqIO -import Bio.SeqRecord from bioutils.digests import seq_md5 -from uta.formats.exonset import ExonSet, ExonSetWriter from uta.formats.txinfo import TxInfo, TxInfoWriter -from uta.formats.geneaccessions import GeneAccessionsReader +from uta.parsers.seqrecord import SeqRecordFacade, SeqRecordFeatureError + origin = "NCBI" @@ -62,38 +58,6 @@ def parse_args(argv): return opts -class SeqRecordFacade(Bio.SeqRecord.SeqRecord): - - def __init__(self, seqrecord): - self._sr = seqrecord - - @property - def id(self): - return self._sr.id - - @property - def hgnc(self): - genes = [f for f in self._sr.features if f.type == "gene"][ - 0].qualifiers["gene"] - assert len(genes) == 1 - return genes[0] - - @property - def cds_se_i(self): - try: - cds = [f for f in self._sr.features if f.type == "CDS"][0] - except IndexError: - return None - return (cds.location.start.real, cds.location.end.real) - - @property - def exons_se_i(self): - # ,"misc_feature"]] - exons = [f for f in self._sr.features if f.type in ["exon"]] - se = [(f.location.start.real, f.location.end.real) for f in exons] - return se - - def gbff_filter(it): """pre-filter genbank file stream for records that match a specific LOCUS pattern""" delim = "//" @@ -107,6 +71,7 @@ def gbff_filter(it): if line.startswith(delim): emit = False + def gbff_block_reader(it): """yield strings, each representing a full genbank record""" delim = "//" @@ -122,9 +87,9 @@ def gbff_block_reader(it): yield SeqRecordFacade(Bio.SeqIO.read(io.StringIO(emit), "gb")) emit = None + if __name__ == "__main__": - logging_conf_fn = pkg_resources.resource_filename( - "uta", "etc/logging.conf") + logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") logging.config.fileConfig(logging_conf_fn) logging.getLogger().setLevel(logging.INFO) logger = logging.getLogger(__name__) @@ -146,17 +111,18 @@ if __name__ == "__main__": if srf.id.partition("_")[0] not in ["NM", "NR"]: skipped_ids.add(srf.id) continue - cds_se_i = srf.cds_se_i - ti = TxInfo(ac=srf.id, - origin=opts.origin, - hgnc=srf.hgnc, - cds_se_i=None if cds_se_i is None else "{},{}".format( - *cds_se_i), - exons_se_i=";".join( - ["{},{}".format(*ese) for ese in srf.exons_se_i]) - ) + ti = TxInfo( + ac=srf.id, + origin=opts.origin, + gene_id=srf.gene_id, + gene_symbol=srf.gene_symbol, + cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i), + exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i), + codon_table=srf.codon_table, + transl_except=TxInfo.serialize_transl_except(srf.transl_except), + ) tiw.write(ti) - genes.add(srf.hgnc) + genes.add(srf.gene_symbol) logger.info("{ng} genes in {fn} ({c})".format(ng=len(genes), fn=fn, c=prefixes)) total_genes ^= genes all_prefixes += prefixes diff --git a/sbin/ncbi-parse-gene2refseq b/sbin/ncbi-parse-gene2refseq index 5d11206..1f2f6cb 100755 --- a/sbin/ncbi-parse-gene2refseq +++ 
b/sbin/ncbi-parse-gene2refseq @@ -10,8 +10,6 @@ ftp://ftp.ncbi.nih.gov/gene/DATA/gene2refseq.gz import io import sys -from csv import DictReader - from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter from uta.formats.ncbitsv import NCBITSVReader @@ -38,16 +36,16 @@ if __name__ == "__main__": if rec["rna_nucleotide_accession.version"] == "-" and rec["protein_accession.version"] == "-": continue - ga = GeneAccessions(hgnc=rec["symbol"], + ga = GeneAccessions(gene_symbol=rec["symbol"], tx_ac=rec["rna_nucleotide_accession.version"], gene_id=rec["geneid"], pro_ac=rec["protein_accession.version"], origin="NCBI", ) - key = (ga.hgnc, ga.tx_ac, ga.gene_id, ga.pro_ac) + key = (ga.gene_symbol, ga.tx_ac, ga.gene_id, ga.pro_ac) if key in seen: - continue + continue seen.add(key) gaw.write(ga) diff --git a/sbin/ncbi-parse-geneinfo b/sbin/ncbi-parse-geneinfo index 73966b8..bc33aad 100755 --- a/sbin/ncbi-parse-geneinfo +++ b/sbin/ncbi-parse-geneinfo @@ -30,10 +30,9 @@ if __name__ == "__main__": giw = GeneInfoWriter(sys.stdout) for rec in gi_in: - if rec["symbol_from_nomenclature_authority"] == "-": - continue gi = GeneInfo( tax_id=rec["tax_id"], + gene_symbol=rec["symbol"], gene_id=rec["geneid"], hgnc=rec["symbol_from_nomenclature_authority"], maploc=rec["map_location"], diff --git a/sbin/ncbi-parse-gff b/sbin/ncbi-parse-gff index 4a34ed2..cdc546a 100755 --- a/sbin/ncbi-parse-gff +++ b/sbin/ncbi-parse-gff @@ -1,17 +1,13 @@ #!/usr/bin/env python -"""Write exonsets and txinfo files from NCBI GFF alignments, as obtained from -ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/alignments/ +"""Write exonsets files from NCBI GFF alignments, as obtained from +ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_*/ This service appeared in April 2015 and is due to update weekly. See uta.formats for a description of those file formats. In a nutshell, this means that you'll get data like this: -ncbi.txinfo.gz: -origin ac hgnc cds_se_i exons_se_i -NCBI RefSeq NM_053283.2 DCD 62,395 0,120;120,159;159,261;261,351;351,517 - ncbi.exonsets.gz: tx_ac alt_ac method strand exons_se_i NM_130786.3 NC_000019.9 splign -1 58864769,58864865;588646... 
@@ -35,12 +31,12 @@ from __future__ import division import argparse import collections import gzip +import importlib_resources import io import itertools import logging.config import os import pprint -import pkg_resources import re import sys @@ -50,6 +46,7 @@ import prettytable from uta.formats.exonset import ExonSet, ExonSetWriter from uta.formats.txinfo import TxInfo, TxInfoWriter, TxInfoReader from uta.formats.geneaccessions import GeneAccessionsReader +from uta.tools.file_utils import open_file origin = "NCBI" @@ -138,23 +135,22 @@ class TranscriptAlignment(object): return self.exon_alignments[0].pct_identity_ungap -def parse_args(argv): +def parse_args(): ap = argparse.ArgumentParser( description=__doc__, ) - ap.add_argument("in_fn") + ap.add_argument("GFF_files", nargs="+", + help="NCBI GFF files to process") ap.add_argument("--origin", "-o", default=origin) ap.add_argument("--prefix", "-p", default="ncbi-gff") - ap.add_argument("--geneacs", "-G") - ap.add_argument("--txinfo", "-T", required=False) ap.add_argument("--strict-coverage", "-C", type=float, default=95.0) ap.add_argument("--min-coverage", "-c", type=float, default=85.0) ap.add_argument("--strict-pct-identity-gap", "-I", type=float, default=95.0) ap.add_argument("--min-pct-identity-gap", "-i", type=float, default=85.0) - opts = ap.parse_args(argv) + opts = ap.parse_args() assert opts.strict_coverage > opts.min_coverage assert opts.strict_pct_identity_gap > opts.min_pct_identity_gap @@ -165,24 +161,28 @@ def parse_args(argv): def read_exon_alignments(fn): """read lines of NCBI's alignment gff file, fn, returning ExonAlignment records""" - # NC_000007.13 RefSeq cDNA_match 50344265 50344518 254 + . ID=aln58042;Target=NM_001220765.2 1 254 +;gap_count=0;identity=0.0691326;idty=1;num_ident=428;num_mismatch=0;pct_coverage=6.91326;pct_identity_gap=100;pct_identity_ungap=100;score=254 - # NC_000002.11 RefSeq cDNA_match 179671939 179672150 212 - . ID=ed951d46-194c-477a-a480-4bc64530c5ba;Target=NM_001267550.2 1 212 +;gap_count=0;identity=0.999991;idty=1;num_ident=109223;num_mismatch=1;pct_coverage=100;pct_identity_gap=99.9991;pct_identity_ungap=99.9991 + # NC_000022.10 RefSeq cDNA_match 20783512 20783627 116 - . ID=7b8c7a437b92bf9dee20d81acadadd8e;Target=NM_182895.5 1496 1611 +;consensus_splices=20;exon_identity=0.99769;for_remapping=2;gap_count=3;identity=0.99769;idty=1;matches=3455;num_ident=3455;num_mismatch=5;pct_coverage=99.9134;pct_coverage_hiqual=99.9134;pct_identity_gap=99.769;pct_identity_ungap=99.8555;product_coverage=1;rank=1;splices=20;weighted_identity=0.996898 + # NC_000022.10 RefSeq cDNA_match 20781685 20781837 153 - . ID=7b8c7a437b92bf9dee20d81acadadd8e;Target=NM_182895.5 1612 1764 +;consensus_splices=20;exon_identity=0.99769;for_remapping=2;gap_count=3;identity=0.99769;idty=1;matches=3455;num_ident=3455;num_mismatch=5;pct_coverage=99.9134;pct_coverage_hiqual=99.9134;pct_identity_gap=99.769;pct_identity_ungap=99.8555;product_coverage=1;rank=1;splices=20;weighted_identity=0.996898 + # NC_000022.10 RefSeq cDNA_match 20778874 20780569 1676.05 - . 
ID=7b8c7a437b92bf9dee20d81acadadd8e;Target=NM_182895.5 1765 3463 +;consensus_splices=20;exon_identity=0.99769;for_remapping=2;gap_count=3;identity=0.99769;idty=0.995291;matches=3455;num_ident=3455;num_mismatch=5;pct_coverage=99.9134;pct_coverage_hiqual=99.9134;pct_identity_gap=99.769;pct_identity_ungap=99.8555;product_coverage=1;rank=1;splices=20;weighted_identity=0.996898;Gap=M540 I1 M5 I1 M51 I1 M1100 line_re = re.compile( "(?P\S+)\s+(?P\S+)\s+(?P\S+)\s+" "(?P\d+)\s+(?P\d+)\s+(?P\S+)\s+" "(?P[-+])\s+\.\s+ID=(?P[^;]+);Target=(?P\S+)" "\s+(?P\d+)\s+(?P\d+).+?" - "pct_coverage=(?P[^;]+);" + "pct_coverage=(?P[^;]+);.+?" "pct_identity_gap=(?P[^;]+);" "pct_identity_ungap=(?P[^;]+)" ) - fh = io.open(fn, "rt") - for line in fh.readlines(): - if not line.startswith('#'): - try: - yield ExonAlignment(**line_re.match(line).groupdict()) - except (AttributeError, ValueError): - raise Exception("Failed at", line) + + with open_file(fn) as fh: + for line in fh: + if not line.startswith('#'): + try: + re_match = line_re.match(line) + if re_match and re_match["match_type"] == "cDNA_match": + yield ExonAlignment(**line_re.match(line).groupdict()) + except (AttributeError, ValueError): + raise Exception("Failed at", line) def read_transcript_alignments(fn): @@ -217,14 +217,8 @@ def group_transcript_alignments(transcript_alignments): for key, alns_i in itertools.groupby(transcript_alignments, key=_key)) -def convert_exon_data(opts, transcript_alignment): +def convert_exon_data(transcript_alignment): """return (TxInfo,ExonSet) tuple for given exon record data""" - ti = TxInfo(ac=transcript_alignment.tx_ac, - origin=opts.origin, - hgnc=None, - cds_se_i=None, - exons_se_i=transcript_alignment.tx_exons_se_i - ) es = ExonSet( tx_ac=transcript_alignment.tx_ac, alt_ac=transcript_alignment.ref_ac, @@ -232,67 +226,20 @@ def convert_exon_data(opts, transcript_alignment): strand=-1 if transcript_alignment.strand == "-" else 1, exons_se_i=transcript_alignment.ref_exons_se_i ) - return (ti, es) - + return es -if __name__ == "__main__": - logging_conf_fn = pkg_resources.resource_filename( - "uta", "etc/logging.conf") - logging.config.fileConfig(logging_conf_fn) - logging.getLogger().setLevel(logging.INFO) - logger = logging.getLogger(__name__) - - opts = parse_args(sys.argv[1:]) - - if opts.geneacs: - gar = GeneAccessionsReader(gzip.open(opts.geneacs, "rt")) - tx2gene = {ga.tx_ac: ga.hgnc for ga in gar} - logger.info( - "read {} gene-accession mappings from {}".format(len(tx2gene), opts.geneacs)) - else: - tx2gene = None - logger.info("No geneacs (-G) file provided; gene info will be empty.") - - if opts.txinfo: - tir = TxInfoReader(gzip.open(opts.txinfo, "rt")) - tx2ti = {ti.ac: ti for ti in tir} - logger.info( - "read {} CDS data from {}".format(len(tx2ti), opts.txinfo)) - # add any gene-accession mappings from txinfo file if they are not in geneacs file; log warning if they disagree - if tx2gene: - for ti_ac in tx2ti: - if not tx2gene.get(ti_ac): - tx2gene[ti_ac] = tx2ti[ti_ac].hgnc - if tx2gene[ti_ac] != tx2ti[ti_ac].hgnc: - logger.warning('HGNC symbol disagrees in txinfo ({tx2ti_hgnc}) and geneacs ({tx2gene_hgnc}) files for accession {ti_ac}'.format( - tx2ti_hgnc=tx2ti[ti_ac].hgnc, - tx2gene_hgnc=tx2gene[ti_ac].hgnc, - ti_ac=ti_ac - )) - else: - tx2ti = None - logger.info("No gbff txinfo provided (-T); CDS start,end will be undefined for all transcripts and transcript-genome exon structures will not be verified") - - es_fn = opts.prefix + "exonset.gz" - ti_fn = opts.prefix + "txinfo.gz" - - esw = 
ExonSetWriter(gzip.open(es_fn + ".tmp", "wt")) - tiw = TxInfoWriter(gzip.open(ti_fn + ".tmp", "wt")) - - ties = {} - ti_written = collections.defaultdict(lambda: False) - ac_not_in_gbff = set() - ac_exons_differ = set() +def write_exonsets_from_gff_file(gff_fn, logger, opts, esw): + """write exonsets from a single gff file""" ac_in_source = set() ac_failed = set() - bins = "nogbff esdiffer unique multiple minimum none".split() + bins = "unique multiple minimum none skipped".split() sets = collections.defaultdict(lambda: {k: list() for k in bins}) - transcript_alignments = read_transcript_alignments(opts.in_fn) + transcript_alignments = read_transcript_alignments(gff_fn) logger.info( - "read {} transcript alignments from {}".format(len(transcript_alignments), opts.in_fn)) + "read {} transcript alignments from {}".format(len(transcript_alignments), gff_fn)) for _, txalns in group_transcript_alignments(transcript_alignments): assert len(txalns) > 0 @@ -300,40 +247,13 @@ if __name__ == "__main__": ta0 = txalns[0] tx_ac, ref_ac = ta0.tx_ac, ta0.ref_ac skey = "{:.2s} {:.2s}".format(tx_ac, ref_ac) + if not tx_ac[:2] in ("NM", "NR") or not ref_ac[:2] == "NC": + sets[skey]["skipped"] += [txalns] + continue + bin = None - - # ############################################################ - # Optionally compare exon structure from gbff with input gff - # And get cds s,e from gbff (sigh) - if tx2ti is None: - cds_se_i = None - txalns_esm = txalns - else: - if tx_ac not in tx2ti: - logger.warning("{ta.tx_ac}~{ta.ref_ac}: no transcript info in {opts.txinfo}; skipping transcript".format( - ta=ta0, opts=opts)) - ac_not_in_gbff.add(tx_ac) - bin = "nogbff" - sets[skey][bin] += [txalns] - continue - - gbff_ti = tx2ti[tx_ac] - txalns_esm = [ta for ta in txalns if ta.tx_exons_se_i == gbff_ti.exons_se_i] - n_rm = len(txalns) - len(txalns_esm) - if n_rm > 0: - logger.warning("{ta.tx_ac}~{ta.ref_ac}: Removed {n_rm}/{n_tot} exon structures that differ from gbff definition".format( - n_rm=n_rm, n_tot=len(txalns), ta=ta, opts=opts)) - if len(txalns_esm) == 0: - logger.warning("{ta.tx_ac}~{ta.ref_ac}: All {n} exon structures differ from gbff definition; skipping alignment".format( - ta=ta, opts=opts, n=len(txalns))) - ac_exons_differ.add(tx_ac) - bin = "esdiffer" - sets[skey][bin] += [txalns] - continue - - cds_se_i = gbff_ti.cds_se_i # possibly None - - + txalns_load = [] + # ############################################################ # Filter alignments by coverage and pct_identity_gap # From Terence Murphy, NCBI: @@ -341,7 +261,7 @@ if __name__ == "__main__": # and RefSeqGene alignments that meet the filter: # 'pct_identity_gap >= 99.5 and pct_coverage >= 95'" txalns_strict = [txaln - for txaln in txalns_esm + for txaln in txalns if (txaln.pct_coverage > opts.strict_coverage and txaln.pct_identity_gap > opts.strict_pct_identity_gap)] @@ -354,90 +274,76 @@ if __name__ == "__main__": logger.warning("{ta.tx_ac}~{ta.ref_ac}: Multiple ({n}) strict alignments; cov/pig: {stats}".format( ta=txalns_strict[0], n=len(txalns_strict), opts=opts, - stats = "; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_strict), - )) + stats="; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_strict), + )) txalns_load = txalns_strict bin = "multiple" if len(txalns_strict) == 0: txalns_min = [txaln - for txaln in txalns_esm + for txaln in txalns if (txaln.pct_coverage > opts.min_coverage and txaln.pct_identity_gap > opts.min_pct_identity_gap)] if len(txalns_min) == 0: 
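+                    # no alignment for this tx_ac/ref_ac pair meets even the relaxed
+                    # min_coverage/min_pct_identity_gap thresholds; record it and emit nothing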
logger.warning("{ta.tx_ac}~{ta.ref_ac}: No usable alignments; cov/pig: {stats}".format( ta=txalns[0], - stats = "; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_esm), - )) + stats="; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns), + )) bin = "none" + ac_failed.add(skey) else: - logger.warning("{ta.tx_ac}~{ta.ref_ac}: Resorting to minimum criteria; loading {n} alignments; cov/pig: {stats}".format( - ta=txalns_min[0], n=len(txalns_min), - stats = "; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_min), + logger.warning( + "{ta.tx_ac}~{ta.ref_ac}: Resorting to minimum criteria; loading {n} alignments; cov/pig: {stats}".format( + ta=txalns_min[0], n=len(txalns_min), + stats="; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_min), )) bin = "minimum" txalns_load = txalns_min - sets[skey][bin] += [txalns_esm] + sets[skey][bin] += [txalns] for ta in txalns_load: - ti, es = convert_exon_data(opts, ta) - ti.cds_se_i = cds_se_i + es = convert_exon_data(ta) ac_in_source.add(tx_ac) - ti.hgnc = tx2gene.get(ti.ac, None) - - if not ti_written[ti.ac]: - # write a single txinfo line once; multiple may occur for multiple alignments of e.g., one NM to NC, NW, NT - tiw.write(ti) - ti_written[ti.ac] = True esw.write(es) # END HEINOUS LOOP - for fn in [ti_fn, es_fn]: - os.rename(fn + ".tmp", fn) - seen_but_failed = ac_failed - ac_in_source if seen_but_failed: logger.warning("{n_acv} acvs seen but failed criteria: {acs}".format( n_acv=len(seen_but_failed), acs=",".join(sorted(seen_but_failed)))) - if ac_not_in_gbff: - s_not_g_b = set(k.partition(".")[ - 0] for k in ac_in_source) - set(k.partition(".")[0] for k in tx2gene.keys()) - logger.warning("{n_acv} acvs ({n_ac} base acs) in source not in geneacs file: {acs}".format( - n_acv=len(ac_not_in_gbff), n_ac=len(s_not_g_b), opts=opts, acs=",".join(sorted(ac_not_in_gbff)))) - - if ac_exons_differ: - logger.warning("{n} accessions in gbff-derived txinfo have different exon coordinates: {acs}".format( - n=len(ac_exons_differ), opts=opts, acs=",".join(sorted(ac_exons_differ)))) - - pprint.pprint(opts) pt = prettytable.PrettyTable(field_names=["ac_pair"] - + bins - + "max_coverage max_pct_identity_gap nobgffs nogbff_noup esdiffers nones".split() + + bins + + "max_coverage max_pct_identity_gap nones".split() ) for ack in sorted(sets.keys()): n = 5 - nogbff_acs = sorted(set(ta.tx_ac for ta in itertools.chain.from_iterable(sets[ack]["nogbff"])))[:n] - esdiffer_acs = sorted(set(ta.tx_ac for ta in itertools.chain.from_iterable(sets[ack]["esdiffer"])))[:n] nones = list(itertools.chain.from_iterable(sets[ack]["none"])) nones_acs = sorted(set(ta.tx_ac for ta in nones))[:n] max_pct_identity_gap = "{:.2f}".format(max(ta.pct_identity_gap for ta in nones)) if nones else "n/a" max_pct_coverage = "{:.2f}".format(max(ta.pct_coverage for ta in nones)) if nones else "n/a" - nogbff_noup = sorted( - set(ta.tx_ac.split('.')[0] for ta in itertools.chain.from_iterable(sets[ack]["nogbff"])) - - set(ta.tx_ac.split('.')[0] for ta in itertools.chain.from_iterable(sets[ack]["unique"])) - - set(ta.tx_ac.split('.')[0] for ta in itertools.chain.from_iterable(sets[ack]["multiple"])) - ) - pt.add_row([ack] + [len(sets[ack][bk]) for bk in bins] + [max_pct_coverage, max_pct_identity_gap, - " ".join(nogbff_acs), - str(len(nogbff_noup)) + ": " + " ".join(nogbff_noup[:n]), - " ".join(esdiffer_acs), - " ".join(nones_acs) ]) - print(pt) + " ".join(nones_acs)]) + 
logger.info("summary in table below...\n" + str(pt)) + + + +if __name__ == "__main__": + logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") + logging.config.fileConfig(logging_conf_fn) + logging.getLogger().setLevel(logging.INFO) + logger = logging.getLogger(__name__) + + opts = parse_args() + + esw = ExonSetWriter(sys.stdout) + + for gff_fn in opts.GFF_files: + logger.info("processing {}".format(gff_fn)) + write_exonsets_from_gff_file(gff_fn, logger, opts, esw) diff --git a/sbin/ncbi_parse_genomic_gff.py b/sbin/ncbi_parse_genomic_gff.py new file mode 100755 index 0000000..0035d31 --- /dev/null +++ b/sbin/ncbi_parse_genomic_gff.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python + +"""Write exonsets from NCBI GFF alignments, as obtained from +ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions +This service appeared in April 2015 and is due to update weekly. + +See uta.formats for a description of those file formats. + +In a nutshell, this means that you'll get data like this: + +ncbi-gff.exonsets.gz: +tx_ac alt_ac method strand exons_se_i +NM_130786.3 NC_000019.9 splign -1 58864769,58864865;588646... +NM_130786.3 NC_018930.2 splign -1 58858699,58858795;588585... +NM_130786.3 AC_000151.1 splign -1 55173924,55174020;551738... +NM_138933.2 NC_000010.10 splign -1 52645340,52645435;52... + +UTA requires that the exon structure of a transcript accession as +defined on its own sequence is unique. Although this is mostly true, +there are instances where NCBI reports different exon structures for a +single transcript. For example, NM_001300954.1 aligns with 11 exons on +NC_000011.9 and 5 exons on NW_003871081.1, and the differences are NOT +due merely to concatenation of adjacent spans. +""" + +import argparse +import importlib_resources +import logging.config +import sys +from collections import defaultdict +from dataclasses import dataclass +from typing import List, Optional + +from uta.formats.exonset import ExonSet, ExonSetWriter +from uta.tools.file_utils import open_file + + +@dataclass +class GFFRecord: + seqid: str + start: int + end: int + strand: str + exon_number: int + parent_id: str + transcript_id: str + + @property + def key(self) -> str: + return f"{self.transcript_id}:{self.seqid}" + + +def _sort_exons(exons: List[GFFRecord]) -> List[GFFRecord]: + return sorted(exons, key=lambda e: e.exon_number) + + +def parse_gff_record(line: str) -> Optional[GFFRecord]: + """Parses a single line from a GFF file and returns a GFFRecord if record is an exon aligned to an NC_ chromosome and has a transcript id starting with NM_ or NR_.""" + # NC_000001.10 BestRefSeq exon 11874 12227 . + . ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD/H-box helicase 11 like 1 (pseudogene);pseudo=true;transcript_id=NR_046018.2 + # NC_000001.10 BestRefSeq exon 12613 12721 . + . ID=exon-NR_046018.2-2;Parent=rna-NR_046018.2;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD/H-box helicase 11 like 1 (pseudogene);pseudo=true;transcript_id=NR_046018.2 + # NC_000001.10 BestRefSeq exon 13221 14409 . + . 
ID=exon-NR_046018.2-3;Parent=rna-NR_046018.2;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD/H-box helicase 11 like 1 (pseudogene);pseudo=true;transcript_id=NR_046018.2 + + fields = line.strip().split("\t") + if len(fields) != 9: + raise ValueError(f"Expected 9 tab-separated fields, got {len(fields)}") + + seqid, source, feature, start, end, score, strand, phase, attributes_str = fields + + if feature != "exon": + return + + attributes = {} + for attr_str in attributes_str.split(";"): + if "=" in attr_str: + key, value = attr_str.split("=") + attributes[key.lower()] = value + + parent_id = attributes.get("parent") + transcript_id = attributes.get("transcript_id") + if ( + not transcript_id + or (not transcript_id.startswith("NM_") and not transcript_id.startswith("NR_")) + or not parent_id + ): + return + try: + exon_number = _get_exon_number_from_id(alignment_id=attributes.get("id")) + except (ValueError, IndexError): + raise ValueError(f'Failed to parse exon number from {attributes.get("id")}') + + return GFFRecord( + seqid=seqid, + start=int(start), + end=int(end), + strand=strand, + exon_number=exon_number, + parent_id=parent_id, + transcript_id=transcript_id, + ) + + +def _get_exon_number_from_id(alignment_id: str) -> int: + """ + Pulls the exon number from the alignment id. Expects the id to be in the format + exon-- + """ + return int(alignment_id.split("-")[-1]) + + +def parse_gff_files(file_paths: List[str]) -> dict[str, List[GFFRecord]]: + tx_data = defaultdict(list) + for file_path in file_paths: + with open_file(file_path) as f: + for line in f: + if line.startswith("#"): + continue + try: + record = parse_gff_record(line) + except ValueError as e: + raise Exception(f"Failed at line :{line} with error: {e}") + if record: + tx_data[record.key].append(record) + return {k: _sort_exons(v) for k, v in tx_data.items()} + + +def get_zero_based_exon_ranges(transcript_exons: List[GFFRecord]) -> str: + """Convert exon ranges to 0-based half-open format""" + formatted_exons = [] + for ex in transcript_exons: + formatted_exons.append(",".join(map(str, (ex.start - 1, ex.end)))) + return ";".join(formatted_exons) + + +if __name__ == "__main__": + logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") + logging.config.fileConfig(logging_conf_fn) + logging.getLogger().setLevel(logging.INFO) + logger = logging.getLogger(__name__) + + parser = argparse.ArgumentParser(description="Parse GFF file.") + parser.add_argument("gff_files", nargs="+", type=str, help="Path to GFF file(s)") + args = parser.parse_args() + + gff_files = args.gff_files + esw = ExonSetWriter(sys.stdout) + + transcript_alignments = parse_gff_files(gff_files) + logger.info( + f"read {len(transcript_alignments)} transcript alignments from file(s): {', '.join(gff_files)}" + ) + + for transcript_exons in transcript_alignments.values(): + exons_se = get_zero_based_exon_ranges(transcript_exons) + e = transcript_exons[0] + es = ExonSet( + tx_ac=e.transcript_id, + alt_ac=e.seqid, + method="splign", + strand=-1 if e.strand == "-" else 1, + exons_se_i=exons_se, + ) + esw.write(es) diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py new file mode 100755 index 0000000..b414f2e --- /dev/null +++ b/sbin/ncbi_process_mito.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python + +""" +Download mito fasta and gbff file. 
Use BioPython to parse the features in the Mitochondrial genbank file to get +the attributes of a region of the genome that correspond to genes along with their attributes. Output gene/tx/alignment +details to intermediate file needed to update UTA database and SeqRepo. + + FEATURES Location/Qualifiers + source 1..16569 + /organism="Homo sapiens" + /organelle="mitochondrion" + /mol_type="genomic DNA" + /isolation_source="caucasian" + /db_xref="taxon:9606" + /tissue_type="placenta" + /country="United Kingdom: Great Britain" + /note="this is the rCRS" + D-loop complement(join(16024..16569,1..576)) + gene 577..647 + /gene="TRNF" + /nomenclature="Official Symbol: MT-TF | Name: + mitochondrially encoded tRNA phenylalanine | Provided by: + HGNC:HGNC:7481" + /db_xref="GeneID:4558" + /db_xref="HGNC:HGNC:7481" + /db_xref="MIM:590070" + tRNA 577..647 + /gene="TRNF" + /product="tRNA-Phe" + /note="NAR: 1455" + /anticodon=(pos:611..613,aa:Phe,seq:gaa) + /codon_recognized="UUC" + /db_xref="GeneID:4558" + /db_xref="HGNC:HGNC:7481" + /db_xref="MIM:590070" + gene 648..1601 + /gene="RNR1" + /gene_synonym="MTRNR1" + /nomenclature="Official Symbol: MT-RNR1 | Name: + mitochondrially encoded 12S RNA | Provided by: + HGNC:HGNC:7470" + /db_xref="GeneID:4549" + /db_xref="HGNC:HGNC:7470" + /db_xref="MIM:561000" + rRNA 648..1601 + /gene="RNR1" + /gene_synonym="MTRNR1" + /product="s-rRNA" + /note="12S rRNA; 12S ribosomal RNA" + /db_xref="GeneID:4549" + /db_xref="HGNC:HGNC:7470" + /db_xref="MIM:561000" + ... +""" +import argparse +import dataclasses +import gzip +import importlib_resources +import logging +import logging.config +from typing import Dict, Iterable, List, Optional + +from Bio.Seq import Seq +import Bio.SeqIO +from Bio.SeqFeature import SeqFeature +from Bio.SeqRecord import SeqRecord +from bioutils.digests import seq_md5 +from more_itertools import one + +from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter +from uta.formats.geneinfo import GeneInfo, GeneInfoWriter +from uta.formats.seqinfo import SeqInfo, SeqInfoWriter +from uta.formats.txinfo import TxInfo, TxInfoWriter +from uta.formats.exonset import ExonSet, ExonSetWriter +from uta.tools.eutils import download_from_eutils, NcbiFileFormatEnum + + +@dataclasses.dataclass +class MitoGeneData: + gene_id: int + gene_symbol: str + name: str + synonym: str + xrefs: List[str] + type: str + tx_ac: str + tx_seq: str + tx_start: int + tx_end: int + alt_ac: str + alt_start: int + alt_end: int + strand: int + origin: str = "NCBI" + alignment_method: str = "splign" + transl_table: Optional[str] = None + transl_except: Optional[List[str]] = None + pro_ac: Optional[str] = None + pro_seq: Optional[str] = None + + def exons_se_i(self) -> str: + return f"{self.tx_start},{self.tx_end}" + + def cds_se_i(self) -> str: + return self.exons_se_i() if self.pro_ac else "" + + def alt_exons_se_i(self) -> str: + return f"{self.alt_start},{self.alt_end}" + + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("accession", type=str) + parser.add_argument("--output-dir", "-o", default=".", type=str) + return parser.parse_args() + + +def download_mito_files(output_dir: str, accession: str) -> Dict[str, str]: + logger.info(f"downloading files for {accession}") + mt_gb_filepath = 
f"{output_dir}/{accession}.gbff" + mt_fa_filepath = f"{output_dir}/{accession}.fna" + + logger.info(f"downloading {NcbiFileFormatEnum.GENBANK} file to {mt_gb_filepath}") + download_from_eutils(accession, NcbiFileFormatEnum.GENBANK, mt_gb_filepath) + + logger.info(f"downloading {NcbiFileFormatEnum.FASTA} file to {mt_fa_filepath}") + download_from_eutils(accession, NcbiFileFormatEnum.FASTA, mt_fa_filepath) + + return {"gbff": mt_gb_filepath, "fna": mt_fa_filepath} + + +def parse_db_xrefs(gb_feature: SeqFeature) -> Dict[str, str]: + """ + Example: + Key: db_xref + Value: ['GeneID:4558', 'HGNC:HGNC:7481', 'MIM:590070'] + """ + return { + x.partition(":")[0].strip(): x.partition(":")[2].strip() + for x in gb_feature.qualifiers.get("db_xref", []) + } + + +def parse_nomenclature_value(gb_feature: SeqFeature) -> Dict[str, str]: + """ + Example: + Key: nomenclature + Value: ['Official Symbol: MT-TF | Name: mitochondrially encoded tRNA phenylalanine | Provided by: HGNC:HGNC:7481'] + """ + nomenclature_key = "nomenclature" + nomenclature_results: Dict[str, str] = {} + if nomenclature_key in gb_feature.qualifiers: + nomenclature_list = list( + map( + lambda x: x.strip(), + one(gb_feature.qualifiers[nomenclature_key]).split("|"), + ) + ) + nomenclature_results = { + x.partition(":")[0].strip(): x.partition(":")[2].strip() + for x in nomenclature_list + } + + return nomenclature_results + + +def get_mito_genes(gbff_filepath: str) -> Iterable[MitoGeneData]: + logger.info(f"processing NCBI GBFF file from {gbff_filepath}") + with open(gbff_filepath) as fh: + # Bio.SeqIO.parse(fh, "gb") returns an empty iterator for .fna files and does not fail + for record in Bio.SeqIO.parse(fh, "gb"): + for feature in record.features: + xrefs = parse_db_xrefs(feature) + + feature_start, feature_end = ( + feature.location.start, + feature.location.end, + ) + + # dependent on feature type, process data and output if appropriate + if feature.type == "gene": + # assert subsequent features represent the same location + assert feature_start == feature.location.start + assert feature_end == feature.location.end + # for gene feature do not yield anything, just set gene level attributes + gene_id = int(xrefs["GeneID"]) + nomenclature = parse_nomenclature_value(feature) + hgnc = nomenclature["Official Symbol"] + name = nomenclature["Name"] + + elif feature.type in ("tRNA", "rRNA", "CDS"): + # assert subsequent features represent the same location and gene + assert int(xrefs["GeneID"]) == gene_id + assert feature_start == feature.location.start + assert feature_end == feature.location.end + + # retrieve sequence, and reverse compliment if on reverse strand + ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}" + feature_seq = record.seq[feature_start:feature_end] + gene_synonym = feature.qualifiers.get("gene_synonym", "") + type = feature.type + if feature.location.strand == -1: + feature_seq = feature_seq.reverse_complement() + + if feature.type == "CDS": + # override defaults for CDS features + type = "protein-coding" + pro_ac = one(feature.qualifiers["protein_id"]) + pro_seq = str(one(feature.qualifiers["translation"])) + transl_table = one(feature.qualifiers["transl_table"]) + transl_except = feature.qualifiers.get("transl_except") + else: + pro_ac = None + pro_seq = None + transl_table = None + transl_except = None + + # yield gene data + yield MitoGeneData( + gene_id=gene_id, + gene_symbol=hgnc, + name=name, + synonym=gene_synonym, + xrefs=[f"{k}:{v}" for k, v in xrefs.items()], + type=type, + tx_ac=ac, + 
tx_seq=str(feature_seq), + tx_start=0, + tx_end=feature.location.end - feature.location.start, + alt_ac=record.id, + alt_start=feature_start, + alt_end=feature_end, + strand=feature.location.strand, + transl_table=transl_table, + transl_except=transl_except, + pro_ac=pro_ac, + pro_seq=pro_seq, + ) + + +def main(ncbi_accession: str, output_dir: str) -> None: + # get input files + input_files = download_mito_files(output_dir=output_dir, accession=ncbi_accession) + + # extract Mitochondrial gene information + mito_genes = [mg for mf in input_files.values() for mg in get_mito_genes(mf)] + logger.info(f"found {len(mito_genes)} genes from parsing {input_files['gbff']}") + + # write gene information + with gzip.open(f"{output_dir}/geneinfo.gz", "wt") as o_file: + giw = GeneInfoWriter(o_file) + for mg in mito_genes: + giw.write( + GeneInfo( + mg.gene_id, + mg.gene_symbol, + 9606, + mg.gene_symbol, + "", + mg.synonym, + mg.type, + mg.name, + mg.name, + mg.xrefs, + ) + ) + + # write gene accession associations + with gzip.open(f"{output_dir}/assocacs.gz", "wt") as o_file: + gaw = GeneAccessionsWriter(o_file) + for mg in mito_genes: + if mg.pro_ac is not None: + gaw.write( + GeneAccessions( + mg.gene_symbol, mg.tx_ac, mg.gene_id, mg.pro_ac, mg.origin + ) + ) + + # write sequence information + with gzip.open(f"{output_dir}/seqinfo.gz", "wt") as o_file: + siw = SeqInfoWriter(o_file) + for mg in mito_genes: + siw.write( + SeqInfo( + seq_md5(mg.tx_seq), + mg.origin, + mg.tx_ac, + mg.name, + len(mg.tx_seq), + None, + ) + ) + if mg.pro_ac is not None: + siw.write( + SeqInfo( + seq_md5(mg.pro_seq), + mg.origin, + mg.pro_ac, + mg.name, + len(mg.pro_seq), + None, + ) + ) + + # write out transcript sequence fasta files. + with gzip.open(f"{output_dir}/{ncbi_accession}.rna.fna.gz", "wt") as o_file: + for mg in mito_genes: + record = SeqRecord( + Seq(mg.tx_seq), + id=mg.tx_ac, + description=mg.name, + ) + o_file.write(record.format("fasta")) + + # write out protein sequence fasta files. 
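+    # Note: only CDS features set pro_ac/pro_seq in get_mito_genes(), so the protein FASTA
+    # below contains records for protein-coding genes only; tRNA/rRNA entries are skipped.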
+ with gzip.open(f"{output_dir}/{ncbi_accession}.protein.faa.gz", "wt") as o_file: + for mg in mito_genes: + if mg.pro_ac is not None: + record = SeqRecord( + Seq(mg.pro_seq), + id=mg.pro_ac, + description=mg.name, + ) + o_file.write(record.format("fasta")) + + # write transcript information + with gzip.open(f"{output_dir}/txinfo.gz", "wt") as o_file: + tiw = TxInfoWriter(o_file) + for mg in mito_genes: + tiw.write( + TxInfo( + mg.origin, + mg.tx_ac, + mg.gene_id, + mg.gene_symbol, + mg.cds_se_i(), + mg.exons_se_i(), + mg.transl_table, + TxInfo.serialize_transl_except(mg.transl_except), + ) + ) + + # write exonset + with gzip.open(f"{output_dir}/exonsets.gz", "wt") as o_file: + esw = ExonSetWriter(o_file) + for mg in mito_genes: + esw.write( + ExonSet( + mg.tx_ac, + mg.alt_ac, + mg.alignment_method, + mg.strand, + mg.alt_exons_se_i(), + ) + ) + + +if __name__ == "__main__": + args = parse_args() + main(args.accession, args.output_dir) diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load new file mode 100755 index 0000000..95b662f --- /dev/null +++ b/sbin/seqrepo-load @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +sequence_dir=$1 +log_dir=$2 + +if [ -z "$sequence_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/seqrepo-load ' + exit 1 +fi + +# find all fasta files in the working directory +mapfile -t FASTA_FILES < <(find "$sequence_dir" -type f -name "*.f[an]a*") + +# Load SeqRepo with new sequences +seqrepo --root-directory "/biocommons/dl.biocommons.org/seqrepo" \ + load -n NCBI --instance-name "master" \ + "${FASTA_FILES[@]}" 2>&1 | \ + tee "$log_dir/seqrepo-load.log" diff --git a/sbin/seqrepo-pull b/sbin/seqrepo-pull new file mode 100755 index 0000000..894dffa --- /dev/null +++ b/sbin/seqrepo-pull @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +SEQREPO_DIR="/biocommons/dl.biocommons.org/seqrepo" + +# pull the latest seqrepo version from biocommons +latest_version=$(seqrepo list-remote-instances | tail -n 1 | xargs) +cd "$SEQREPO_DIR" +rsync -rtHP --no-motd dl.biocommons.org::seqrepo/"$latest_version" . 
+ +# setup seqrepo build directory +mkdir -p master/sequences +cd "$latest_version" +cp -av aliases.sqlite3 "$SEQREPO_DIR"/master/ +chmod u+w "$SEQREPO_DIR"/master/aliases.sqlite3 +cd sequences +cp -av db.sqlite3 "$SEQREPO_DIR"/master/sequences/ +chmod u+w "$SEQREPO_DIR"/master/sequences/db.sqlite3 +for d in 2???; do + cp -alv $d "$SEQREPO_DIR"/master/sequences/ +done diff --git a/sbin/update-ncbi b/sbin/update-ncbi index 33211c7..41271d3 100755 --- a/sbin/update-ncbi +++ b/sbin/update-ncbi @@ -88,7 +88,7 @@ if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger() - url = "postgresql://uta_admin@localhost/uta_dev" + url = "postgresql://uta_admin@localhost/uta" uta, ncbi = sys.argv[1:3] conn = psycopg2.connect(url) diff --git a/sbin/uta-diff b/sbin/uta-diff index 489562d..d89798d 100755 --- a/sbin/uta-diff +++ b/sbin/uta-diff @@ -14,7 +14,8 @@ cmp_cols = collections.defaultdict(lambda: ['*']) cmp_cols.update({ "associated_accessions": "tx_ac pro_ac origin".split(), "exon_aln": "exon_aln_id tx_exon_id alt_exon_id cigar added".split(), - "gene": "hgnc".split(), + "gene": "gene_id".split(), + "seq_anno": "seq_anno_id seq_id origin_id ac added".split(), "transcript": "ac".split(), }) @@ -41,10 +42,10 @@ def cmp1(con, tbl, s1, s2): if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) + logging.basicConfig(level=logging.INFO) logger = logging.getLogger() - url = "postgresql://uta_admin@localhost/uta_dev" + url = "postgresql://uta_admin@localhost/uta" tables = ["associated_accessions", "exon", "exon_aln", "exon_set", "gene", "meta", "origin", "seq", "seq_anno", "transcript",] @@ -66,7 +67,7 @@ if __name__ == "__main__": print("""UTA comparison: url={url}, s1={s1}, s2={s2} t: time taken (seconds) n1, n2: total number of rows in schemas s1 and s2 -nu1, nu2, c: number of rows unique to s1, unique to s2, and common to both +nu1, nu2, nc: number of rows unique to s1, unique to s2, and common to both cols: cols used for comparison """.format(url=url, s1=s1, s2=s2)) print(pt) diff --git a/sbin/uta-extract b/sbin/uta-extract new file mode 100755 index 0000000..9dbf716 --- /dev/null +++ b/sbin/uta-extract @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# Extract data from NCBI files into intermediate files. + +set -euxo pipefail + +ncbi_dir=$1 +working_dir=$2 +log_dir=$3 + +if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/uta-extract ' + exit 1 +fi + +# genes +sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ + gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$log_dir/ncbi-parse-geneinfo.log" + +# transcript protein associations +sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2refseq.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \ + tee "$log_dir/ncbi-fetch-assoc-acs.log" + +# parse transcript info from GBFF input files +mapfile -t GBFF_FILES < <(find "$ncbi_dir/refseq" -type f -name "human.*.rna.gbff.gz") +sbin/ncbi-parse-gbff "${GBFF_FILES[@]}" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \ + tee "$log_dir/ncbi-parse-gbff.log" + +# parse alignments from GFF input files +# Due to NCBI's handling of transcripts with "frameshifting insertions and deletions with micro-introns" we +# need to parse out the cDNA_match alignment and use them preferentially over exons from genome GFF files. +# The cDNA_match records include the indels and do not have micro-introns. 
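+# Because coalesce_exonsets.py (run below) keeps the first exonset it sees for each tx_ac/alt_ac
+# pair, the cDNA_match exonsets are passed first and the exon-block exonsets from
+# ncbi_parse_genomic_gff.py only fill in transcript/genome pairs with no cDNA_match alignment.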
+mapfile -t GFF_FILES < <(find "$ncbi_dir/genomes" -type f -name "GCF_*_genomic.gff.gz")
+sbin/ncbi-parse-gff "${GFF_FILES[@]}" | gzip -c > "$working_dir/cdna_match.exonsets.gz" 2>&1 | \
+    tee "$log_dir/ncbi_parse_gff.log"
+
+# extract exon blocks from GFF files
+sbin/ncbi_parse_genomic_gff.py "${GFF_FILES[@]}" | gzip -c > "$working_dir/exon_block.exonsets.gz" 2>&1 | \
+    tee "$log_dir/ncbi-parse-genomic-gff.log"
+
+# coalesce exonsets
+sbin/coalesce_exonsets.py "$working_dir/cdna_match.exonsets.gz" "$working_dir/exon_block.exonsets.gz" | \
+    gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | tee "$log_dir/coalesce_exonsets.log"
+
+# filter transcripts
+sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
+    --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
+    tee "$log_dir/filter_exonset_transcripts.log"
+
+# move fasta files into same dir
+find "$ncbi_dir" -type f -name "*.f[an]a.gz" -print0 | xargs -i --null cp {} "$working_dir/"
diff --git a/sbin/uta-load b/sbin/uta-load
new file mode 100755
index 0000000..d419007
--- /dev/null
+++ b/sbin/uta-load
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+# This script updates UTA and SeqRepo using NCBI files.
+# source_uta_v is the UTA version before the update.
+# ncbi_dir is where the script looks for NCBI data files.
+# working_dir stores intermediate data files and the final database dump.
+# log_dir stores log files.
+
+# Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo.
+
+set -euxo pipefail
+
+source_uta_v=$1
+dest_uta_v=$2
+ncbi_dir=$3
+working_dir=$4
+log_dir=$5
+
+if [ -z "$source_uta_v" ] || [ -z "$dest_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
+then
+    echo 'Usage: uta-load <source_uta_v> <dest_uta_v> <ncbi_dir> <working_dir> <log_dir>'
+    exit 1
+fi
+
+# set local variables and create working directories
+loading_uta_v="uta"
+mkdir -p "$log_dir"
+
+## Drop loading schema, and recreate
+etc/scripts/delete-schema.sh "$loading_uta_v"
+etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
+
+## apply any outstanding alembic migrations and update schema version if necessary
+alembic -c etc/alembic.ini upgrade head
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf update-meta-data
+
+# generate seqinfo files from exonsets (this step requires seqrepo)
+sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \
+    tee "$log_dir/exonset-to-seqinfo.log"
+
+# Filter out columns from assocacs file.
+sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \
+    tee "$log_dir/assoc-acs-merge.log"
+
+# Load genes into gene table.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
+    tee "$log_dir/load-geneinfo.log"
+
+# Load accessions into associated_accessions table.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \
+    tee "$log_dir/load-assoc-ac.log"
+
+# Load transcript info into transcript and exon_set tables.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$working_dir/txinfo.gz" 2>&1 | \
+    tee "$log_dir/load-txinfo.log"
+
+# Load exon sets into the exon_set and exon tables.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$working_dir/exonsets.gz" 2>&1 | \ + tee "$log_dir/load-exonsets.log" + +# Load seqinfo into the seq and seqanno tables. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-seqinfo "$working_dir/seqinfo.gz" 2>&1 | \ + tee "$log_dir/load-seqinfo.log" + +# Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | \ + tee "$log_dir/align-exons.log" + +### run diff +sbin/uta-diff "$source_uta_v" "$loading_uta_v" + +## Rename schema to destination schema name and export to dump file +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $dest_uta_v CASCADE;" +psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO $dest_uta_v"; +pg_dump -h localhost -U uta_admin -d uta -n "$dest_uta_v" | \ + gzip -c > "$working_dir/$dest_uta_v.pgd.gz" diff --git a/src/alembic/README b/src/alembic/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/src/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/src/alembic/env.py b/src/alembic/env.py new file mode 100644 index 0000000..1e8e830 --- /dev/null +++ b/src/alembic/env.py @@ -0,0 +1,92 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +from uta.models import Base +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def include_name(name, type_, parent_names) -> bool: + if type_ == "schema": + return name in ["uta"] + else: + return True + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. 
+ + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, + target_metadata=target_metadata, + version_table_schema=target_metadata.schema, + include_schemas=True, + include_name=include_name, + ) + + with context.begin_transaction(): + context.execute(f'create schema if not exists {target_metadata.schema};') + context.execute(f'set search_path to {target_metadata.schema}') + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/src/alembic/script.py.mako b/src/alembic/script.py.mako new file mode 100644 index 0000000..fbc4b07 --- /dev/null +++ b/src/alembic/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/src/alembic/versions/14eed54ff90d_create_translation_exception_table.py b/src/alembic/versions/14eed54ff90d_create_translation_exception_table.py new file mode 100644 index 0000000..b55a336 --- /dev/null +++ b/src/alembic/versions/14eed54ff90d_create_translation_exception_table.py @@ -0,0 +1,37 @@ +"""create translation_exception table + +Revision ID: 14eed54ff90d +Revises: f85dd97bd9f5 +Create Date: 2024-04-25 23:57:12.455316 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '14eed54ff90d' +down_revision: Union[str, None] = 'f85dd97bd9f5' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + 'translation_exception', + sa.Column('translation_exception_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('tx_ac', sa.Text(), nullable=False), + sa.Column('start_position', sa.Integer(), nullable=False), + sa.Column('end_position', sa.Integer(), nullable=False), + sa.Column('amino_acid', sa.Text(), nullable=False), + sa.CheckConstraint('start_position <= end_position', name='start_less_than_or_equal_to_end'), + sa.ForeignKeyConstraint(['tx_ac'], ['uta.transcript.ac'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('translation_exception_id'), + schema='uta', + ) + + +def downgrade() -> None: + op.drop_table('translation_exception', schema='uta') diff --git a/src/alembic/versions/19561fe444c8_create_materialized_view_for_tx_exon_.py b/src/alembic/versions/19561fe444c8_create_materialized_view_for_tx_exon_.py new file mode 100644 index 0000000..ae21036 --- /dev/null +++ b/src/alembic/versions/19561fe444c8_create_materialized_view_for_tx_exon_.py @@ -0,0 +1,32 @@ +"""create materialized view for tx_exon_aln_v + +Revision ID: 19561fe444c8 +Revises: f885cb84efce +Create Date: 2024-05-07 21:59:09.078549 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '19561fe444c8' +down_revision: Union[str, None] = 'f885cb84efce' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_aln_mv CASCADE;") + op.execute(""" + CREATE MATERIALIZED VIEW tx_exon_aln_mv AS SELECT * FROM tx_exon_aln_v WITH NO DATA; + CREATE INDEX tx_exon_aln_mv_tx_alt_ac_ix ON tx_exon_aln_mv(tx_ac, alt_ac, alt_aln_method); + GRANT SELECT ON tx_exon_set_summary_mv TO public; + REFRESH MATERIALIZED VIEW tx_exon_aln_mv; + """) + + +def downgrade() -> None: + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_aln_mv CASCADE;") diff --git a/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py b/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py new file mode 100644 index 0000000..06156bb --- /dev/null +++ b/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py @@ -0,0 +1,44 @@ +"""add gene_id to gene and transcript + +Revision ID: 595a586e6de7 +Revises: a697b584f699 +Create Date: 2024-04-10 19:47:43.685672 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '595a586e6de7' +down_revision: Union[str, None] = 'a697b584f699' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column('gene', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta') + op.add_column('gene', sa.Column('type', sa.Text(), nullable=True), schema='uta') + op.add_column('gene', sa.Column('xrefs', sa.Text(), nullable=True), schema='uta') + op.add_column('transcript', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta') + # ### end Alembic commands ### + + # ### commands to drop existing primary key on gene table ### + op.drop_constraint('gene_pkey', 'gene', schema='uta') + # ### end of commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('transcript', 'gene_id', schema='uta') + op.drop_column('gene', 'xrefs', schema='uta') + op.drop_column('gene', 'type', schema='uta') + op.drop_column('gene', 'gene_id', schema='uta') + # ### end Alembic commands ### + + # ### commands to add primary key on gene table ### + op.create_primary_key('gene_pkey', 'gene', ['hgnc'], schema='uta') + # ### end of commands ### diff --git a/src/alembic/versions/77076df4224c_add_tx_hgnc_index.py b/src/alembic/versions/77076df4224c_add_tx_hgnc_index.py new file mode 100644 index 0000000..0a684a2 --- /dev/null +++ b/src/alembic/versions/77076df4224c_add_tx_hgnc_index.py @@ -0,0 +1,30 @@ +"""hgnc test + +Revision ID: 77076df4224c +Revises: 19561fe444c8 +Create Date: 2024-08-26 17:08:13.160259 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '77076df4224c' +down_revision: Union[str, None] = '19561fe444c8' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_index(op.f('ix_uta_transcript_hgnc'), 'transcript', ['hgnc'], unique=False, schema='uta') + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_uta_transcript_hgnc'), table_name='transcript', schema='uta') + # ### end Alembic commands ### diff --git a/src/alembic/versions/a697b584f699_add_codon_table_to_transcript.py b/src/alembic/versions/a697b584f699_add_codon_table_to_transcript.py new file mode 100644 index 0000000..d440495 --- /dev/null +++ b/src/alembic/versions/a697b584f699_add_codon_table_to_transcript.py @@ -0,0 +1,33 @@ +"""add codon_table to Transcript + +Revision ID: a697b584f699 +Revises: cc51f50ae896 +Create Date: 2024-04-08 17:27:41.570024 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'a697b584f699' +down_revision: Union[str, None] = 'cc51f50ae896' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('transcript', sa.Column('codon_table', sa.Text(), nullable=True), schema='uta') + # ### end Alembic commands ### + # ### population of codon_table column with data ### + op.execute("UPDATE transcript SET codon_table = '1' WHERE cds_start_i NOTNULL;") + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('transcript', 'codon_table', schema='uta') + # ### end Alembic commands ### diff --git a/src/alembic/versions/cc51f50ae896_add_sqlalchemy_model_for_assocacs.py b/src/alembic/versions/cc51f50ae896_add_sqlalchemy_model_for_assocacs.py new file mode 100644 index 0000000..c8ee756 --- /dev/null +++ b/src/alembic/versions/cc51f50ae896_add_sqlalchemy_model_for_assocacs.py @@ -0,0 +1,44 @@ +"""add sqlalchemy model for assocacs + +Revision ID: cc51f50ae896 +Revises: edadb97f6502 +Create Date: 2024-04-05 00:33:40.105587 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'cc51f50ae896' +down_revision: Union[str, None] = 'edadb97f6502' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('associated_accessions', 'tx_ac', + existing_type=sa.TEXT(), + nullable=False, + schema='uta') + op.alter_column('associated_accessions', 'pro_ac', + existing_type=sa.TEXT(), + nullable=False, + schema='uta') + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('associated_accessions', 'pro_ac', + existing_type=sa.TEXT(), + nullable=True, + schema='uta') + op.alter_column('associated_accessions', 'tx_ac', + existing_type=sa.TEXT(), + nullable=True, + schema='uta') + # ### end Alembic commands ### diff --git a/src/alembic/versions/edadb97f6502_initial_state.py b/src/alembic/versions/edadb97f6502_initial_state.py new file mode 100644 index 0000000..ba46093 --- /dev/null +++ b/src/alembic/versions/edadb97f6502_initial_state.py @@ -0,0 +1,390 @@ +"""initial state + +Revision ID: edadb97f6502 +Revises: +Create Date: 2024-04-03 21:41:05.875580 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + + +# revision identifiers, used by Alembic. +revision: str = 'edadb97f6502' +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('gene', + sa.Column('hgnc', sa.Text(), nullable=False), + sa.Column('maploc', sa.Text(), nullable=True), + sa.Column('descr', sa.Text(), nullable=True), + sa.Column('summary', sa.Text(), nullable=True), + sa.Column('aliases', sa.Text(), nullable=True), + sa.Column('added', sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint('hgnc'), + schema='uta' + ) + op.create_table('meta', + sa.Column('key', sa.Text(), nullable=False), + sa.Column('value', sa.Text(), nullable=False), + sa.PrimaryKeyConstraint('key'), + schema='uta' + ) + op.create_table('origin', + sa.Column('origin_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('name', sa.Text(), nullable=False), + sa.Column('descr', sa.Text(), nullable=True), + sa.Column('updated', sa.DateTime(), nullable=True), + sa.Column('url', sa.Text(), nullable=True), + sa.Column('url_ac_fmt', sa.Text(), nullable=True), + sa.PrimaryKeyConstraint('origin_id'), + sa.UniqueConstraint('name'), + schema='uta' + ) + op.create_table('seq', + sa.Column('seq_id', sa.Text(), nullable=False), + sa.Column('len', sa.Integer(), nullable=False), + sa.Column('seq', sa.Text(), nullable=True), + sa.PrimaryKeyConstraint('seq_id'), + schema='uta' + ) + op.create_table('seq_anno', + sa.Column('seq_anno_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('seq_id', sa.Text(), nullable=True), + sa.Column('origin_id', sa.Integer(), nullable=False), + sa.Column('ac', sa.Text(), nullable=False), + sa.Column('descr', sa.Text(), nullable=True), + sa.Column('added', sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint(['origin_id'], ['uta.origin.origin_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.ForeignKeyConstraint(['seq_id'], ['uta.seq.seq_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('seq_anno_id'), + schema='uta' + ) + op.create_index(op.f('ix_uta_seq_anno_ac'), 'seq_anno', ['ac'], unique=False, schema='uta') + op.create_index(op.f('ix_uta_seq_anno_seq_id'), 'seq_anno', ['seq_id'], unique=False, schema='uta') + op.create_index('seq_anno_ac_unique_in_origin', 'seq_anno', ['origin_id', 'ac'], unique=True, schema='uta') + op.create_table('transcript', + sa.Column('ac', sa.Text(), nullable=False), + sa.Column('origin_id', sa.Integer(), nullable=False), + sa.Column('hgnc', sa.Text(), nullable=True), + sa.Column('cds_start_i', sa.Integer(), nullable=True), + sa.Column('cds_end_i', sa.Integer(), nullable=True), + sa.Column('cds_md5', sa.Text(), nullable=True), + sa.Column('added', sa.DateTime(), nullable=False), + sa.CheckConstraint('cds_start_i <= cds_end_i', name='cds_start_i_must_be_le_cds_end_i'), + sa.ForeignKeyConstraint(['origin_id'], ['uta.origin.origin_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('ac'), + schema='uta' + ) + op.create_index(op.f('ix_uta_transcript_cds_md5'), 'transcript', ['cds_md5'], unique=False, schema='uta') + op.create_index(op.f('ix_uta_transcript_origin_id'), 'transcript', ['origin_id'], unique=False, schema='uta') + op.create_table('exon_set', + sa.Column('exon_set_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('tx_ac', sa.Text(), nullable=False), + sa.Column('alt_ac', sa.Text(), nullable=False), + sa.Column('alt_strand', sa.SmallInteger(), nullable=False), + sa.Column('alt_aln_method', sa.Text(), nullable=False), + sa.Column('added', sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint(['tx_ac'], ['uta.transcript.ac'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('exon_set_id'), + 
sa.UniqueConstraint('tx_ac', 'alt_ac', 'alt_aln_method', name='<tx_ac,alt_ac,alt_aln_method> must be unique'),
+    schema='uta'
+    )
+    op.create_table('exon',
+    sa.Column('exon_id', sa.Integer(), autoincrement=True, nullable=False),
+    sa.Column('exon_set_id', sa.Integer(), nullable=False),
+    sa.Column('start_i', sa.Integer(), nullable=False),
+    sa.Column('end_i', sa.Integer(), nullable=False),
+    sa.Column('ord', sa.Integer(), nullable=False),
+    sa.Column('name', sa.Text(), nullable=True),
+    sa.CheckConstraint('start_i < end_i', name='exon_start_i_must_be_lt_end_i'),
+    sa.ForeignKeyConstraint(['exon_set_id'], ['uta.exon_set.exon_set_id'], onupdate='CASCADE', ondelete='CASCADE'),
+    sa.PrimaryKeyConstraint('exon_id'),
+    sa.UniqueConstraint('exon_set_id', 'end_i', name='end_i_must_be_unique_in_exon_set'),
+    sa.UniqueConstraint('exon_set_id', 'start_i', name='start_i_must_be_unique_in_exon_set'),
+    schema='uta'
+    )
+    op.create_index(op.f('ix_uta_exon_exon_set_id'), 'exon', ['exon_set_id'], unique=False, schema='uta')
+    op.create_table('exon_aln',
+    sa.Column('exon_aln_id', sa.Integer(), autoincrement=True, nullable=False),
+    sa.Column('tx_exon_id', sa.Integer(), nullable=False),
+    sa.Column('alt_exon_id', sa.Integer(), nullable=False),
+    sa.Column('cigar', sa.Text(), nullable=False),
+    sa.Column('added', sa.DateTime(), nullable=False),
+    sa.Column('tx_aseq', sa.Text(), nullable=True),
+    sa.Column('alt_aseq', sa.Text(), nullable=True),
+    sa.ForeignKeyConstraint(['alt_exon_id'], ['uta.exon.exon_id'], onupdate='CASCADE', ondelete='CASCADE'),
+    sa.ForeignKeyConstraint(['tx_exon_id'], ['uta.exon.exon_id'], onupdate='CASCADE', ondelete='CASCADE'),
+    sa.PrimaryKeyConstraint('exon_aln_id'),
+    schema='uta'
+    )
+    op.create_index(op.f('ix_uta_exon_aln_alt_exon_id'), 'exon_aln', ['alt_exon_id'], unique=False, schema='uta')
+    op.create_index(op.f('ix_uta_exon_aln_tx_exon_id'), 'exon_aln', ['tx_exon_id'], unique=False, schema='uta')
+    # ### end Alembic commands ###
+
+    # ### custom commands to match the initial UTA database schema 1.1 ###
+    op.create_table('associated_accessions',
+    sa.Column('associated_accession_id', sa.Integer(), autoincrement=True, nullable=False),
+    sa.Column('tx_ac', sa.Text(), nullable=True),
+    sa.Column('pro_ac', sa.Text(), nullable=True),
+    sa.Column('origin', sa.Text(), nullable=False),
+    sa.Column('added', postgresql.TIMESTAMP(timezone=True), server_default=sa.text('now()'), nullable=False),
+    sa.PrimaryKeyConstraint('associated_accession_id'),
+    schema='uta'
+    )
+    op.create_index('associated_accessions_tx_ac', 'associated_accessions', ['tx_ac'], unique=False, schema='uta')
+    op.create_index('associated_accessions_pro_ac', 'associated_accessions', ['pro_ac'], unique=False, schema='uta')
+    op.create_index('unique_pair_in_origin', 'associated_accessions', ['origin', 'tx_ac', 'pro_ac'], unique=True, schema='uta')
+    op.create_table_comment('associated_accessions', 'transcript-protein accession pairs associated in source databases', schema='uta')
+    # ### end custom commands ###
+
+    # ### custom SQL to add views to match the initial UTA database schema 1.1 ###
+    op.execute("""
+        CREATE VIEW _cds_exons_v AS
+        WITH cds_exons as (
+            SELECT ES.exon_set_id, T.ac AS tx_ac, E.ord,
+                E.start_i, E.end_i,
+                CASE WHEN E.end_i >= T.cds_start_i AND E.start_i <= T.cds_end_i THEN greatest(E.start_i,T.cds_start_i) ELSE NULL end AS cds_ex_start_i,
+                CASE WHEN E.end_i >= T.cds_start_i AND E.start_i <= T.cds_end_i THEN least(E.end_i,T.cds_end_i) ELSE NULL end AS cds_ex_end_i
+            FROM transcript T
+            JOIN exon_set ES ON T.ac = ES.tx_ac AND 
ES.alt_aln_METHOD = 'transcript' + JOIN exon E ON ES.exon_set_id=E.exon_set_id + WHERE T.cds_start_i IS NOT NULL AND T.cds_end_i IS NOT NULL + ) + select *, end_i - start_i as ex_len, cds_ex_end_i - cds_ex_start_i as cds_ex_len from cds_exons; + """) + op.execute(""" + CREATE VIEW _cds_exons_flat_v AS + SELECT exon_set_id,tx_ac,MIN(ord) AS cds_start_exon,MAX(ord) AS cds_end_exon, + ARRAY_TO_STRING(ARRAY_AGG(format('%s,%s',cds_ex_start_i,cds_ex_end_i) ORDER BY ord),';') AS cds_se_i, + ARRAY_TO_STRING(ARRAY_AGG(cds_ex_len ORDER BY ord),';') AS cds_exon_lengths + FROM _cds_exons_v + WHERE cds_ex_start_i IS NOT NULL + GROUP BY exon_set_id, tx_ac; + """) + op.execute(""" + CREATE VIEW _seq_anno_most_recent AS + SELECT DISTINCT ON (ac) * + FROM seq_anno + ORDER BY ac,added DESC; + """) + op.execute(""" + CREATE VIEW _cds_exons_fp_v AS + SELECT SA.seq_id, md5(format('%s;%s',LOWER(SA.seq_id),CTEF.cds_se_i)) AS cds_es_fp, + md5(cds_exon_lengths) AS cds_exon_lengths_fp, CTEF.* + FROM _cds_exons_flat_v CTEF + JOIN _seq_anno_most_recent SA ON CTEF.tx_ac=SA.ac; + """) + op.execute(""" + CREATE VIEW _discontiguous_tx AS + SELECT t.hgnc, + es.exon_set_id, + es.tx_ac, + format('[%s-%s]'::text, e1.end_i, e2.start_i) AS gap, + e1.exon_id AS e1_exon_id, + e1.ord AS e1_ord, + e1.start_i AS e1_start_i, + e1.end_i AS e1_end_i, + e2.exon_id AS e2_exon_id, + e2.ord AS e2_ord, + e2.start_i AS e2_start_i, + e2.end_i AS e2_end_i + FROM exon_set es + LEFT JOIN transcript t ON es.tx_ac = t.ac + JOIN exon e1 ON es.exon_set_id = e1.exon_set_id + JOIN exon e2 ON es.exon_set_id = e2.exon_set_id AND e2.ord = (e1.ord + 1) AND e1.end_i <> e2.start_i + WHERE es.alt_aln_method = 'transcript'::text; + """) + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT T.hgnc,TES.exon_SET_id AS tes_exon_SET_id,AES.exon_SET_id AS aes_exon_SET_id, + TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,AES.alt_strand,AES.alt_aln_method, + TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,AEX.exon_id AS alt_exon_id, + TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, AEX.start_i AS alt_start_i,AEX.END_i AS alt_END_i, + EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_aln_v AS + SELECT T.hgnc,T.ac as tx_ac,AES.alt_ac,AES.alt_aln_method,AES.alt_strand, + TE.ord, TE.start_i as tx_start_i,TE.end_i as tx_end_i, + AE.start_i as alt_start_i, AE.end_i as alt_end_i, + EA.cigar, EA.tx_aseq, EA.alt_aseq, + TES.exon_set_id AS tx_exon_set_id,AES.exon_set_id as alt_exon_set_id, + TE.exon_id as tx_exon_id, AE.exon_id as alt_exon_id, + EA.exon_aln_id + FROM transcript T + JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' + JOIN exon_set AES on T.ac=AES.tx_ac and AES.alt_aln_method!='transcript' + JOIN exon TE ON TES.exon_set_id=TE.exon_set_id + JOIN exon AE ON AES.exon_set_id=AE.exon_set_id AND TE.ord=AE.ord + LEFT JOIN exon_aln EA ON TE.exon_id=EA.tx_exon_id AND AE.exon_id=EA.alt_exon_id; + """) + op.execute(""" + CREATE VIEW exon_set_exons_v AS + SELECT ES.*,EL.n_exons,EL.se_i,EL.starts_i,EL.ends_i,EL.lengths + FROM exon_set ES + JOIN (SELECT + iES.exon_set_id, + count(*) AS n_exons, + array_to_string(array_agg(format('%s,%s',iE.start_i,iE.end_i) ORDER 
BY iE.ord),';') AS se_i,
+                array_agg(iE.start_i ORDER BY iE.ord) AS starts_i,
+                array_agg(iE.end_i ORDER BY iE.ord) AS ends_i,
+                array_agg((iE.end_i-iE.start_i) ORDER BY iE.ord) AS lengths
+            FROM exon_set iES
+            JOIN exon iE ON iES.exon_set_id=iE.exon_set_id
+            GROUP BY iES.exon_set_id) EL
+        ON ES.exon_set_id = EL.exon_set_id;
+    """)
+    op.execute("""
+        COMMENT ON VIEW exon_set_exons_v IS 'defining view of "flat" (aggregated) exons on a sequence; use _mv; for faster materialized version';
+    """)
+    op.execute("""
+        CREATE VIEW exon_set_exons_fp_v AS
+        SELECT ESE.*,md5(format('%s;%s',lower(ASA.seq_id),ESE.se_i)) AS es_fingerprint
+        FROM exon_set_exons_v ESE
+        JOIN _seq_anno_most_recent ASA ON ESE.alt_ac=ASA.ac;
+    """)
+    op.execute("""
+        COMMENT ON VIEW exon_set_exons_fp_v IS 'flattened (aggregated) exons with exon set fingerprint';
+    """)
+    op.execute("""
+        CREATE MATERIALIZED VIEW exon_set_exons_fp_mv AS SELECT * FROM exon_set_exons_fp_v WITH NO DATA;
+        CREATE INDEX exon_set_exons_fp_mv_tx_ac_ix ON exon_set_exons_fp_mv(tx_ac);
+        CREATE INDEX exon_set_exons_fp_mv_alt_ac_ix ON exon_set_exons_fp_mv(alt_ac);
+        CREATE INDEX exon_set_exons_fp_mv_alt_aln_method_ix ON exon_set_exons_fp_mv(alt_aln_method);
+        GRANT SELECT ON exon_set_exons_fp_mv TO public;
+    """)
+    op.execute("""
+        CREATE OR replace VIEW tx_exon_set_summary_dv AS
+        SELECT hgnc,cds_md5,es_fingerprint,tx_ac,alt_ac,alt_aln_method,alt_strand,exon_set_id,n_exons,se_i,starts_i,ends_i,lengths
+        FROM transcript T
+        JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac;
+    """)
+    op.execute("""
+        CREATE MATERIALIZED VIEW tx_exon_set_summary_mv AS SELECT * FROM tx_exon_set_summary_dv WITH NO DATA;
+        CREATE INDEX tx_exon_set_summary_mv_cds_md5_ix ON tx_exon_set_summary_mv(cds_md5);
+        CREATE INDEX tx_exon_set_summary_mv_es_fingerprint_ix ON tx_exon_set_summary_mv(es_fingerprint);
+        CREATE INDEX tx_exon_set_summary_mv_tx_ac_ix ON tx_exon_set_summary_mv(tx_ac);
+        CREATE INDEX tx_exon_set_summary_mv_alt_ac_ix ON tx_exon_set_summary_mv(alt_ac);
+        CREATE INDEX tx_exon_set_summary_mv_alt_aln_method_ix ON tx_exon_set_summary_mv(alt_aln_method);
+        GRANT SELECT ON tx_exon_set_summary_mv TO public;
+    """)
+    op.execute("""
+        CREATE VIEW tx_def_summary_dv AS
+        SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand,
+            TESS.hgnc, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, CEF.cds_exon_lengths_fp,
+            TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, TESS.ends_i, TESS.lengths,
+            T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon
+        FROM tx_exon_set_summary_mv TESS
+        JOIN transcript T ON TESS.tx_ac=T.ac
+        LEFT JOIN _cds_exons_fp_v CEF ON TESS.exon_set_id=CEF.exon_set_id
+        WHERE TESS.alt_aln_method = 'transcript';
+    """)
+    op.execute("""
+        COMMENT ON VIEW tx_def_summary_dv IS 'transcript definitions, with exon structures';
+    """)
+    op.execute("""
+        CREATE MATERIALIZED VIEW tx_def_summary_mv AS SELECT * FROM tx_def_summary_dv WITH NO DATA;
+    """)
+    op.execute("""
+        COMMENT ON MATERIALIZED VIEW tx_def_summary_mv IS 'transcript definitions, with exon structures and fingerprints';
+    """)
+    op.execute("""
+        create index tx_def_summary_mv_tx_ac on tx_def_summary_mv (tx_ac);
+        create index tx_def_summary_mv_alt_ac on tx_def_summary_mv (alt_ac);
+        create index tx_def_summary_mv_alt_aln_method on tx_def_summary_mv (alt_aln_method);
+        create index tx_def_summary_mv_hgnc on tx_def_summary_mv (hgnc);
+    """)
+    op.execute("""
+        CREATE VIEW 
tx_def_summary_v AS
+        SELECT * FROM tx_def_summary_mv;
+    """)
+    op.execute("""
+        CREATE OR REPLACE VIEW tx_similarity_v AS
+        SELECT DISTINCT
+            D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2,
+            D1.hgnc = D2.hgnc as hgnc_eq,
+            D1.cds_md5=D2.cds_md5 as cds_eq,
+            D1.es_fingerprint=D2.es_fingerprint as es_fp_eq,
+            D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq,
+            D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq
+        FROM tx_def_summary_mv D1
+        JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac
+            and (D1.hgnc=D2.hgnc
+                or D1.cds_md5=D2.cds_md5
+                or D1.es_fingerprint=D2.es_fingerprint
+                or D1.cds_es_fp=D2.cds_es_fp
+                or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp
+            ));
+    """)
+    # ### end custom SQL commands ###
+
+
+def downgrade() -> None:
+    # ### custom SQL to remove views ###
+    op.execute("DROP VIEW tx_similarity_v CASCADE;")
+    op.execute("DROP VIEW tx_def_summary_v CASCADE;")
+    op.execute("DROP INDEX tx_def_summary_mv_hgnc CASCADE")
+    op.execute("DROP INDEX tx_def_summary_mv_alt_aln_method CASCADE")
+    op.execute("DROP INDEX tx_def_summary_mv_alt_ac CASCADE")
+    op.execute("DROP INDEX tx_def_summary_mv_tx_ac CASCADE")
+    op.execute("DROP MATERIALIZED VIEW tx_def_summary_mv CASCADE;")
+    op.execute("DROP VIEW tx_def_summary_dv CASCADE;")
+    op.execute("DROP MATERIALIZED VIEW tx_exon_set_summary_mv CASCADE;")
+    op.execute("DROP VIEW tx_exon_set_summary_dv CASCADE;")
+    op.execute("DROP MATERIALIZED VIEW exon_set_exons_fp_mv CASCADE;")
+    op.execute("DROP VIEW exon_set_exons_fp_v CASCADE;")
+    op.execute("DROP VIEW exon_set_exons_v CASCADE;")
+    op.execute("DROP VIEW tx_exon_aln_v CASCADE;")
+    op.execute("DROP VIEW tx_alt_exon_pairs_v CASCADE;")
+    op.execute("DROP VIEW _discontiguous_tx CASCADE;")
+    op.execute("DROP VIEW _cds_exons_fp_v CASCADE;")
+    op.execute("DROP VIEW _seq_anno_most_recent CASCADE;")
+    op.execute("DROP VIEW _cds_exons_flat_v CASCADE;")
+    op.execute("DROP VIEW _cds_exons_v CASCADE;")
+    # ### end custom SQL commands ###
+
+    # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f('ix_uta_exon_aln_tx_exon_id'), table_name='exon_aln', schema='uta') + op.drop_index(op.f('ix_uta_exon_aln_alt_exon_id'), table_name='exon_aln', schema='uta') + op.drop_table('exon_aln', schema='uta') + op.drop_index(op.f('ix_uta_exon_exon_set_id'), table_name='exon', schema='uta') + op.drop_table('exon', schema='uta') + op.drop_table('exon_set', schema='uta') + op.drop_index(op.f('ix_uta_transcript_origin_id'), table_name='transcript', schema='uta') + op.drop_index(op.f('ix_uta_transcript_cds_md5'), table_name='transcript', schema='uta') + op.drop_table('transcript', schema='uta') + op.drop_index('seq_anno_ac_unique_in_origin', table_name='seq_anno', schema='uta') + op.drop_index(op.f('ix_uta_seq_anno_seq_id'), table_name='seq_anno', schema='uta') + op.drop_index(op.f('ix_uta_seq_anno_ac'), table_name='seq_anno', schema='uta') + op.drop_table('seq_anno', schema='uta') + op.drop_table('seq', schema='uta') + op.drop_table('origin', schema='uta') + op.drop_table('meta', schema='uta') + op.drop_table('gene', schema='uta') + # ### end Alembic commands ### + + # ### custom commands to remove items not autogenerated by Alembic ### + op.drop_index('unique_pair_in_origin', table_name='associated_accessions') + op.drop_index('associated_accessions_pro_ac', table_name='associated_accessions') + op.drop_index('associated_accessions_tx_ac', table_name='associated_accessions') + op.drop_table_comment('associated_accessions', existing_comment='transcript-protein accession pairs associated in source databases', schema='uta') + op.drop_table('associated_accessions') + # ### end custom commands ### \ No newline at end of file diff --git a/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py b/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py new file mode 100644 index 0000000..1548b95 --- /dev/null +++ b/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py @@ -0,0 +1,305 @@ +"""set gene_id and primary and foreign keys + +Revision ID: f85dd97bd9f5 +Revises: 595a586e6de7 +Create Date: 2024-04-10 22:14:14.055461 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'f85dd97bd9f5' +down_revision: Union[str, None] = '595a586e6de7' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column( + "gene", "gene_id", existing_type=sa.TEXT(), nullable=False, schema="uta" + ) + op.create_primary_key("gene_pkey", "gene", ["gene_id"], schema="uta") + op.create_index(op.f("ix_uta_gene_hgnc"), "gene", ["hgnc"], unique=False, schema="uta") + op.alter_column( + "transcript", "gene_id", existing_type=sa.TEXT(), nullable=False, schema="uta" + ) + op.create_index( + op.f("ix_uta_transcript_gene_id"), + "transcript", + ["gene_id"], + unique=False, + schema="uta", + ) + op.create_foreign_key(None, 'transcript', 'gene', ['gene_id'], ['gene_id'], source_schema='uta', referent_schema='uta') + # ### end Alembic commands ### + + # ### handle first part of hgnc -> gene_symbol column rename ### + op.add_column("gene", sa.Column("symbol", sa.Text(), nullable=True), schema="uta") + op.create_index(op.f("ix_uta_gene_symbol"), "gene", ["symbol"], unique=False, schema="uta") + op.execute("UPDATE gene SET symbol = hgnc;") + op.alter_column('gene', 'symbol', + existing_type=sa.TEXT(), + nullable=False, + schema='uta') + # ### end of hgnc -> gene_symbol column rename ### + + # ### updates required to existing views needed to drop hgnc from transcript. ### + op.execute("DROP VIEW IF EXISTS tx_similarity_v CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute("DROP VIEW IF EXISTS _discontiguous_tx CASCADE;") + op.execute(""" + CREATE VIEW _discontiguous_tx AS + SELECT g.symbol, + g.symbol as hgnc, + g.gene_id, + es.exon_set_id, + es.tx_ac, + format('[%s-%s]'::text, e1.end_i, e2.start_i) AS gap, + e1.exon_id AS e1_exon_id, + e1.ord AS e1_ord, + e1.start_i AS e1_start_i, + e1.end_i AS e1_end_i, + e2.exon_id AS e2_exon_id, + e2.ord AS e2_ord, + e2.start_i AS e2_start_i, + e2.end_i AS e2_end_i + FROM exon_set es + JOIN transcript t ON es.tx_ac = t.ac + JOIN gene as g ON t.gene_id = g.gene_id + JOIN exon e1 ON es.exon_set_id = e1.exon_set_id + JOIN exon e2 ON es.exon_set_id = e2.exon_set_id AND e2.ord = (e1.ord + 1) AND e1.end_i <> e2.start_i + WHERE es.alt_aln_method = 'transcript'::text; + """) + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT g.symbol, g.symbol as hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id, + AES.exon_SET_id AS aes_exon_SET_id, TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac, + AES.alt_strand,AES.alt_aln_method, TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id, + AEX.exon_id AS alt_exon_id, TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, + AEX.start_i AS alt_start_i, AEX.END_i AS alt_END_i, EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN gene g ON t.gene_id=g.gene_id + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_aln_v AS + SELECT G.symbol, G.symbol AS hgnc, G.gene_id, T.ac as tx_ac, AES.alt_ac, + AES.alt_aln_method,AES.alt_strand, TE.ord, TE.start_i as tx_start_i, + TE.end_i as tx_end_i, AE.start_i as alt_start_i, AE.end_i as alt_end_i, 
+ EA.cigar, EA.tx_aseq, EA.alt_aseq, TES.exon_set_id AS tx_exon_set_id, + AES.exon_set_id as alt_exon_set_id, TE.exon_id as tx_exon_id, + AE.exon_id as alt_exon_id, EA.exon_aln_id + FROM transcript T + JOIN gene G ON T.gene_id=G.gene_id + JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' + JOIN exon_set AES on T.ac=AES.tx_ac and AES.alt_aln_method!='transcript' + JOIN exon TE ON TES.exon_set_id=TE.exon_set_id + JOIN exon AE ON AES.exon_set_id=AE.exon_set_id AND TE.ord=AE.ord + LEFT JOIN exon_aln EA ON TE.exon_id=EA.tx_exon_id AND AE.exon_id=EA.alt_exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_set_summary_dv AS + SELECT G.symbol, G.symbol as hgnc, G.gene_id, cds_md5, es_fingerprint, tx_ac, alt_ac, + alt_aln_method, alt_strand, exon_set_id, n_exons, se_i, starts_i, ends_i, lengths + FROM transcript T + JOIN gene G ON T.gene_id=G.gene_id + JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_exon_set_summary_mv AS SELECT * FROM tx_exon_set_summary_dv WITH NO DATA; + CREATE INDEX tx_exon_set_summary_mv_cds_md5_ix ON tx_exon_set_summary_mv(cds_md5); + CREATE INDEX tx_exon_set_summary_mv_es_fingerprint_ix ON tx_exon_set_summary_mv(es_fingerprint); + CREATE INDEX tx_exon_set_summary_mv_tx_ac_ix ON tx_exon_set_summary_mv(tx_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_ac_ix ON tx_exon_set_summary_mv(alt_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_aln_method_ix ON tx_exon_set_summary_mv(alt_aln_method); + GRANT SELECT ON tx_exon_set_summary_mv TO public; + REFRESH MATERIALIZED VIEW tx_exon_set_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_def_summary_dv AS + SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand, + TESS.symbol, TESS.hgnc, TESS.gene_id, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, + CEF.cds_exon_lengths_fp, TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, + TESS.ends_i, TESS.lengths, T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon + FROM tx_exon_set_summary_mv TESS + JOIN transcript T ON TESS.tx_ac=T.ac + LEFT JOIN _cds_exons_fp_v CEF ON TESS.exon_set_id=CEF.exon_set_id + WHERE TESS.alt_aln_method = 'transcript'; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_def_summary_mv AS SELECT * FROM tx_def_summary_dv WITH NO DATA; + CREATE INDEX tx_def_summary_mv_tx_ac ON tx_def_summary_mv (tx_ac); + CREATE INDEX tx_def_summary_mv_alt_ac ON tx_def_summary_mv (alt_ac); + CREATE INDEX tx_def_summary_mv_alt_aln_method ON tx_def_summary_mv (alt_aln_method); + CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc); + CREATE INDEX tx_def_summary_mv_symbol ON tx_def_summary_mv (symbol); + CREATE INDEX tx_def_summary_mv_gene_id ON tx_def_summary_mv (gene_id); + REFRESH MATERIALIZED VIEW tx_def_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_similarity_v AS + SELECT DISTINCT + D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2, + D1.hgnc = D2.hgnc as hgnc_eq, + D1.symbol = D2.symbol as symbol_eq, + D1.cds_md5=D2.cds_md5 as cds_eq, + D1.es_fingerprint=D2.es_fingerprint as es_fp_eq, + D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq, + D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq + FROM tx_def_summary_mv D1 + JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac + and (D1.symbol=D2.symbol + or D1.cds_md5=D2.cds_md5 + or D1.es_fingerprint=D2.es_fingerprint + or D1.cds_es_fp=D2.cds_es_fp + or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp + )); + """) + # ### end of updates to existing views ### + + +def downgrade() -> None: + # 
### commands to downgrade views before adding hgnc to transcript ### + op.execute("DROP VIEW IF EXISTS tx_similarity_v CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute("DROP VIEW IF EXISTS _discontiguous_tx CASCADE;") + op.execute(""" + CREATE VIEW _discontiguous_tx AS + SELECT t.hgnc, + es.exon_set_id, + es.tx_ac, + format('[%s-%s]'::text, e1.end_i, e2.start_i) AS gap, + e1.exon_id AS e1_exon_id, + e1.ord AS e1_ord, + e1.start_i AS e1_start_i, + e1.end_i AS e1_end_i, + e2.exon_id AS e2_exon_id, + e2.ord AS e2_ord, + e2.start_i AS e2_start_i, + e2.end_i AS e2_end_i + FROM exon_set es + JOIN transcript t ON es.tx_ac = t.ac + JOIN exon e1 ON es.exon_set_id = e1.exon_set_id + JOIN exon e2 ON es.exon_set_id = e2.exon_set_id AND e2.ord = (e1.ord + 1) AND e1.end_i <> e2.start_i + WHERE es.alt_aln_method = 'transcript'::text; + """) + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT t.hgnc,TES.exon_SET_id AS tes_exon_SET_id,AES.exon_SET_id AS aes_exon_SET_id, + TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,AES.alt_strand,AES.alt_aln_method, + TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,AEX.exon_id AS alt_exon_id, + TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, AEX.start_i AS alt_start_i,AEX.END_i AS alt_END_i, + EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_aln_v AS + SELECT T.hgnc,T.ac as tx_ac,AES.alt_ac,AES.alt_aln_method,AES.alt_strand, + TE.ord, TE.start_i as tx_start_i,TE.end_i as tx_end_i, + AE.start_i as alt_start_i, AE.end_i as alt_end_i, + EA.cigar, EA.tx_aseq, EA.alt_aseq, + TES.exon_set_id AS tx_exon_set_id,AES.exon_set_id as alt_exon_set_id, + TE.exon_id as tx_exon_id, AE.exon_id as alt_exon_id, + EA.exon_aln_id + FROM transcript T + JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' + JOIN exon_set AES on T.ac=AES.tx_ac and AES.alt_aln_method!='transcript' + JOIN exon TE ON TES.exon_set_id=TE.exon_set_id + JOIN exon AE ON AES.exon_set_id=AE.exon_set_id AND TE.ord=AE.ord + LEFT JOIN exon_aln EA ON TE.exon_id=EA.tx_exon_id AND AE.exon_id=EA.alt_exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_set_summary_dv AS + SELECT T.hgnc,cds_md5,es_fingerprint,tx_ac,alt_ac,alt_aln_method,alt_strand,exon_set_id,n_exons,se_i,starts_i,ends_i,lengths + FROM transcript T + JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_exon_set_summary_mv AS SELECT * FROM tx_exon_set_summary_dv WITH NO DATA; + CREATE INDEX tx_exon_set_summary_mv_cds_md5_ix ON tx_exon_set_summary_mv(cds_md5); + CREATE INDEX tx_exon_set_summary_mv_es_fingerprint_ix ON tx_exon_set_summary_mv(es_fingerprint); + CREATE INDEX tx_exon_set_summary_mv_tx_ac_ix ON tx_exon_set_summary_mv(tx_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_ac_ix ON tx_exon_set_summary_mv(alt_ac); + 
CREATE INDEX tx_exon_set_summary_mv_alt_aln_method_ix ON tx_exon_set_summary_mv(alt_aln_method); + GRANT SELECT ON tx_exon_set_summary_mv TO public; + REFRESH MATERIALIZED VIEW tx_exon_set_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_def_summary_dv AS + SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand, + TESS.hgnc, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, + CEF.cds_exon_lengths_fp, TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, + TESS.ends_i, TESS.lengths, T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon + FROM tx_exon_set_summary_mv TESS + JOIN transcript T ON TESS.tx_ac=T.ac + LEFT JOIN _cds_exons_fp_v CEF ON TESS.exon_set_id=CEF.exon_set_id + WHERE TESS.alt_aln_method = 'transcript'; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_def_summary_mv AS SELECT * FROM tx_def_summary_dv WITH NO DATA; + CREATE INDEX tx_def_summary_mv_tx_ac ON tx_def_summary_mv (tx_ac); + CREATE INDEX tx_def_summary_mv_alt_ac ON tx_def_summary_mv (alt_ac); + CREATE INDEX tx_def_summary_mv_alt_aln_method ON tx_def_summary_mv (alt_aln_method); + CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc); + REFRESH MATERIALIZED VIEW tx_def_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_similarity_v AS + SELECT DISTINCT + D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2, + D1.hgnc = D2.hgnc as hgnc_eq, + D1.cds_md5=D2.cds_md5 as cds_eq, + D1.es_fingerprint=D2.es_fingerprint as es_fp_eq, + D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq, + D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq + FROM tx_def_summary_mv D1 + JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac + and (D1.hgnc=D2.hgnc + or D1.cds_md5=D2.cds_md5 + or D1.es_fingerprint=D2.es_fingerprint + or D1.cds_es_fp=D2.cds_es_fp + or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp + )); + """) + # ### end of updates to views ### + + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint(None, 'transcript', schema='uta', type_='foreignkey') + op.drop_index(op.f("ix_uta_transcript_gene_id"), table_name="transcript", schema="uta") + op.alter_column("transcript", "gene_id", + existing_type=sa.TEXT(), + nullable=True, + schema="uta") + op.drop_index(op.f("ix_uta_gene_hgnc"), table_name="gene", schema="uta") + op.drop_constraint("gene_pkey", "gene", schema="uta") + op.alter_column("gene", "gene_id", + existing_type=sa.TEXT(), + nullable=True, + schema="uta") + op.drop_index(op.f("ix_uta_gene_symbol"), table_name="gene", schema="uta") + op.drop_column("gene", "symbol", schema="uta") + # ### end Alembic commands ### diff --git a/src/alembic/versions/f885cb84efce_update_tx_alt_exon_pairs_v.py b/src/alembic/versions/f885cb84efce_update_tx_alt_exon_pairs_v.py new file mode 100644 index 0000000..5ecaa9a --- /dev/null +++ b/src/alembic/versions/f885cb84efce_update_tx_alt_exon_pairs_v.py @@ -0,0 +1,57 @@ +"""update tx_alt_exon_pairs_v + +Revision ID: f885cb84efce +Revises: 14eed54ff90d +Create Date: 2024-05-07 21:01:03.693969 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'f885cb84efce' +down_revision: Union[str, None] = '14eed54ff90d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT g.symbol, g.symbol as hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id, + AES.exon_SET_id AS aes_exon_SET_id, TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac, + AES.alt_strand,AES.alt_aln_method, TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id, + AEX.exon_id AS alt_exon_id, TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, + AEX.start_i AS alt_start_i, AEX.END_i AS alt_END_i, EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN gene g ON t.gene_id=g.gene_id + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method !~ 'transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + + +def downgrade() -> None: + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT g.symbol, g.symbol as hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id, + AES.exon_SET_id AS aes_exon_SET_id, TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac, + AES.alt_strand,AES.alt_aln_method, TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id, + AEX.exon_id AS alt_exon_id, TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, + AEX.start_i AS alt_start_i, AEX.END_i AS alt_END_i, EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN gene g ON t.gene_id=g.gene_id + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + diff --git a/src/uta/__init__.py b/src/uta/__init__.py index ff05e64..eb783bc 100644 --- a/src/uta/__init__.py +++ b/src/uta/__init__.py @@ -24,7 +24,7 @@ def connect(db_url=default_db_url): """ - Connect to a UTA database instance and return a UTA0 interface instance. + Connect to a UTA database instance and return a sqlalchemy Session. When called with an explicit db_url argument, that db_url is used for connecting. diff --git a/src/uta/cli.py b/src/uta/cli.py index 4b85440..bda63b0 100644 --- a/src/uta/cli.py +++ b/src/uta/cli.py @@ -6,6 +6,7 @@ uta (-C CONF ...) [options] shell uta (-C CONF ...) [options] drop-schema uta (-C CONF ...) [options] create-schema + uta (-C CONF ...) [options] update-meta-data uta (-C CONF ...) [options] load-sql FILES ... uta (-C CONF ...) [options] rebuild uta (-C CONF ...) [options] load-origin FILE @@ -13,13 +14,14 @@ uta (-C CONF ...) [options] load-geneinfo FILE uta (-C CONF ...) [options] load-txinfo FILE uta (-C CONF ...) [options] load-exonset FILE + uta (-C CONF ...) [options] load-assoc-ac FILE uta (-C CONF ...) [options] load-sequences uta (-C CONF ...) [options] align-exons [--sql SQL] uta (-C CONF ...) [options] load-ncbi-seqgene FILE uta (-C CONF ...) [options] grant-permissions uta (-C CONF ...) [options] refresh-matviews uta (-C CONF ...) 
[options] analyze - + Options: -C CONF, --conf CONF Configuration to read (required) @@ -67,8 +69,10 @@ def main(): ("align-exons", ul.align_exons), ("analyze", ul.analyze), ("create-schema", ul.create_schema), + ("update-meta-data", ul.update_meta_data), ("drop-schema", ul.drop_schema), ("grant-permissions", ul.grant_permissions), + ("load-assoc-ac", ul.load_assoc_ac), ("load-exonset", ul.load_exonset), ("load-geneinfo", ul.load_geneinfo), ("load-origin", ul.load_origin), @@ -118,13 +122,10 @@ def main(): cmd=cmd, elapsed=time.time() - t0)) - if __name__ == "__main__": main() - - # # Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta) ## diff --git a/src/uta/exceptions.py b/src/uta/exceptions.py index 36454a2..4691ed8 100644 --- a/src/uta/exceptions.py +++ b/src/uta/exceptions.py @@ -17,6 +17,15 @@ class InvalidIntervalError(UTAError): class InvalidHGVSVariantError(UTAError): pass + +class EutilsDownloadError(Exception): + pass + + +class ExonStructureMismatchError(UTAError): + pass + + # # Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta) ## diff --git a/src/uta/formats/geneaccessions.py b/src/uta/formats/geneaccessions.py index 9837d82..3b5985b 100644 --- a/src/uta/formats/geneaccessions.py +++ b/src/uta/formats/geneaccessions.py @@ -3,7 +3,7 @@ class GeneAccessions(recordtype.recordtype('GeneAccessions', - ['hgnc', 'tx_ac', 'gene_id', 'pro_ac', 'origin'])): + ['gene_symbol', 'tx_ac', 'gene_id', 'pro_ac', 'origin'])): pass diff --git a/src/uta/formats/geneinfo.py b/src/uta/formats/geneinfo.py index f4a6015..e094adf 100644 --- a/src/uta/formats/geneinfo.py +++ b/src/uta/formats/geneinfo.py @@ -5,7 +5,7 @@ class GeneInfo(recordtype.recordtype('GeneInfo', - ['gene_id', 'tax_id', 'hgnc', 'maploc', 'aliases', 'type', 'summary', 'descr', 'xrefs'])): + ['gene_id', 'gene_symbol', 'tax_id', 'hgnc', 'maploc', 'aliases', 'type', 'summary', 'descr', 'xrefs'])): pass @@ -38,7 +38,6 @@ def __next__(self): return GeneInfo(**d) - if __name__ == '__main__': tmpfn = '/tmp/exonset' diff --git a/src/uta/formats/txinfo.py b/src/uta/formats/txinfo.py index f8d6cf5..42660a4 100644 --- a/src/uta/formats/txinfo.py +++ b/src/uta/formats/txinfo.py @@ -1,10 +1,36 @@ import csv import recordtype +from typing import List, Optional -class TxInfo(recordtype.recordtype('TxInfo', - ['origin', 'ac', 'hgnc', 'cds_se_i', 'exons_se_i'])): - pass +# transl_except should be a semicolon-separated list: +# (pos:333..335,aa:Sec);(pos:1017,aa:TERM) +class TxInfo( + recordtype.recordtype( + 'TxInfo', + ['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i', 'codon_table', 'transl_except'], +)): + + @staticmethod + def serialize_transl_except(transl_except_list: Optional[List[str]]) -> Optional[str]: + """Helper for formatting transl_except list as a string.""" + if transl_except_list is None: + return None + else: + return ";".join(transl_except_list) + + @staticmethod + def serialize_cds_se_i(cds_se_i: Optional[tuple]) -> Optional[str]: + """Helper for formatting cds_se_i tuple as a string.""" + if cds_se_i is None: + return None + else: + return "{},{}".format(*cds_se_i) + + @staticmethod + def serialize_exons_se_i(exons_se_i: List[tuple]) -> str: + """Helper for formatting exons_se_i list as a string.""" + return ";".join(["{},{}".format(*ese) for ese in exons_se_i]) class TxInfoWriter(csv.DictWriter): diff --git a/src/uta/loading.py b/src/uta/loading.py index 2bc9b5e..3301c40 100644 --- a/src/uta/loading.py +++ b/src/uta/loading.py @@ -7,16 +7,19 @@ import itertools import 
logging import time +from typing import Any, Dict, List from biocommons.seqrepo import SeqRepo from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND from bioutils.digests import seq_md5 from bioutils.sequences import reverse_complement from sqlalchemy.exc import IntegrityError +from sqlalchemy.orm import Session from sqlalchemy.orm.exc import NoResultFound +from sqlalchemy import text import psycopg2.extras import six -import uta_align.align.algorithms as utaaa +from uta_align.align.algorithms import cigar_alignment, needleman_wunsch_gotoh_align from uta.lru_cache import lru_cache @@ -27,6 +30,7 @@ import uta.formats.txinfo as ufti import uta.parsers.geneinfo import uta.parsers.seqgene +from uta.exceptions import ExonStructureMismatchError usam = uta.models @@ -46,10 +50,10 @@ def _get_cursor(con): return cur def align(s1, s2): - score, cigar = utaaa.needleman_wunsch_gotoh_align(s1.encode("ascii"), - s2.encode("ascii"), - extended_cigar=True) - tx_aseq, alt_aseq = utaaa.cigar_alignment( + score, cigar = needleman_wunsch_gotoh_align(s1.encode("ascii"), + s2.encode("ascii"), + extended_cigar=True) + tx_aseq, alt_aseq = cigar_alignment( tx_seq, alt_seq, cigar, hide_match=False) return tx_aseq.decode("ascii"), alt_aseq.decode("ascii"), cigar.to_string().decode("ascii") @@ -150,27 +154,27 @@ def _fetch_seq(ac, s, e): def analyze(session, opts, cf): - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) cmds = [ "analyze verbose" ] for cmd in cmds: logger.info(cmd) - session.execute(cmd) + session.execute(text(cmd)) session.commit() def create_schema(session, opts, cf): """Create and populate initial schema""" - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) if session.bind.name == "postgresql" and usam.use_schema: - session.execute("create schema " + usam.schema_name) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("create schema " + usam.schema_name)) + session.execute(text("set search_path = " + usam.schema_name)) session.commit() usam.Base.metadata.create_all(session.bind) @@ -184,11 +188,38 @@ def create_schema(session, opts, cf): logger.info("created schema") +def update_meta_data(session, opts, cf): + """Update Meta table with schema version""" + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) + + # check if schema version is up-to-date + md_schema_version = session.query(usam.Meta).filter_by(key="schema_version").one() + if md_schema_version.value != usam.schema_version: + logger.info(f"updating schema version from {md_schema_version.value} to {usam.schema_version}") + md_schema_version.value = usam.schema_version + session.commit() + else: + logger.info(f"schema version {md_schema_version.value} is already up-to-date") + + # set updated on + md_updated_on = session.query(usam.Meta).filter_by(key="updated on").one_or_none() + if md_updated_on is None: + session.add(usam.Meta(key="updated on", 
value=datetime.datetime.now().isoformat())) + session.commit() + logger.info("added updated on") + else: + md_updated_on.value = datetime.datetime.now().isoformat() + session.commit() + logger.info("updated updated on") + + def drop_schema(session, opts, cf): if session.bind.name == "postgresql" and usam.use_schema: session.execute( - "set role {admin_role};".format(admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + text("set role {admin_role};".format(admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) ddl = "drop schema if exists " + usam.schema_name + " cascade" session.execute(ddl) @@ -199,9 +230,9 @@ def drop_schema(session, opts, cf): def grant_permissions(session, opts, cf): schema = usam.schema_name - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) cmds = [ # alter db doesn't belong here, and probably better to avoid the implicit behevior this encourages @@ -211,60 +242,134 @@ def grant_permissions(session, opts, cf): sql = "select concat(schemaname,'.',tablename) as fqrn from pg_tables where schemaname='{schema}'".format( schema=schema) - rows = list(session.execute(sql)) + rows = list(session.execute(text(sql))) cmds += ["grant select on {fqrn} to PUBLIC".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] cmds += ["alter table {fqrn} owner to uta_admin".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] sql = "select concat(schemaname,'.',viewname) as fqrn from pg_views where schemaname='{schema}'".format( schema=schema) - rows = list(session.execute(sql)) + rows = list(session.execute(text(sql))) cmds += ["grant select on {fqrn} to PUBLIC".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] cmds += ["alter view {fqrn} owner to uta_admin".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] sql = "select concat(schemaname,'.',matviewname) as fqrn from pg_matviews where schemaname='{schema}'".format( schema=schema) - rows = list(session.execute(sql)) + rows = list(session.execute(text(sql))) cmds += ["grant select on {fqrn} to PUBLIC".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] cmds += ["alter materialized view {fqrn} owner to uta_admin".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] for cmd in sorted(cmds): logger.info(cmd) - session.execute(cmd) + session.execute(text(cmd)) session.commit() +def load_assoc_ac(session, opts, cf): + """ + Insert rows into `associated_accessions` table in the UTA database, + using data from a file written by sbin/assoc-acs-merge. 
+ """ + logger.info("load_assoc_ac") + + admin_role = cf.get("uta", "admin_role") + session.execute(text(f"set role {admin_role};")) + session.execute(text(f"set search_path = {usam.schema_name};")) + fname = opts["FILE"] + + with gzip.open(fname, "rt") as fhandle: + for file_row in csv.DictReader(fhandle, delimiter="\t"): + row = { + "origin": file_row["origin"], + "pro_ac": file_row["pro_ac"], + "tx_ac": file_row["tx_ac"], + } + aa, created = _get_or_insert( + session=session, + table=usam.AssociatedAccessions, + row=row, + row_identifier=('origin', 'tx_ac', 'pro_ac'), + ) + if created: + # If committing on every insert is too slow, we can + # look into committing in batches like load_txinfo does. + session.commit() + logger.info(f"Added: {aa.tx_ac}, {aa.pro_ac}, {aa.origin}") + else: + logger.info(f"Already exists: {file_row}") + # All fields should should match when unique identifiers match. + # Discrepancies should be investigated. + existing_row = { + "origin": aa.origin, + "pro_ac": aa.pro_ac, + "tx_ac": aa.tx_ac, + } + + def load_exonset(session, opts, cf): # exonsets and associated exons are loaded together update_period = 25 - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute( + text("set role {admin_role};".format(admin_role=cf.get("uta", "admin_role"))) + ) + session.execute(text("set search_path = " + usam.schema_name)) - n_rows = len(gzip.open(opts["FILE"], 'rt').readlines()) - 1 - esr = ufes.ExonSetReader(gzip.open(opts["FILE"], 'rt')) + n_rows = len(gzip.open(opts["FILE"], "rt").readlines()) - 1 + esr = ufes.ExonSetReader(gzip.open(opts["FILE"], "rt")) logger.info("opened " + opts["FILE"]) n_new = 0 n_unchanged = 0 n_deprecated = 0 + n_skipped = 0 n_errors = 0 for i_es, es in enumerate(esr): + skipped = False try: - n, o = _upsert_exon_set_record(session, es.tx_ac, es.alt_ac, es.strand, es.method, es.exons_se_i) - session.commit() + # determine if alignment and transcript have the same exon structure + tx_es = ( + session.query(usam.ExonSet) + .filter( + usam.ExonSet.tx_ac == es.tx_ac, + usam.ExonSet.alt_ac == es.tx_ac, + usam.ExonSet.alt_aln_method == "transcript", + ) + .one() + ) + tx_exon_count = len(tx_es.exons_se_i()) + aln_exon_count = len(es.exons_se_i.split(";")) + if tx_exon_count == aln_exon_count: + n, o = _upsert_exon_set_record( + session, es.tx_ac, es.alt_ac, es.strand, es.method, es.exons_se_i + ) + session.commit() + else: + raise ExonStructureMismatchError( + "Exon structure mismatch: {tx_exon_count} exons in transcript {es.tx_ac}; {aln_exon_count} in alignment {es.alt_ac}".format( + tx_exon_count=tx_exon_count, + aln_exon_count=aln_exon_count, + es=es, + ) + ) except IntegrityError as e: logger.exception(e) session.rollback() n_errors += 1 - finally: + except NoResultFound as e: + logger.exception(e) + logger.warning("NoResultFound for transcript ExonSet: {es.tx_ac}".format(es=es)) + skipped = True + except ExonStructureMismatchError as e: + logger.exception(e) + skipped = True + else: (no) = (n is not None, o is not None) if no == (True, False): n_new += 1 @@ -272,19 +377,30 @@ def load_exonset(session, opts, cf): n_deprecated += 1 elif no == (False, True): n_unchanged += 1 - + finally: + if skipped: + n_skipped += 1 if i_es % update_period == 0 or i_es + 1 == n_rows: - logger.info("{i_es}/{n_rows} {p:.1f}%; {n_new} new, {n_unchanged} unchanged, {n_deprecated} deprecated, {n_errors} n_errors".format( - i_es=i_es, n_rows=n_rows, - 
n_new=n_new, n_unchanged=n_unchanged, n_deprecated=n_deprecated, n_errors=n_errors, - p=(i_es + 1) / n_rows * 100)) + logger.info( + "{i_es}/{n_rows} {p:.1f}%; {n_new} new, {n_unchanged} unchanged, {n_deprecated} deprecated, {n_skipped} skipped, {n_errors} n_errors".format( + i_es=i_es, + n_rows=n_rows, + n_new=n_new, + n_unchanged=n_unchanged, + n_deprecated=n_deprecated, + n_skipped=n_skipped, + n_errors=n_errors, + p=(i_es + 1) / n_rows * 100, + ) + ) + session.commit() def load_geneinfo(session, opts, cf): - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) gir = ufgi.GeneInfoReader(gzip.open(opts["FILE"], 'rt')) logger.info("opened " + opts["FILE"]) @@ -292,40 +408,17 @@ def load_geneinfo(session, opts, cf): for i_gi, gi in enumerate(gir): session.merge( usam.Gene( + gene_id=gi.gene_id, hgnc=gi.hgnc, + symbol=gi.gene_symbol, maploc=gi.maploc, descr=gi.descr, summary=gi.summary, aliases=gi.aliases, + type=gi.type, + xrefs=gi.xrefs, )) - logger.info("Added {gi.hgnc} ({gi.summary})".format(gi=gi)) - session.commit() - - -def load_ncbi_geneinfo(session, opts, cf): - """ - import data as downloaded (by you) from - ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz - """ - - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) - - gip = uta.parsers.geneinfo.GeneInfoParser(gzip.open(opts["FILE"], 'rt')) - for gi in gip: - if gi["tax_id"] != "9606" or gi["Symbol_from_nomenclature_authority"] == "-": - continue - g = usam.Gene( - gene_id=gi["GeneID"], - hgnc=gi["Symbol_from_nomenclature_authority"], - maploc=gi["map_location"], - descr=gi["Full_name_from_nomenclature_authority"], - aliases=gi["Synonyms"], - strand=gi[""], - ) - session.add(g) - logger.info("loaded gene {g.hgnc} ({g.descr})".format(g=g)) + logger.debug("Added {gi.gene_symbol}: {gi.gene_id} ({gi.summary})".format(gi=gi)) session.commit() @@ -362,9 +455,9 @@ def _seqgene_recs_to_tx_info(ac, assy, recs): return ti - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) o_refseq = session.query(usam.Origin).filter( usam.Origin.name == "NCBI RefSeq").one() @@ -404,9 +497,9 @@ def load_origin(session, opts, cf): def _none_if_empty(s): return None if s == "" else s - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) orir = csv.DictReader(open(opts["FILE"]), delimiter='\t') for rec in orir: @@ -441,9 +534,9 @@ def load_seqinfo(session, opts, cf): max_len = int(2e6) - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + 
usam.schema_name)) n_rows = len(gzip.open(opts["FILE"]).readlines()) - 1 @@ -474,7 +567,7 @@ def _upsert_seq(si): for md5, si_iter in itertools.groupby(sorted(sir, key=lambda si: si.md5), key=lambda si: si.md5): sis = list(si_iter) - + # if sequence doesn't exist in sequence table, make it # this is to satisfy a FK dependency, which should be reconsidered si = sis[0] @@ -502,6 +595,7 @@ def _upsert_seq(si): session.merge(u_seqanno) else: # create the new annotation + logger.debug("creating seq_anno({si.origin},{si.ac},{si.md5})".format(si=si)) u_seqanno = usam.SeqAnno(origin_id=u_ori.origin_id, seq_id=si.md5, ac=si.ac, descr=si.descr) session.add(u_seqanno) @@ -512,6 +606,7 @@ def _upsert_seq(si): logger.info("{n_created} annotations created/{i_md5} sequences seen ({p:.1f}%)/{n_rows} sequences total".format( n_created=n_created, i_md5=i_md5, n_rows=n_rows, md5=md5, p=i_md5 / n_rows * 100)) session.commit() + session.commit() def load_sequences(session, opts, cf): @@ -521,9 +616,9 @@ def load_sequences(session, opts, cf): # 2e6 was chosen empirically based on sizes of NMs, NGs, NWs, NTs, NCs max_len = int(2e6) - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) sf = _get_seqfetcher(cf) @@ -567,9 +662,9 @@ def _fetch_first(acs): def load_sql(session, opts, cf): """Create views""" - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) for fn in opts["FILES"]: logger.info("loading " + fn) @@ -596,9 +691,9 @@ def _fetch_origin_by_name(name): tir = ufti.TxInfoReader(gzip.open(opts["FILE"], 'rt')) logger.info("opened " + opts["FILE"]) - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) n_new = 0 n_unchanged = 0 @@ -612,8 +707,10 @@ def _fetch_origin_by_name(name): if ti.cds_se_i: cds_start_i, cds_end_i = map(int, ti.cds_se_i.split(",")) + codon_table = ti.codon_table else: cds_start_i = cds_end_i = None + codon_table = None cds_md5 = None # 1. Fetch or make the Transcript record @@ -633,6 +730,29 @@ def _fetch_origin_by_name(name): u_tx = None n_cds_changed += 1 + if ti.transl_except: + # if the transl_except exists, make sure it exists in the database. 
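+            # transl_except is a semicolon-joined string, e.g. (pos:333..335,aa:Sec);(pos:1017,aa:TERM)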
+ te_list = _create_translation_exceptions( + transcript=ti.ac, transl_except_list=ti.transl_except.split(";") + ) + for te_data in te_list: + te, created = _get_or_insert( + session=session, + table=usam.TranslationException, + row=te_data, + row_identifier=("tx_ac", "start_position", "end_position", "amino_acid"), + ) + if created: + logger.info( + f"TranslationException added: {te.tx_ac}, {te.start_position}, {te.end_position}, {te.amino_acid}" + ) + else: + logger.info( + f"TranslationException already exists: {te.tx_ac}, {te.start_position}, {te.end_position}, {te.amino_acid}" + ) + + + # state: u_tx is set if a transcript was found and was # unchanged, or None if 1) no such was found or 2) was found # and had updated CDS coords. @@ -654,17 +774,24 @@ def _fetch_origin_by_name(name): u_tx = usam.Transcript( ac=ti.ac, origin=ori, - hgnc=ti.hgnc, + gene_id=ti.gene_id, cds_start_i=cds_start_i, cds_end_i=cds_end_i, cds_md5=cds_md5, + codon_table=codon_table, ) session.add(u_tx) - if u_tx.hgnc != ti.hgnc: - logger.warn("{ti.ac}: HGNC symbol changed from {u_tx.hgnc} to {ti.hgnc}".format( - u_tx=u_tx, ti=ti)) - u_tx.hgnc = ti.hgnc + if ti.transl_except: + # if transl_except exists, it looks like this: + # (pos:333..335,aa:Sec);(pos:1017,aa:TERM) + transl_except_list = ti.transl_except.split(';') + te_list = _create_translation_exceptions(transcript=ti.ac, transl_except_list=transl_except_list) + for te in te_list: + session.add(usam.TranslationException(**te)) + + if u_tx.gene_id != ti.gene_id: + logger.warning("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format(u_tx=u_tx, ti=ti)) # state: transcript now exists, either existing or freshly-created @@ -675,7 +802,7 @@ def _fetch_origin_by_name(name): if no == (True, False): n_new += 1 elif no == (True, True): - logger.warn("Transcript {ti.ac} exon structure changed".format(ti=ti)) + logger.warning("Transcript {ti.ac} exon structure changed".format(ti=ti)) n_exons_changed += 1 elif no == (False, True): logger.debug("Transcript {ti.ac} exon structure unchanged".format(ti=ti)) @@ -688,14 +815,46 @@ def _fetch_origin_by_name(name): i_ti=i_ti, n_rows=n_rows, n_new=n_new, n_unchanged=n_unchanged, n_cds_changed=n_cds_changed, n_exons_changed=n_exons_changed, p=(i_ti + 1) / n_rows * 100)) - +def _create_translation_exceptions(transcript: str, transl_except_list: List[str]) -> List[Dict]: + """ + Create TranslationException object data where start and end positions are 0-based, from transl_except data that is 1-based. + For example, [(pos:333..335,aa:Sec), (pos:1017,aa:TERM)] should result in start and end positions [(332, 335), (1016, 1017)] + """ + result = [] + + for te in transl_except_list: + # remove parens + te = te.replace('(','').replace(')','') + + # extract positions + pos_str, aa_str = te.split(',') + pos_str = pos_str.removeprefix('pos:') + if '..' 
in pos_str:
+                start_position, _, end_position = pos_str.partition('..')
+            else:
+                start_position = end_position = pos_str
+
+            # extract amino acid
+            amino_acid = aa_str.removeprefix('aa:')
+
+            result.append(
+                {
+                    'tx_ac': transcript,
+                    'start_position': int(start_position) - 1,
+                    'end_position': int(end_position),
+                    'amino_acid': amino_acid,
+                }
+            )
+
+    return result
+
+
 def refresh_matviews(session, opts, cf):
-    session.execute("set role {admin_role};".format(
-        admin_role=cf.get("uta", "admin_role")))
-    session.execute("set search_path = " + usam.schema_name)
+    session.execute(text("set role {admin_role};".format(
+        admin_role=cf.get("uta", "admin_role"))))
+    session.execute(text("set search_path = " + usam.schema_name))
 
     # matviews must be updated in dependency order.  Unfortunately,
     # it's difficult to determine this programmatically.  The "right"
@@ -714,13 +873,12 @@ def refresh_matviews(session, opts, cf):
         "refresh materialized view exon_set_exons_fp_mv",
         "refresh materialized view tx_exon_set_summary_mv",
         "refresh materialized view tx_def_summary_mv",
-        # "refresh materialized view tx_aln_cigar_mv",
-        # "refresh materialized view tx_aln_summary_mv",
+        "refresh materialized view tx_exon_aln_mv",
     ]
 
     for cmd in cmds:
         logger.info(cmd)
-        session.execute(cmd)
+        session.execute(text(cmd))
     session.commit()
@@ -740,6 +898,34 @@ def _get_seqrepo(cf):
 _get_seqfetcher = _get_seqrepo
 
 
+def _get_or_insert(
+    session: Session,
+    table: type[usam.Base],
+    row: dict[str, Any],
+    row_identifier: str | tuple[str, ...],
+) -> tuple[usam.Base, bool]:
+    """
+    Returns a sqlalchemy model of the inserted or fetched row.
+
+    `session` is a sqlalchemy session.
+    `table` is the database table in which to insert `row`.
+    `row` is a dict of column name/value pairs to insert into the table.
+    `row_identifier` is a column name or tuple of column names whose values in `row` identify a matching existing row in the table.
+
+    sqlalchemy.orm.exc.MultipleResultsFound may be raised if `row_identifier` does not uniquely identify a row.
+    KeyError may be raised if `row_identifier` refers to columns not present as keys in `row`.
+ sqlalchemy.exc.IntegrityError (raised from psycopg2.errors.ForeignKeyViolation) may be raised if a foreign key reference does not exist + """ + row_filter = {ri: row[ri] for ri in row_identifier} + try: + row_instance = session.query(table).filter_by(**row_filter).one() + created = False + except NoResultFound: + row_instance = table(**row) + session.add(row_instance) + created = True + return row_instance, created + def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess): @@ -747,9 +933,9 @@ def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess): returns tuple of (new_record, old_record) as follows: (new, None) -- no prior record; new was inserted - (None, old) -- prior record and unchaged; nothing was inserted + (None, old) -- prior record and unchanged; nothing was inserted (new, old) -- prior record existed and was changed - + """ key = (tx_ac, alt_ac, method) @@ -781,9 +967,29 @@ def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess): usam.ExonSet.alt_aln_method == alt_aln_method_with_hash, ) if existing.count() == 1: + logger.warning( + "Exon set {tx_ac}/{alt_ac} with method {method} already exists with hash {esh}".format( + tx_ac=tx_ac, + alt_ac=alt_ac, + method=method, + esh=alt_aln_method_with_hash, + ) + ) return (None, existing[0]) # update aln_method to add a unique exon set hash based on the *existing* exon set string + logger.warning( + "Exon set {tx_ac}/{alt_ac} with method {method} already exists, but with different exons; " + "existing exon set: {es_ess}; new exon set: {ess}; updated alt_aln_method of exonset to " + "{alt_aln_method_with_hash}".format( + tx_ac=tx_ac, + alt_ac=alt_ac, + method=method, + es_ess=es_ess, + ess=ess, + alt_aln_method_with_hash=alt_aln_method_with_hash, + ) + ) es.alt_aln_method = alt_aln_method_with_hash session.flush() old_es = es diff --git a/src/uta/models.py b/src/uta/models.py index 2305666..a8ec1dd 100644 --- a/src/uta/models.py +++ b/src/uta/models.py @@ -6,21 +6,22 @@ import sqlalchemy as sa import sqlalchemy.orm as sao +import sqlalchemy.types +import sqlalchemy.sql.functions from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.dialects import postgresql ############################################################################ # schema name support # also see etc/uta.conf -schema_version = "1.1" +schema_version = "1.2" use_schema = True if use_schema: - schema_name = "uta_" + schema_version.replace(".","_") - schema_name_dot = schema_name + "." 
+ schema_name = "uta" else: schema_name = None - schema_name_dot = "" ############################################################################ @@ -97,13 +98,17 @@ class Gene(Base): __tablename__ = "gene" # columns: - hgnc = sa.Column(sa.Text, primary_key=True) + gene_id = sa.Column(sa.Text, primary_key=True) + hgnc = sa.Column(sa.Text, nullable=False, index=True) + symbol = sa.Column(sa.Text, nullable=False, index=True) maploc = sa.Column(sa.Text) descr = sa.Column(sa.Text) summary = sa.Column(sa.Text) aliases = sa.Column(sa.Text) added = sa.Column( sa.DateTime, nullable=False, default=datetime.datetime.now()) + type = sa.Column(sa.Text) + xrefs = sa.Column(sa.Text) # methods: @@ -123,17 +128,43 @@ class Transcript(Base): ac = sa.Column(sa.Text, primary_key=True) origin_id = sa.Column( sa.Integer, sa.ForeignKey("origin.origin_id", onupdate="CASCADE", ondelete="CASCADE"), nullable=False, index=True) - hgnc = sa.Column(sa.Text) # , sa.ForeignKey("gene.hgnc")) - cds_start_i = sa.Column(sa.Integer) #, nullable=False) - cds_end_i = sa.Column(sa.Integer) #, nullable=False) + gene_id = sa.Column(sa.Text, sa.ForeignKey("gene.gene_id"), nullable=False, index=True) + hgnc = sa.Column(sa.Text, nullable=True, index=True) + cds_start_i = sa.Column(sa.Integer) + cds_end_i = sa.Column(sa.Integer) cds_md5 = sa.Column(sa.Text, index=True) added = sa.Column( sa.DateTime, default=datetime.datetime.now(), nullable=False) + codon_table = sa.Column(sa.Text, nullable=True, server_default='1') # 1 = standard, 2 = mitochondrial # relationships: origin = sao.relationship("Origin", backref="transcripts") +class TranslationException(Base): + """ + Represents `transl_except` annotations on CDS features in transcript records from NCBI. + + Examples: + /transl_except=(pos:333..335,aa:Sec) + /transl_except=(pos:1017,aa:TERM) + """ + + __tablename__ = "translation_exception" + __table_args__ = ( + sa.CheckConstraint("start_position <= end_position", "start_less_than_or_equal_to_end"), + ) + + translation_exception_id = sa.Column(sa.Integer, autoincrement=True, primary_key=True) + tx_ac = sa.Column(sa.Text, sa.ForeignKey("transcript.ac", onupdate="CASCADE", ondelete="CASCADE"), nullable=False) + start_position = sa.Column(sa.Integer, nullable=False) + end_position = sa.Column(sa.Integer, nullable=False) + amino_acid = sa.Column(sa.Text, nullable=False) + + # relationships: + transcript = sao.relationship("Transcript", backref="translation_exceptions") + + class ExonSet(Base): __tablename__ = "exon_set" __table_args__ = ( @@ -208,8 +239,8 @@ class ExonAln(Base): cigar = sa.Column(sa.Text, nullable=False) added = sa.Column( sa.DateTime, default=datetime.datetime.now(), nullable=False) - tx_aseq = sa.Column(sa.Text, nullable=False) - alt_aseq = sa.Column(sa.Text, nullable=False) + tx_aseq = sa.Column(sa.Text, nullable=True) + alt_aseq = sa.Column(sa.Text, nullable=True) # relationships: tx_exon = sao.relationship( @@ -220,6 +251,27 @@ class ExonAln(Base): # methods: +class AssociatedAccessions(Base): + __tablename__ = "associated_accessions" + __table_args__ = ( + sa.Index("unique_pair_in_origin", "origin", "tx_ac", "pro_ac", unique=True), + sa.Index("associated_accessions_pro_ac", "pro_ac"), + sa.Index("associated_accessions_tx_ac", "tx_ac"), + {"comment": "transcript-protein accession pairs associated in source databases"}, + ) + + # columns: + associated_accession_id = sa.Column(sa.Integer, primary_key=True, autoincrement=True) + tx_ac = sa.Column(sa.Text, nullable=False) + pro_ac = sa.Column(sa.Text, nullable=False) + 
origin = sa.Column(sa.Text, nullable=False) + added = sa.Column( + postgresql.TIMESTAMP(timezone=True), + server_default=sqlalchemy.sql.functions.now(), + nullable=False, + ) + + # # Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta) ## diff --git a/src/uta/parsers/seqrecord.py b/src/uta/parsers/seqrecord.py new file mode 100644 index 0000000..9168b42 --- /dev/null +++ b/src/uta/parsers/seqrecord.py @@ -0,0 +1,189 @@ +from collections import defaultdict +from functools import cached_property +from typing import List, Optional + +import Bio.SeqRecord +from Bio.SeqFeature import SeqFeature + + +class SeqRecordFeatureError(Exception): + """Raised when SeqRecord does not have the expected features.""" + + +class SeqRecordFacade: + def __init__(self, seqrecord: Bio.SeqRecord.SeqRecord): + self._sr = seqrecord + + @cached_property + def features_by_type(self) -> dict[str, list]: + result = defaultdict(list) + for feat in self._sr.features: + result[feat.type].append(feat) + return result + + @cached_property + def cds_feature(self) -> Optional[SeqFeature]: + """ + Returns the CDS feature for any coding transcript, None for any non-coding transcript. + Some NCBI records will contain multiple CDS features. In these one CDS describes a protein + with accession and protein sequence, the other CDS features describes a pseudogene. This method + will preferentially choose the CDS feature with a protein sequence. + Example: + CDS 422..778 + /gene="C6orf119" + /gene_synonym="dJ427A4.2" + /codon_start=1 + /product="chromosome 6 open reading frame 119" + /protein_id="NP_001012240.1" + /db_xref="GI:59276067" + /db_xref="GeneID:353267" + /translation="MTDTAEAVPNFEEMFASRFTENDKEYQEYLKRPPESPPIVEEWN + SRAGGNQRNRGNRLQDNRQFRGRDNRWGWPSDNRSNQWHGRSWGNNYPQHRQEPYYPQ + QYGHYGYNQRPPYGYY" + CDS 422..775 + /locus_tag="RP3-427A4.2-001" + /note="match: proteins: Q9BTL3 Q9CQY2 Q9CWI1" + /pseudo + /codon_start=1 + /product="Novel pseudogene" + """ + cds_features = self.features_by_type.get("CDS") + if cds_features is None: + return None + else: + # Prefer CDS with protein accession and translated sequence. 
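+            # keep only CDS features that carry both protein_id and translation qualifiers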
+ translated_cds_features = [ + f + for f in cds_features + if all([key in f.qualifiers for key in ("protein_id", "translation")]) + ] + if len(translated_cds_features) != 1: + raise SeqRecordFeatureError("Expected one `CDS` feature at most") + return translated_cds_features[0] + + @cached_property + def gene_feature(self) -> SeqFeature: + """Returns the gene feature, which should exist for all transcripts.""" + gene_features = self.features_by_type.get("gene") + if gene_features is None or len(gene_features) != 1: + raise SeqRecordFeatureError(f"Expected exactly one `gene` feature, for {self.id} " + f"found {len(gene_features) if gene_features is not None else None}") + + return gene_features[0] + + @property + def id(self): + return self._sr.id + + @property + def gene_symbol(self): + return self.gene_feature.qualifiers["gene"][0] + + @property + def gene_synonyms(self): + if "gene_synonym" in self.gene_feature.qualifiers: + return [gs.strip() for gs in self.gene_feature.qualifiers["gene_synonym"][0].split(";")] + else: + return [] + + @property + def gene_type(self): + if self.cds_feature: + return "protein-coding" + elif "ncRNA" in self.features_by_type: + return "ncRNA" + elif "pseudo" in self.features_by_type: + return "pseudo" + elif "rRNA" in self.features_by_type: + return "rRNA" + elif "snoRNA" in self.features_by_type: + return "snoRNA" + elif "tRNA" in self.features_by_type: + return "tRNA" + elif "scRNA" in self.features_by_type: + return "scRNA" + elif "snRNA" in self.features_by_type: + return "snRNA" + elif "misc_RNA" in self.features_by_type: + return "misc_RNA" + elif "other" in self.features_by_type: + return "other" + else: + return "unknown" + + @property + def gene_id(self): + # db_xref="GeneID:1234" + db_xrefs = self.gene_feature.qualifiers["db_xref"] + gene_ids = [x.partition(":")[2] for x in db_xrefs if x.startswith("GeneID:")] + assert len(gene_ids) == 1 + return gene_ids[0] + + @property + def db_xrefs(self): + """ + gene 1..4577 + /gene="A2M" + /gene_synonym="DKFZp779B086; FWP007; S863-7" + /db_xref="GeneID:2" + /db_xref="HPRD:00072" + /db_xref="MIM:103950" + """ + db_xrefs = self.gene_feature.qualifiers["db_xref"] + return [xref for xref in db_xrefs] + + @property + def cds_se_i(self): + if self.cds_feature is not None: + return self.cds_feature.location.start.real, self.cds_feature.location.end.real + else: + return None + + @property + def cds_product(self): + if self.cds_feature is not None: + return self.cds_feature.qualifiers["product"][0] + else: + return None + + @property + def cds_protein_id(self): + if self.cds_feature is not None: + return self.cds_feature.qualifiers["protein_id"][0] + else: + return None + + @property + def cds_translation(self): + if self.cds_feature is not None: + return str(self.cds_feature.qualifiers["translation"][0]) + else: + return None + + @property + def exons_se_i(self): + se_i = [] + if "exon" in self.features_by_type: + exons = self.features_by_type["exon"] + se_i = [(f.location.start.real, f.location.end.real) for f in exons] + return se_i + + @property + def codon_table(self) -> Optional[str]: + if self.cds_feature is None: + return None + else: + # default codon table is the standard table, aka "1" + # https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi + return "1" + + @property + def transl_except(self) -> Optional[List[str]]: + if self.cds_feature is None: + return None + else: + return self.cds_feature.qualifiers.get("transl_except") + + @property + def feature_seq(self): + return str(self._sr.seq) diff 
--git a/src/uta/tools/eutils.py b/src/uta/tools/eutils.py new file mode 100644 index 0000000..7abfaa4 --- /dev/null +++ b/src/uta/tools/eutils.py @@ -0,0 +1,35 @@ +from enum import Enum + +import requests + +from uta import EutilsDownloadError + + +class NcbiFileFormatEnum(str, Enum): + FASTA = "fasta" + GENBANK = "gb" + + +def download_from_eutils(accession: str, file_format: NcbiFileFormatEnum, output_file: str) -> None: + """ + Download a file from NCBI using the eutils endpoint. + Args: + - accession: NCBI accession ID + - file_format: File format to download ("fasta" or "gb") + - output_file: Path to the file where the downloaded content will be saved + """ + + base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" + params = { + "db": "nuccore", + "id": accession, + "retmode": "text", + "rettype": file_format + } + response = requests.get(base_url, params=params) + + if response.status_code == 200: + with open(output_file, 'w') as file: + file.write(response.text) + else: + raise EutilsDownloadError(f"Failed to download {file_format} file for {accession}. HTTP status code: {response.status_code}") \ No newline at end of file diff --git a/src/uta/tools/file_utils.py b/src/uta/tools/file_utils.py new file mode 100644 index 0000000..78d8d3d --- /dev/null +++ b/src/uta/tools/file_utils.py @@ -0,0 +1,12 @@ +import gzip +from contextlib import contextmanager + + +@contextmanager +def open_file(filename): + if filename.endswith(".gz"): + with gzip.open(filename, "rt") as f: + yield f + else: + with open(filename) as f: + yield f diff --git a/tests/data/NC_012920.1.gbff b/tests/data/NC_012920.1.gbff new file mode 100644 index 0000000..afea7f3 --- /dev/null +++ b/tests/data/NC_012920.1.gbff @@ -0,0 +1,1165 @@ +LOCUS NC_012920 16569 bp DNA circular PRI 03-APR-2023 +DEFINITION Homo sapiens mitochondrion, complete genome. +ACCESSION NC_012920 AC_000021 +VERSION NC_012920.1 +DBLINK BioProject: PRJNA927338 +KEYWORDS RefSeq. +SOURCE mitochondrion Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +REFERENCE 1 (bases 324 to 743) + AUTHORS Andrews,R.M., Kubacka,I., Chinnery,P.F., Lightowlers,R.N., + Turnbull,D.M. and Howell,N. + TITLE Reanalysis and revision of the Cambridge reference sequence for + human mitochondrial DNA + JOURNAL Nat. Genet. 23 (2), 147 (1999) + PUBMED 10508508 +REFERENCE 2 (bases 15888 to 15954) + AUTHORS Anderson,S., Bankier,A.T., Barrell,B.G., de Bruijn,M.H., + Coulson,A.R., Drouin,J., Eperon,I.C., Nierlich,D.P., Roe,B.A., + Sanger,F., Schreier,P.H., Smith,A.J., Staden,R. and Young,I.G. + TITLE Sequence and organization of the human mitochondrial genome + JOURNAL Nature 290 (5806), 457-465 (1981) + PUBMED 7219534 +REFERENCE 3 (bases 1 to 16569) + CONSRTM NCBI Genome Project + TITLE Direct Submission + JOURNAL Submitted (08-JUL-2009) National Center for Biotechnology + Information, NIH, Bethesda, MD 20894, USA +REFERENCE 4 (bases 1 to 16569) + AUTHORS Kogelnik,A.M. and Lott,M.T. + TITLE Direct Submission + JOURNAL Submitted (24-AUG-2006) Mitomap.org, Center for Molecular and + Mitochondrial Medicine and Genetics (MAMMAG) University of + California, University of California, Irvine, Irvine, CA + 92697-3940, USA + REMARK Sequence update by submitter +REFERENCE 5 (bases 1 to 16569) + AUTHORS Kogelnik,A.M. and Lott,M.T. 
+ TITLE Direct Submission + JOURNAL Submitted (18-APR-1997) Center for Molecular Medicine, Emory + University School of Medicine, 1462 Clifton Road, Suite 420, + Atlanta, GA 30322, USA + REMARK sequence updated +COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final + NCBI review. The reference sequence was derived from J01415. + + On Jul 8, 2009 this sequence version replaced AC_000021.2. + This sequence is a corrected version of the HUMMTCG reference + sequence. The original Cambridge reference sequence (CRS) is + preserved as GenBank J01415 gi:337188 [PMID:7219534]. Corrections + have been made and annotated per the re-sequencing of the original + material by Andrews et al [PMID:10508508]. + + This Revised Cambridge Reference Sequence (rCRS) has eighteen + specific corrections or confirmations of the original 1981 sequence + of Anderson et al [PMID:7219534]. Seven nucleotides are confirmed + as rare polymorphisms, maintained as: 263A, 311C-315C, 750A, 1438A, + 4769A, 8860A, and 15326A. Eleven nucleotides are error + corrections: 3107del, 3423T, 4985A, 9559C, 11335C, 13702C, 14199T, + 14272C, 14365C, 14368C, and 14766C. These 11 errors in the + original Cambridge sequence were determined to be either outright + sequencing errors (8 instances) or due to the presence of bovine + DNA (2 instances) or HeLa DNA (1 instance) mixed in with the + original human placental DNA [PMID:10508508]. HISTORICAL + NUCLEOTIDE NUMBERS ARE MAINTAINED by indicating 3107del as 'N'. + A summary table of the reanalysis data is available online at + http://www.mitomap.org/MITOMAP/CambridgeReanalysis + + L-strand is shown. + COMPLETENESS: full length. +FEATURES Location/Qualifiers + source 1..16569 + /organism="Homo sapiens" + /organelle="mitochondrion" + /mol_type="genomic DNA" + /isolation_source="caucasian" + /db_xref="taxon:9606" + /tissue_type="placenta" + /country="United Kingdom: Great Britain" + /note="this is the rCRS" + D-loop complement(join(16024..16569,1..576)) + gene 577..647 + /gene="TRNF" + /nomenclature="Official Symbol: MT-TF | Name: + mitochondrially encoded tRNA phenylalanine | Provided by: + HGNC:HGNC:7481" + /db_xref="GeneID:4558" + /db_xref="HGNC:HGNC:7481" + /db_xref="MIM:590070" + tRNA 577..647 + /gene="TRNF" + /product="tRNA-Phe" + /note="NAR: 1455" + /anticodon=(pos:611..613,aa:Phe,seq:gaa) + /codon_recognized="UUC" + /db_xref="GeneID:4558" + /db_xref="HGNC:HGNC:7481" + /db_xref="MIM:590070" + gene 648..1601 + /gene="RNR1" + /gene_synonym="MTRNR1" + /nomenclature="Official Symbol: MT-RNR1 | Name: + mitochondrially encoded 12S RNA | Provided by: + HGNC:HGNC:7470" + /db_xref="GeneID:4549" + /db_xref="HGNC:HGNC:7470" + /db_xref="MIM:561000" + rRNA 648..1601 + /gene="RNR1" + /gene_synonym="MTRNR1" + /product="s-rRNA" + /note="12S rRNA; 12S ribosomal RNA" + /db_xref="GeneID:4549" + /db_xref="HGNC:HGNC:7470" + /db_xref="MIM:561000" + gene 1602..1670 + /gene="TRNV" + /gene_synonym="MTTV" + /nomenclature="Official Symbol: MT-TV | Name: + mitochondrially encoded tRNA valine | Provided by: + HGNC:HGNC:7500" + /db_xref="GeneID:4577" + /db_xref="HGNC:HGNC:7500" + /db_xref="MIM:590105" + tRNA 1602..1670 + /gene="TRNV" + /gene_synonym="MTTV" + /product="tRNA-Val" + /note="NAR: 2053" + /anticodon=(pos:1633..1635,aa:Val,seq:tac) + /codon_recognized="GUA" + /db_xref="GeneID:4577" + /db_xref="HGNC:HGNC:7500" + /db_xref="MIM:590105" + gene 1671..3229 + /gene="RNR2" + /gene_synonym="MTRNR2" + /nomenclature="Official Symbol: MT-RNR2 | Name: + mitochondrially encoded 16S RNA | Provided by: 
+ HGNC:HGNC:7471" + /db_xref="GeneID:4550" + /db_xref="HGNC:HGNC:7471" + /db_xref="MIM:561010" + rRNA 1671..3229 + /gene="RNR2" + /gene_synonym="MTRNR2" + /product="l-rRNA" + /note="16S ribosomal RNA; 16S rRNA" + /db_xref="GeneID:4550" + /db_xref="HGNC:HGNC:7471" + /db_xref="MIM:561010" + misc_feature 3107 + /note="preserves historical genome annotation numbering" + gene 3230..3304 + /gene="TRNL1" + /gene_synonym="MTTL1" + /nomenclature="Official Symbol: MT-TL1 | Name: + mitochondrially encoded tRNA leucine 1 (UUA/G) | Provided + by: HGNC:HGNC:7490" + /db_xref="GeneID:4567" + /db_xref="HGNC:HGNC:7490" + /db_xref="MIM:590050" + tRNA 3230..3304 + /gene="TRNL1" + /gene_synonym="MTTL1" + /product="tRNA-Leu" + /note="NAR: 1054" + /anticodon=(pos:3265..3267,aa:Leu,seq:taa) + /codon_recognized="UUR" + /db_xref="GeneID:4567" + /db_xref="HGNC:HGNC:7490" + /db_xref="MIM:590050" + gene 3307..4262 + /gene="ND1" + /gene_synonym="MTND1" + /nomenclature="Official Symbol: MT-ND1 | Name: + mitochondrially encoded NADH dehydrogenase 1 | Provided + by: HGNC:HGNC:7455" + /db_xref="GeneID:4535" + /db_xref="HGNC:HGNC:7455" + /db_xref="MIM:516000" + CDS 3307..4262 + /gene="ND1" + /gene_synonym="MTND1" + /note="NADH dehydrogenase, subunit 1 (complex I); TAA stop + codon is completed by the addition of 3' A residues to the + mRNA" + /codon_start=1 + /transl_except=(pos:4261..4262,aa:TERM) + /transl_table=2 + /product="NADH dehydrogenase subunit 1" + /protein_id="YP_003024026.1" + /db_xref="GeneID:4535" + /db_xref="HGNC:HGNC:7455" + /db_xref="MIM:516000" + /translation="MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYG + LLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPNPLVNLNLGL + LFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYEVTLAIILLSTLLMSGSF + NLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTPFDLAEGESELVSGFNIEYAAG + PFALFFMAEYTNIIMMNTLTTTIFLGTTYDALSPELYTTYFVTKTLLLTSLFLWIRTA + YPRFRYDQLMHLLWKNFLPLTLALLMWYVSMPITISSIPPQT" + gene 4263..4331 + /gene="TRNI" + /gene_synonym="MTTI" + /nomenclature="Official Symbol: MT-TI | Name: + mitochondrially encoded tRNA isoleucine | Provided by: + HGNC:HGNC:7488" + /db_xref="GeneID:4565" + /db_xref="HGNC:HGNC:7488" + /db_xref="MIM:590045" + tRNA 4263..4331 + /gene="TRNI" + /gene_synonym="MTTI" + /product="tRNA-Ile" + /note="NAR: 0997" + /anticodon=(pos:4292..4294,aa:Ile,seq:gat) + /codon_recognized="AUC" + /db_xref="GeneID:4565" + /db_xref="HGNC:HGNC:7488" + /db_xref="MIM:590045" + gene complement(4329..4400) + /gene="TRNQ" + /gene_synonym="MTTQ" + /nomenclature="Official Symbol: MT-TQ | Name: + mitochondrially encoded tRNA glutamine | Provided by: + HGNC:HGNC:7495" + /db_xref="GeneID:4572" + /db_xref="HGNC:HGNC:7495" + /db_xref="MIM:590030" + tRNA complement(4329..4400) + /gene="TRNQ" + /gene_synonym="MTTQ" + /product="tRNA-Gln" + /note="NAR: 0597" + /anticodon=(pos:complement(4365..4367),aa:Gln,seq:ttg) + /codon_recognized="CAA" + /db_xref="GeneID:4572" + /db_xref="HGNC:HGNC:7495" + /db_xref="MIM:590030" + gene 4402..4469 + /gene="TRNM" + /gene_synonym="MTTM" + /nomenclature="Official Symbol: MT-TM | Name: + mitochondrially encoded tRNA methionine | Provided by: + HGNC:HGNC:7492" + /db_xref="GeneID:4569" + /db_xref="HGNC:HGNC:7492" + /db_xref="MIM:590065" + tRNA 4402..4469 + /gene="TRNM" + /gene_synonym="MTTM" + /product="tRNA-Met" + /note="NAR: 1297" + /anticodon=(pos:4432..4434,aa:Met,seq:cat) + /codon_recognized="AUG" + /db_xref="GeneID:4569" + /db_xref="HGNC:HGNC:7492" + /db_xref="MIM:590065" + gene 4470..5511 + /gene="ND2" + /gene_synonym="MTND2" + /nomenclature="Official Symbol: MT-ND2 | Name: + 
mitochondrially encoded NADH dehydrogenase 2 | Provided + by: HGNC:HGNC:7456" + /db_xref="GeneID:4536" + /db_xref="HGNC:HGNC:7456" + /db_xref="MIM:516001" + CDS 4470..5511 + /gene="ND2" + /gene_synonym="MTND2" + /note="TAA stop codon is completed by the addition of 3' A + residues to the mRNA" + /codon_start=1 + /transl_except=(pos:5511,aa:TERM) + /transl_table=2 + /product="NADH dehydrogenase subunit 2" + /protein_id="YP_003024027.1" + /db_xref="GeneID:4536" + /db_xref="HGNC:HGNC:7456" + /db_xref="MIM:516001" + /translation="MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVL + TKKMNPRSTEAAIKYFLTQATASMILLMAILFNNMLSGQWTMTNTTNQYSSLMIMMAM + AMKLGMAPFHFWVPEVTQGTPLTSGLLLLTWQKLAPISIMYQISPSLNVSLLLTLSIL + SIMAGSWGGLNQTQLRKILAYSSITHMGWMMAVLPYNPNMTILNLTIYIILTTTAFLL + LNLNSSTTTLLLSRTWNKLTWLTPLIPSTLLSLGGLPPLTGFLPKWAIIEEFTKNNSL + IIPTIMATITLLNLYFYLRLIYSTSITLLPMSNNVKMKWQFEHTKPTPFLPTLIALTT + LLLPISPFMLMIL" + gene 5512..5579 + /gene="TRNW" + /gene_synonym="MTTW" + /nomenclature="Official Symbol: MT-TW | Name: + mitochondrially encoded tRNA tryptophan | Provided by: + HGNC:HGNC:7501" + /db_xref="GeneID:4578" + /db_xref="HGNC:HGNC:7501" + /db_xref="MIM:590095" + tRNA 5512..5579 + /gene="TRNW" + /gene_synonym="MTTW" + /product="tRNA-Trp" + /note="NAR: 1897" + /anticodon=(pos:5544..5546,aa:Trp,seq:tca) + /codon_recognized="UGA" + /db_xref="GeneID:4578" + /db_xref="HGNC:HGNC:7501" + /db_xref="MIM:590095" + gene complement(5587..5655) + /gene="TRNA" + /gene_synonym="MTTA" + /nomenclature="Official Symbol: MT-TA | Name: + mitochondrially encoded tRNA alanine | Provided by: + HGNC:HGNC:7475" + /db_xref="GeneID:4553" + /db_xref="HGNC:HGNC:7475" + /db_xref="MIM:590000" + tRNA complement(5587..5655) + /gene="TRNA" + /gene_synonym="MTTA" + /product="tRNA-Ala" + /note="NAR: 0097" + /anticodon=(pos:complement(5623..5625),aa:Ala,seq:tgc) + /codon_recognized="GCA" + /db_xref="GeneID:4553" + /db_xref="HGNC:HGNC:7475" + /db_xref="MIM:590000" + gene complement(5657..5729) + /gene="TRNN" + /gene_synonym="MTTN" + /nomenclature="Official Symbol: MT-TN | Name: + mitochondrially encoded tRNA asparagine | Provided by: + HGNC:HGNC:7493" + /db_xref="GeneID:4570" + /db_xref="HGNC:HGNC:7493" + /db_xref="MIM:590010" + tRNA complement(5657..5729) + /gene="TRNN" + /gene_synonym="MTTN" + /product="tRNA-Asn" + /note="NAR: 0297" + /anticodon=(pos:complement(5694..5696),aa:Asn,seq:gtt) + /codon_recognized="AAC" + /db_xref="GeneID:4570" + /db_xref="HGNC:HGNC:7493" + /db_xref="MIM:590010" + gene complement(5761..5826) + /gene="TRNC" + /gene_synonym="MTTC" + /nomenclature="Official Symbol: MT-TC | Name: + mitochondrially encoded tRNA cysteine | Provided by: + HGNC:HGNC:7477" + /db_xref="GeneID:4511" + /db_xref="HGNC:HGNC:7477" + /db_xref="MIM:590020" + tRNA complement(5761..5826) + /gene="TRNC" + /gene_synonym="MTTC" + /product="tRNA-Cys" + /note="NAR: 0497" + /anticodon=(pos:complement(5796..5798),aa:Cys,seq:gca) + /codon_recognized="UGC" + /db_xref="GeneID:4511" + /db_xref="HGNC:HGNC:7477" + /db_xref="MIM:590020" + gene complement(5826..5891) + /gene="TRNY" + /gene_synonym="MTTY" + /nomenclature="Official Symbol: MT-TY | Name: + mitochondrially encoded tRNA tyrosine | Provided by: + HGNC:HGNC:7502" + /db_xref="GeneID:4579" + /db_xref="HGNC:HGNC:7502" + /db_xref="MIM:590100" + tRNA complement(5826..5891) + /gene="TRNY" + /gene_synonym="MTTY" + /product="tRNA-Tyr" + /note="NAR: 1997" + /anticodon=(pos:complement(5860..5862),aa:Tyr,seq:gta) + /codon_recognized="UAC" + /db_xref="GeneID:4579" + /db_xref="HGNC:HGNC:7502" + /db_xref="MIM:590100" + gene 
5904..7445 + /gene="COX1" + /gene_synonym="COI; MTCO1" + /nomenclature="Official Symbol: MT-CO1 | Name: + mitochondrially encoded cytochrome c oxidase I | Provided + by: HGNC:HGNC:7419" + /db_xref="GeneID:4512" + /db_xref="HGNC:HGNC:7419" + /db_xref="MIM:516030" + CDS 5904..7445 + /gene="COX1" + /gene_synonym="COI; MTCO1" + /note="cytochrome c oxidase I" + /codon_start=1 + /transl_table=2 + /product="cytochrome c oxidase subunit I" + /protein_id="YP_003024028.1" + /db_xref="GeneID:4512" + /db_xref="HGNC:HGNC:7419" + /db_xref="MIM:516030" + /translation="MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQP + GNLLGNDHIYNVIVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSF + WLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPGASVDLTIFSLHLAGVSSILG + AINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTT + FFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWA + MMSIGFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKW + SAAVLWALGFIFLFTVGGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGG + FIHWFPLFSGYTLDQTYAKIHFTIMFIGVNLTFFPQHFLGLSGMPRRYSDYPDAYTTW + NILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSMNLEWLYGCPPPYHTFEEP + VYMKS" + gene complement(7446..7514) + /gene="TRNS1" + /gene_synonym="MTTS1" + /nomenclature="Official Symbol: MT-TS1 | Name: + mitochondrially encoded tRNA serine 1 (UCN)" + /db_xref="GeneID:4574" + /db_xref="HGNC:HGNC:7497" + /db_xref="MIM:590080" + tRNA complement(7446..7514) + /gene="TRNS1" + /gene_synonym="MTTS1" + /product="tRNA-Ser" + /note="NAR: 1697" + /anticodon=(pos:complement(7482..7484),aa:Ser,seq:tga) + /codon_recognized="UCN" + /db_xref="GeneID:4574" + /db_xref="HGNC:HGNC:7497" + /db_xref="MIM:590080" + gene 7518..7585 + /gene="TRND" + /gene_synonym="MTTD" + /nomenclature="Official Symbol: MT-TD | Name: + mitochondrially encoded tRNA aspartic acid | Provided by: + HGNC:HGNC:7478" + /db_xref="GeneID:4555" + /db_xref="HGNC:HGNC:7478" + /db_xref="MIM:590015" + tRNA 7518..7585 + /gene="TRND" + /gene_synonym="MTTD" + /product="tRNA-Asp" + /note="NAR: 0397" + /anticodon=(pos:7548..7550,aa:Asp,seq:gtc) + /codon_recognized="GAC" + /db_xref="GeneID:4555" + /db_xref="HGNC:HGNC:7478" + /db_xref="MIM:590015" + gene 7586..8269 + /gene="COX2" + /gene_synonym="COII; MTCO2" + /nomenclature="Official Symbol: MT-CO2 | Name: + mitochondrially encoded cytochrome c oxidase II | Provided + by: HGNC:HGNC:7421" + /db_xref="GeneID:4513" + /db_xref="HGNC:HGNC:7421" + /db_xref="MIM:516040" + CDS 7586..8269 + /gene="COX2" + /gene_synonym="COII; MTCO2" + /note="cytochrome c oxidase II" + /codon_start=1 + /transl_table=2 + /product="cytochrome c oxidase subunit II" + /protein_id="YP_003024029.1" + /db_xref="GeneID:4513" + /db_xref="HGNC:HGNC:7421" + /db_xref="MIM:516040" + /translation="MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFL + TLTTKLTNTNISDAQEMETVWTILPAIILVLIALPSLRILYMTDEVNDPSLTIKSIGH + QWYWTYEYTDYGGLIFNSYMLPPLFLEPGDLRLLDVDNRVVLPIEAPIRMMITSQDVL + HSWAVPTLGLKTDAIPGRLNQTTFTATRPGVYYGQCSEICGANHSFMPIVLELIPLKI + FEMGPVFTL" + gene 8295..8364 + /gene="TRNK" + /gene_synonym="MTTK" + /nomenclature="Official Symbol: MT-TK | Name: + mitochondrially encoded tRNA lysine | Provided by: + HGNC:HGNC:7489" + /db_xref="GeneID:4566" + /db_xref="HGNC:HGNC:7489" + /db_xref="MIM:590060" + tRNA 8295..8364 + /gene="TRNK" + /gene_synonym="MTTK" + /product="tRNA-Lys" + /note="NAR: 1197" + /anticodon=(pos:8323..8325,aa:Lys,seq:ttt) + /codon_recognized="AAA" + /db_xref="GeneID:4566" + /db_xref="HGNC:HGNC:7489" + /db_xref="MIM:590060" + gene 8366..8572 + /gene="ATP8" + /gene_synonym="ATPase8; MTATP8" + /nomenclature="Official Symbol: MT-ATP8 | Name: + 
mitochondrially encoded ATP synthase 8 | Provided by: + HGNC:HGNC:7415" + /db_xref="GeneID:4509" + /db_xref="HGNC:HGNC:7415" + /db_xref="MIM:516070" + CDS 8366..8572 + /gene="ATP8" + /gene_synonym="ATPase8; MTATP8" + /note="ATP synthase 8; ATPase subunit 8" + /codon_start=1 + /transl_table=2 + /product="ATP synthase F0 subunit 8" + /protein_id="YP_003024030.1" + /db_xref="GeneID:4509" + /db_xref="HGNC:HGNC:7415" + /db_xref="MIM:516070" + /translation="MPQLNTTVWPTMITPMLLTLFLITQLKMLNTNYHLPPSPKPMKM + KNYNKPWEPKWTKICSLHSLPPQS" + gene 8527..9207 + /gene="ATP6" + /gene_synonym="ATPase6; MTATP6" + /nomenclature="Official Symbol: MT-ATP6 | Name: + mitochondrially encoded ATP synthase 6 | Provided by: + HGNC:HGNC:7414" + /db_xref="GeneID:4508" + /db_xref="HGNC:HGNC:7414" + /db_xref="MIM:516060" + CDS 8527..9207 + /gene="ATP6" + /gene_synonym="ATPase6; MTATP6" + /note="ATP synthase 6; ATPase subunit 6" + /codon_start=1 + /transl_table=2 + /product="ATP synthase F0 subunit 6" + /protein_id="YP_003024031.1" + /db_xref="GeneID:4508" + /db_xref="HGNC:HGNC:7414" + /db_xref="MIM:516060" + /translation="MNENLFASFIAPTILGLPAAVLIILFPPLLIPTSKYLINNRLIT + TQQWLIKLTSKQMMTMHNTKGRTWSLMLVSLIIFIATTNLLGLLPHSFTPTTQLSMNL + AMAIPLWAGTVIMGFRSKIKNALAHFLPQGTPTPLIPMLVIIETISLLIQPMALAVRL + TANITAGHLLMHLIGSATLAMSTINLPSTLIIFTILILLTILEIAVALIQAYVFTLLV + SLYLHDNT" + gene 9207..9990 + /gene="COX3" + /gene_synonym="COIII; MTCO3" + /nomenclature="Official Symbol: MT-CO3 | Name: + mitochondrially encoded cytochrome c oxidase III | + Provided by: HGNC:HGNC:7422" + /db_xref="GeneID:4514" + /db_xref="HGNC:HGNC:7422" + /db_xref="MIM:516050" + CDS 9207..9990 + /gene="COX3" + /gene_synonym="COIII; MTCO3" + /note="cytochrome c oxidase III; TAA stop codon is + completed by the addition of 3' A residues to the mRNA" + /codon_start=1 + /transl_except=(pos:9990,aa:TERM) + /transl_table=2 + /product="cytochrome c oxidase subunit III" + /protein_id="YP_003024032.1" + /db_xref="GeneID:4514" + /db_xref="HGNC:HGNC:7422" + /db_xref="MIM:516050" + /translation="MTHQSHAYHMVKPSPWPLTGALSALLMTSGLAMWFHFHSMTLLM + LGLLTNTLTMYQWWRDVTRESTYQGHHTPPVQKGLRYGMILFITSEVFFFAGFFWAFY + HSSLAPTPQLGGHWPPTGITPLNPLEVPLLNTSVLLASGVSITWAHHSLMENNRNQMI + QALLITILLGLYFTLLQASEYFESPFTISDGIYGSTFFVATGFHGLHVIIGSTFLTIC + FIRQLMFHFTSKHHFGFEAAAWYWHFVDVVWLFLYVSIYWWGS" + gene 9991..10058 + /gene="TRNG" + /gene_synonym="MTTG" + /nomenclature="Official Symbol: MT-TG | Name: + mitochondrially encoded tRNA glycine | Provided by: + HGNC:HGNC:7486" + /db_xref="GeneID:4563" + /db_xref="HGNC:HGNC:7486" + /db_xref="MIM:590035" + tRNA 9991..10058 + /gene="TRNG" + /gene_synonym="MTTG" + /product="tRNA-Gly" + /note="NAR: 0797" + /anticodon=(pos:10021..10023,aa:Gly,seq:tcc) + /codon_recognized="GGA" + /db_xref="GeneID:4563" + /db_xref="HGNC:HGNC:7486" + /db_xref="MIM:590035" + gene 10059..10404 + /gene="ND3" + /gene_synonym="MTND3" + /nomenclature="Official Symbol: MT-ND3 | Name: + mitochondrially encoded NADH dehydrogenase 3 | Provided + by: HGNC:HGNC:7458" + /db_xref="GeneID:4537" + /db_xref="HGNC:HGNC:7458" + /db_xref="MIM:516002" + CDS 10059..10404 + /gene="ND3" + /gene_synonym="MTND3" + /note="NADH dehydrogenase, subunit 3 (complex I); TAA stop + codon is completed by the addition of 3' A residues to the + mRNA" + /codon_start=1 + /transl_except=(pos:10404,aa:TERM) + /transl_table=2 + /product="NADH dehydrogenase subunit 3" + /protein_id="YP_003024033.1" + /db_xref="GeneID:4537" + /db_xref="HGNC:HGNC:7458" + /db_xref="MIM:516002" + /translation="MNFALILMINTLLALLLMIITFWLPQLNGYMEKSTPYECGFDPM + 
SPARVPFSMKFFLVAITFLLFDLEIALLLPLPWALQTTNLPLMVMSSLLLIIILALSL + AYEWLQKGLDWTE" + gene 10405..10469 + /gene="TRNR" + /gene_synonym="MTTR" + /nomenclature="Official Symbol: MT-TR | Name: + mitochondrially encoded tRNA arginine | Provided by: + HGNC:HGNC:7496" + /db_xref="GeneID:4573" + /db_xref="HGNC:HGNC:7496" + /db_xref="MIM:590005" + tRNA 10405..10469 + /gene="TRNR" + /gene_synonym="MTTR" + /product="tRNA-Arg" + /note="NAR: 0197" + /anticodon=(pos:10435..10437,aa:Arg,seq:tcg) + /codon_recognized="CGA" + /db_xref="GeneID:4573" + /db_xref="HGNC:HGNC:7496" + /db_xref="MIM:590005" + gene 10470..10766 + /gene="ND4L" + /gene_synonym="MTND4L" + /nomenclature="Official Symbol: MT-ND4L | Name: + mitochondrially encoded NADH 4L dehydrogenase | Provided + by: HGNC:HGNC:7460" + /db_xref="GeneID:4539" + /db_xref="HGNC:HGNC:7460" + /db_xref="MIM:516004" + CDS 10470..10766 + /gene="ND4L" + /gene_synonym="MTND4L" + /note="NADH dehydrogenase, subunit 4L (complex I)" + /codon_start=1 + /transl_table=2 + /product="NADH dehydrogenase subunit 4L" + /protein_id="YP_003024034.1" + /db_xref="GeneID:4539" + /db_xref="HGNC:HGNC:7460" + /db_xref="MIM:516004" + /translation="MPLIYMNIMLAFTISLLGMLVYRSHLMSSLLCLEGMMLSLFIMA + TLMTLNTHSLLANIVPIAMLVFAACEAAVGLALLVSISNTYGLDYVHNLNLLQC" + gene 10760..12137 + /gene="ND4" + /gene_synonym="MTND4" + /nomenclature="Official Symbol: MT-ND4 | Name: + mitochondrially encoded NADH dehydrogenase 4 | Provided + by: HGNC:HGNC:7459" + /db_xref="GeneID:4538" + /db_xref="HGNC:HGNC:7459" + /db_xref="MIM:516003" + CDS 10760..12137 + /gene="ND4" + /gene_synonym="MTND4" + /note="NADH dehydrogenase, subunit 4 (complex I); TAA stop + codon is completed by the addition of 3' A residues to the + mRNA" + /codon_start=1 + /transl_except=(pos:12137,aa:TERM) + /transl_table=2 + /product="NADH dehydrogenase subunit 4" + /protein_id="YP_003024035.1" + /db_xref="GeneID:4538" + /db_xref="HGNC:HGNC:7459" + /db_xref="MIM:516003" + /translation="MLKLIVPTIMLLPLTWLSKKHMIWINTTTHSLIISIIPLLFFNQ + INNNLFSCSPTFSSDPLTTPLLMLTTWLLPLTIMASQRHLSSEPLSRKKLYLSMLISL + QISLIMTFTATELIMFYIFFETTLIPTLAIITRWGNQPERLNAGTYFLFYTLVGSLPL + LIALIYTHNTLGSLNILLLTLTAQELSNSWANNLMWLAYTMAFMVKMPLYGLHLWLPK + AHVEAPIAGSMVLAAVLLKLGGYGMMRLTLILNPLTKHMAYPFLVLSLWGMIMTSSIC + LRQTDLKSLIAYSSISHMALVVTAILIQTPWSFTGAVILMIAHGLTSSLLFCLANSNY + ERTHSRIMILSQGLQTLLPLMAFWWLLASLANLALPPTINLLGELSVLVTTFSWSNIT + LLLTGLNMLVTALYSLYMFTTTQWGSLTHHINNMKPSFTRENTLMFMHLSPILLLSLN + PDIITGFSS" + gene 12138..12206 + /gene="TRNH" + /gene_synonym="MTTH" + /nomenclature="Official Symbol: MT-TH | Name: + mitochondrially encoded tRNA histidine | Provided by: + HGNC:HGNC:7487" + /db_xref="GeneID:4564" + /db_xref="HGNC:HGNC:7487" + /db_xref="MIM:590040" + tRNA 12138..12206 + /gene="TRNH" + /gene_synonym="MTTH" + /product="tRNA-His" + /note="NAR: 0897" + /anticodon=(pos:12168..12170,aa:His,seq:gtg) + /codon_recognized="CAC" + /db_xref="GeneID:4564" + /db_xref="HGNC:HGNC:7487" + /db_xref="MIM:590040" + gene 12207..12265 + /gene="TRNS2" + /gene_synonym="MTTS2" + /nomenclature="Official Symbol: MT-TS2 | Name: + mitochondrially encoded tRNA serine 2 (AGU/C) | Provided + by: HGNC:HGNC:7498" + /db_xref="GeneID:4575" + /db_xref="HGNC:HGNC:7498" + /db_xref="MIM:590085" + tRNA 12207..12265 + /gene="TRNS2" + /gene_synonym="MTTS2" + /product="tRNA-Ser" + /note="NAR: 1656" + /anticodon=(pos:12226..12228,aa:Ser,seq:gct) + /codon_recognized="AGY" + /db_xref="GeneID:4575" + /db_xref="HGNC:HGNC:7498" + /db_xref="MIM:590085" + gene 12266..12336 + /gene="TRNL2" + /gene_synonym="MTTL2" + /nomenclature="Official 
Symbol: MT-TL2 | Name: + mitochondrially encoded tRNA leucine 2 (CUN) | Provided + by: HGNC:HGNC:7491" + /db_xref="GeneID:4568" + /db_xref="HGNC:HGNC:7491" + /db_xref="MIM:590055" + tRNA 12266..12336 + /gene="TRNL2" + /gene_synonym="MTTL2" + /product="tRNA-Leu" + /note="NAR: 1097" + /anticodon=(pos:12298..12300,aa:Leu,seq:tag) + /codon_recognized="CUN" + /db_xref="GeneID:4568" + /db_xref="HGNC:HGNC:7491" + /db_xref="MIM:590055" + gene 12337..14148 + /gene="ND5" + /gene_synonym="MTND5" + /nomenclature="Official Symbol: MT-ND5 | Name: + mitochondrially encoded NADH dehydrogenase 5 | Provided + by: HGNC:HGNC:7461" + /db_xref="GeneID:4540" + /db_xref="HGNC:HGNC:7461" + /db_xref="MIM:516005" + CDS 12337..14148 + /gene="ND5" + /gene_synonym="MTND5" + /note="NADH dehydrogenase, subunit 5 (complex I)" + /codon_start=1 + /transl_table=2 + /product="NADH dehydrogenase subunit 5" + /protein_id="YP_003024036.1" + /db_xref="GeneID:4540" + /db_xref="HGNC:HGNC:7461" + /db_xref="MIM:516005" + /translation="MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTF + IISLFPTTMFMCLDQEVIISNWHWATTQTTQLSLSFKLDYFSMMFIPVALFVTWSIME + FSLWYMNSDPNINQFFKYLLIFLITMLILVTANNLFQLFIGWEGVGIMSFLLISWWYA + RADANTAAIQAILYNRIGDIGFILALAWFILHSNSWDPQQMALLNANPSLTPLLGLLL + AAAGKSAQLGLHPWLPSAMEGPTPVSALLHSSTMVVAGIFLLIRFHPLAENSPLIQTL + TLCLGAITTLFAAVCALTQNDIKKIVAFSTSSQLGLMMVTIGINQPHLAFLHICTHAF + FKAMLFMCSGSIIHNLNNEQDIRKMGGLLKTMPLTSTSLTIGSLALAGMPFLTGFYSK + DHIIETANMSYTNAWALSITLIATSLTSAYSTRMILLTLTGQPRFPTLTNINENNPTL + LNPIKRLAAGSLFAGFLITNNISPASPFQTTIPLYLKLTALAVTFLGLLTALDLNYLT + NKLKMKSPLCTFYFSNMLGFYPSITHRTIPYLGLLTSQNLPLLLLDLTWLEKLLPKTI + SQHQISTSIITSTQKGMIKLYFLSFFFPLILTLLLIT" + gene complement(14149..14673) + /gene="ND6" + /gene_synonym="MTND6" + /nomenclature="Official Symbol: MT-ND6 | Name: + mitochondrially encoded NADH dehydrogenase 6 | Provided + by: HGNC:HGNC:7462" + /db_xref="GeneID:4541" + /db_xref="HGNC:HGNC:7462" + /db_xref="MIM:516006" + CDS complement(14149..14673) + /gene="ND6" + /gene_synonym="MTND6" + /note="NADH dehydrogenase, subunit 6 (complex I)" + /codon_start=1 + /transl_table=2 + /product="NADH dehydrogenase subunit 6" + /protein_id="YP_003024037.1" + /db_xref="GeneID:4541" + /db_xref="HGNC:HGNC:7462" + /db_xref="MIM:516006" + /translation="MMYALFLLSVGLVMGFVGFSSKPSPIYGGLVLIVSGVVGCVIIL + NFGGGYMGLMVFLIYLGGMMVVFGYTTAMAIEEYPEAWGSGVEVLVSVLVGLAMEVGL + VLWVKEYDGVVVVVNFNSVGSWMIYEGEGSGLIREDPIGAGALYDYGRWLVVVTGWTL + FVGVYIVIEIARGN" + gene complement(14674..14742) + /gene="TRNE" + /gene_synonym="MTTE" + /nomenclature="Official Symbol: MT-TE | Name: + mitochondrially encoded tRNA glutamic acid | Provided by: + HGNC:HGNC:7479" + /db_xref="GeneID:4556" + /db_xref="HGNC:HGNC:7479" + /db_xref="MIM:590025" + tRNA complement(14674..14742) + /gene="TRNE" + /gene_synonym="MTTE" + /product="tRNA-Glu" + /note="NAR: 0697" + /anticodon=(pos:complement(14710..14712),aa:Glu,seq:ttc) + /codon_recognized="GAA" + /db_xref="GeneID:4556" + /db_xref="HGNC:HGNC:7479" + /db_xref="MIM:590025" + gene 14747..15887 + /gene="CYTB" + /gene_synonym="MTCYB" + /nomenclature="Official Symbol: MT-CYB | Name: + mitochondrially encoded cytochrome b | Provided by: + HGNC:HGNC:7427" + /db_xref="GeneID:4519" + /db_xref="HGNC:HGNC:7427" + /db_xref="MIM:516020" + CDS 14747..15887 + /gene="CYTB" + /gene_synonym="MTCYB" + /note="TAA stop codon is completed by the addition of 3' A + residues to the mRNA" + /codon_start=1 + /transl_except=(pos:15887,aa:TERM) + /transl_table=2 + /product="cytochrome b" + /protein_id="YP_003024038.1" + /db_xref="GeneID:4519" + /db_xref="HGNC:HGNC:7427" + 
/db_xref="MIM:516020" + /translation="MTPMRKTNPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQ + ITTGLFLAMHYSPDASTAFSSIAHITRDVNYGWIIRYLHANGASMFFICLFLHIGRGL + YYGSFLYSETWNIGIILLLATMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTDL + VQWIWGGYSVDSPTLTRFFTFHFILPFIIAALATLHLLFLHETGSNNPLGITSHSDKI + TFHPYYTIKDALGLLLFLLSLMTLTLFSPDLLGDPDNYTLANPLNTPPHIKPEWYFLF + AYTILRSVPNKLGGVLALLLSILILAMIPILHMSKQQSMMFRPLSQSLYWLLAADLLI + LTWIGGQPVSYPFTIIGQVASVLYFTTILILMPTISLIENKMLKWA" + gene 15888..15953 + /gene="TRNT" + /gene_synonym="MTTT" + /nomenclature="Official Symbol: MT-TT | Name: + mitochondrially encoded tRNA threonine | Provided by: + HGNC:HGNC:7499" + /db_xref="GeneID:4576" + /db_xref="HGNC:HGNC:7499" + /db_xref="MIM:590090" + tRNA 15888..15953 + /gene="TRNT" + /gene_synonym="MTTT" + /product="tRNA-Thr" + /note="NAR: 1797" + /anticodon=(pos:15919..15921,aa:Thr,seq:tgt) + /codon_recognized="ACA" + /db_xref="GeneID:4576" + /db_xref="HGNC:HGNC:7499" + /db_xref="MIM:590090" + gene complement(15956..16023) + /gene="TRNP" + /gene_synonym="MTTP" + /nomenclature="Official Symbol: MT-TP | Name: + mitochondrially encoded tRNA proline | Provided by: + HGNC:HGNC:7494" + /db_xref="GeneID:4571" + /db_xref="HGNC:HGNC:7494" + /db_xref="MIM:590075" + tRNA complement(15956..16023) + /gene="TRNP" + /gene_synonym="MTTP" + /product="tRNA-Pro" + /note="NAR: 1597" + /anticodon=(pos:complement(15990..15992),aa:Pro,seq:tgg) + /codon_recognized="CCA" + /db_xref="GeneID:4571" + /db_xref="HGNC:HGNC:7494" + /db_xref="MIM:590075" +ORIGIN + 1 gatcacaggt ctatcaccct attaaccact cacgggagct ctccatgcat ttggtatttt + 61 cgtctggggg gtatgcacgc gatagcattg cgagacgctg gagccggagc accctatgtc + 121 gcagtatctg tctttgattc ctgcctcatc ctattattta tcgcacctac gttcaatatt + 181 acaggcgaac atacttacta aagtgtgtta attaattaat gcttgtagga cataataata + 241 acaattgaat gtctgcacag ccactttcca cacagacatc ataacaaaaa atttccacca + 301 aaccccccct cccccgcttc tggccacagc acttaaacac atctctgcca aaccccaaaa + 361 acaaagaacc ctaacaccag cctaaccaga tttcaaattt tatcttttgg cggtatgcac + 421 ttttaacagt caccccccaa ctaacacatt attttcccct cccactccca tactactaat + 481 ctcatcaata caacccccgc ccatcctacc cagcacacac acaccgctgc taaccccata + 541 ccccgaacca accaaacccc aaagacaccc cccacagttt atgtagctta cctcctcaaa + 601 gcaatacact gaaaatgttt agacgggctc acatcacccc ataaacaaat aggtttggtc + 661 ctagcctttc tattagctct tagtaagatt acacatgcaa gcatccccgt tccagtgagt + 721 tcaccctcta aatcaccacg atcaaaagga acaagcatca agcacgcagc aatgcagctc + 781 aaaacgctta gcctagccac acccccacgg gaaacagcag tgattaacct ttagcaataa + 841 acgaaagttt aactaagcta tactaacccc agggttggtc aatttcgtgc cagccaccgc + 901 ggtcacacga ttaacccaag tcaatagaag ccggcgtaaa gagtgtttta gatcaccccc + 961 tccccaataa agctaaaact cacctgagtt gtaaaaaact ccagttgaca caaaatagac + 1021 tacgaaagtg gctttaacat atctgaacac acaatagcta agacccaaac tgggattaga + 1081 taccccacta tgcttagccc taaacctcaa cagttaaatc aacaaaactg ctcgccagaa + 1141 cactacgagc cacagcttaa aactcaaagg acctggcggt gcttcatatc cctctagagg + 1201 agcctgttct gtaatcgata aaccccgatc aacctcacca cctcttgctc agcctatata + 1261 ccgccatctt cagcaaaccc tgatgaaggc tacaaagtaa gcgcaagtac ccacgtaaag + 1321 acgttaggtc aaggtgtagc ccatgaggtg gcaagaaatg ggctacattt tctaccccag + 1381 aaaactacga tagcccttat gaaacttaag ggtcgaaggt ggatttagca gtaaactaag + 1441 agtagagtgc ttagttgaac agggccctga agcgcgtaca caccgcccgt caccctcctc + 1501 aagtatactt caaaggacat ttaactaaaa cccctacgca tttatataga ggagacaagt + 1561 cgtaacatgg taagtgtact ggaaagtgca cttggacgaa ccagagtgta gcttaacaca + 1621 aagcacccaa cttacactta ggagatttca acttaacttg accgctctga gctaaaccta + 1681 gccccaaacc cactccacct 
tactaccaga caaccttagc caaaccattt acccaaataa + 1741 agtataggcg atagaaattg aaacctggcg caatagatat agtaccgcaa gggaaagatg + 1801 aaaaattata accaagcata atatagcaag gactaacccc tataccttct gcataatgaa + 1861 ttaactagaa ataactttgc aaggagagcc aaagctaaga cccccgaaac cagacgagct + 1921 acctaagaac agctaaaaga gcacacccgt ctatgtagca aaatagtggg aagatttata + 1981 ggtagaggcg acaaacctac cgagcctggt gatagctggt tgtccaagat agaatcttag + 2041 ttcaacttta aatttgccca cagaaccctc taaatcccct tgtaaattta actgttagtc + 2101 caaagaggaa cagctctttg gacactagga aaaaaccttg tagagagagt aaaaaattta + 2161 acacccatag taggcctaaa agcagccacc aattaagaaa gcgttcaagc tcaacaccca + 2221 ctacctaaaa aatcccaaac atataactga actcctcaca cccaattgga ccaatctatc + 2281 accctataga agaactaatg ttagtataag taacatgaaa acattctcct ccgcataagc + 2341 ctgcgtcaga ttaaaacact gaactgacaa ttaacagccc aatatctaca atcaaccaac + 2401 aagtcattat taccctcact gtcaacccaa cacaggcatg ctcataagga aaggttaaaa + 2461 aaagtaaaag gaactcggca aatcttaccc cgcctgttta ccaaaaacat cacctctagc + 2521 atcaccagta ttagaggcac cgcctgccca gtgacacatg tttaacggcc gcggtaccct + 2581 aaccgtgcaa aggtagcata atcacttgtt ccttaaatag ggacctgtat gaatggctcc + 2641 acgagggttc agctgtctct tacttttaac cagtgaaatt gacctgcccg tgaagaggcg + 2701 ggcataacac agcaagacga gaagacccta tggagcttta atttattaat gcaaacagta + 2761 cctaacaaac ccacaggtcc taaactacca aacctgcatt aaaaatttcg gttggggcga + 2821 cctcggagca gaacccaacc tccgagcagt acatgctaag acttcaccag tcaaagcgaa + 2881 ctactatact caattgatcc aataacttga ccaacggaac aagttaccct agggataaca + 2941 gcgcaatcct attctagagt ccatatcaac aatagggttt acgacctcga tgttggatca + 3001 ggacatcccg atggtgcagc cgctattaaa ggttcgtttg ttcaacgatt aaagtcctac + 3061 gtgatctgag ttcagaccgg agtaatccag gtcggtttct atctacnttc aaattcctcc + 3121 ctgtacgaaa ggacaagaga aataaggcct acttcacaaa gcgccttccc ccgtaaatga + 3181 tatcatctca acttagtatt atacccacac ccacccaaga acagggtttg ttaagatggc + 3241 agagcccggt aatcgcataa aacttaaaac tttacagtca gaggttcaat tcctcttctt + 3301 aacaacatac ccatggccaa cctcctactc ctcattgtac ccattctaat cgcaatggca + 3361 ttcctaatgc ttaccgaacg aaaaattcta ggctatatac aactacgcaa aggccccaac + 3421 gttgtaggcc cctacgggct actacaaccc ttcgctgacg ccataaaact cttcaccaaa + 3481 gagcccctaa aacccgccac atctaccatc accctctaca tcaccgcccc gaccttagct + 3541 ctcaccatcg ctcttctact atgaaccccc ctccccatac ccaaccccct ggtcaacctc + 3601 aacctaggcc tcctatttat tctagccacc tctagcctag ccgtttactc aatcctctga + 3661 tcagggtgag catcaaactc aaactacgcc ctgatcggcg cactgcgagc agtagcccaa + 3721 acaatctcat atgaagtcac cctagccatc attctactat caacattact aataagtggc + 3781 tcctttaacc tctccaccct tatcacaaca caagaacacc tctgattact cctgccatca + 3841 tgacccttgg ccataatatg atttatctcc acactagcag agaccaaccg aacccccttc + 3901 gaccttgccg aaggggagtc cgaactagtc tcaggcttca acatcgaata cgccgcaggc + 3961 cccttcgccc tattcttcat agccgaatac acaaacatta ttataataaa caccctcacc + 4021 actacaatct tcctaggaac aacatatgac gcactctccc ctgaactcta cacaacatat + 4081 tttgtcacca agaccctact tctaacctcc ctgttcttat gaattcgaac agcatacccc + 4141 cgattccgct acgaccaact catacacctc ctatgaaaaa acttcctacc actcacccta + 4201 gcattactta tatgatatgt ctccataccc attacaatct ccagcattcc ccctcaaacc + 4261 taagaaatat gtctgataaa agagttactt tgatagagta aataatagga gcttaaaccc + 4321 ccttatttct aggactatga gaatcgaacc catccctgag aatccaaaat tctccgtgcc + 4381 acctatcaca ccccatccta aagtaaggtc agctaaataa gctatcgggc ccataccccg + 4441 aaaatgttgg ttataccctt cccgtactaa ttaatcccct ggcccaaccc gtcatctact + 4501 ctaccatctt tgcaggcaca ctcatcacag cgctaagctc gcactgattt tttacctgag + 4561 taggcctaga aataaacatg ctagctttta ttccagttct aaccaaaaaa ataaaccctc + 4621 
gttccacaga agctgccatc aagtatttcc tcacgcaagc aaccgcatcc ataatccttc + 4681 taatagctat cctcttcaac aatatactct ccggacaatg aaccataacc aatactacca + 4741 atcaatactc atcattaata atcataatag ctatagcaat aaaactagga atagccccct + 4801 ttcacttctg agtcccagag gttacccaag gcacccctct gacatccggc ctgcttcttc + 4861 tcacatgaca aaaactagcc cccatctcaa tcatatacca aatctctccc tcactaaacg + 4921 taagccttct cctcactctc tcaatcttat ccatcatagc aggcagttga ggtggattaa + 4981 accaaaccca gctacgcaaa atcttagcat actcctcaat tacccacata ggatgaataa + 5041 tagcagttct accgtacaac cctaacataa ccattcttaa tttaactatt tatattatcc + 5101 taactactac cgcattccta ctactcaact taaactccag caccacgacc ctactactat + 5161 ctcgcacctg aaacaagcta acatgactaa cacccttaat tccatccacc ctcctctccc + 5221 taggaggcct gcccccgcta accggctttt tgcccaaatg ggccattatc gaagaattca + 5281 caaaaaacaa tagcctcatc atccccacca tcatagccac catcaccctc cttaacctct + 5341 acttctacct acgcctaatc tactccacct caatcacact actccccata tctaacaacg + 5401 taaaaataaa atgacagttt gaacatacaa aacccacccc attcctcccc acactcatcg + 5461 cccttaccac gctactccta cctatctccc cttttatact aataatctta tagaaattta + 5521 ggttaaatac agaccaagag ccttcaaagc cctcagtaag ttgcaatact taatttctgt + 5581 aacagctaag gactgcaaaa ccccactctg catcaactga acgcaaatca gccactttaa + 5641 ttaagctaag cccttactag accaatggga cttaaaccca caaacactta gttaacagct + 5701 aagcacccta atcaactggc ttcaatctac ttctcccgcc gccgggaaaa aaggcgggag + 5761 aagccccggc aggtttgaag ctgcttcttc gaatttgcaa ttcaatatga aaatcacctc + 5821 ggagctggta aaaagaggcc taacccctgt ctttagattt acagtccaat gcttcactca + 5881 gccattttac ctcaccccca ctgatgttcg ccgaccgttg actattctct acaaaccaca + 5941 aagacattgg aacactatac ctattattcg gcgcatgagc tggagtccta ggcacagctc + 6001 taagcctcct tattcgagcc gagctgggcc agccaggcaa ccttctaggt aacgaccaca + 6061 tctacaacgt tatcgtcaca gcccatgcat ttgtaataat cttcttcata gtaataccca + 6121 tcataatcgg aggctttggc aactgactag ttcccctaat aatcggtgcc cccgatatgg + 6181 cgtttccccg cataaacaac ataagcttct gactcttacc tccctctctc ctactcctgc + 6241 tcgcatctgc tatagtggag gccggagcag gaacaggttg aacagtctac cctcccttag + 6301 cagggaacta ctcccaccct ggagcctccg tagacctaac catcttctcc ttacacctag + 6361 caggtgtctc ctctatctta ggggccatca atttcatcac aacaattatc aatataaaac + 6421 cccctgccat aacccaatac caaacgcccc tcttcgtctg atccgtccta atcacagcag + 6481 tcctacttct cctatctctc ccagtcctag ctgctggcat cactatacta ctaacagacc + 6541 gcaacctcaa caccaccttc ttcgaccccg ccggaggagg agaccccatt ctataccaac + 6601 acctattctg atttttcggt caccctgaag tttatattct tatcctacca ggcttcggaa + 6661 taatctccca tattgtaact tactactccg gaaaaaaaga accatttgga tacataggta + 6721 tggtctgagc tatgatatca attggcttcc tagggtttat cgtgtgagca caccatatat + 6781 ttacagtagg aatagacgta gacacacgag catatttcac ctccgctacc ataatcatcg + 6841 ctatccccac cggcgtcaaa gtatttagct gactcgccac actccacgga agcaatatga + 6901 aatgatctgc tgcagtgctc tgagccctag gattcatctt tcttttcacc gtaggtggcc + 6961 tgactggcat tgtattagca aactcatcac tagacatcgt actacacgac acgtactacg + 7021 ttgtagccca cttccactat gtcctatcaa taggagctgt atttgccatc ataggaggct + 7081 tcattcactg atttccccta ttctcaggct acaccctaga ccaaacctac gccaaaatcc + 7141 atttcactat catattcatc ggcgtaaatc taactttctt cccacaacac tttctcggcc + 7201 tatccggaat gccccgacgt tactcggact accccgatgc atacaccaca tgaaacatcc + 7261 tatcatctgt aggctcattc atttctctaa cagcagtaat attaataatt ttcatgattt + 7321 gagaagcctt cgcttcgaag cgaaaagtcc taatagtaga agaaccctcc ataaacctgg + 7381 agtgactata tggatgcccc ccaccctacc acacattcga agaacccgta tacataaaat + 7441 ctagacaaaa aaggaaggaa tcgaaccccc caaagctggt ttcaagccaa ccccatggcc + 7501 tccatgactt tttcaaaaag gtattagaaa aaccatttca 
taactttgtc aaagttaaat + 7561 tataggctaa atcctatata tcttaatggc acatgcagcg caagtaggtc tacaagacgc + 7621 tacttcccct atcatagaag agcttatcac ctttcatgat cacgccctca taatcatttt + 7681 ccttatctgc ttcctagtcc tgtatgccct tttcctaaca ctcacaacaa aactaactaa + 7741 tactaacatc tcagacgctc aggaaataga aaccgtctga actatcctgc ccgccatcat + 7801 cctagtcctc atcgccctcc catccctacg catcctttac ataacagacg aggtcaacga + 7861 tccctccctt accatcaaat caattggcca ccaatggtac tgaacctacg agtacaccga + 7921 ctacggcgga ctaatcttca actcctacat acttccccca ttattcctag aaccaggcga + 7981 cctgcgactc cttgacgttg acaatcgagt agtactcccg attgaagccc ccattcgtat + 8041 aataattaca tcacaagacg tcttgcactc atgagctgtc cccacattag gcttaaaaac + 8101 agatgcaatt cccggacgtc taaaccaaac cactttcacc gctacacgac cgggggtata + 8161 ctacggtcaa tgctctgaaa tctgtggagc aaaccacagt ttcatgccca tcgtcctaga + 8221 attaattccc ctaaaaatct ttgaaatagg gcccgtattt accctatagc accccctcta + 8281 ccccctctag agcccactgt aaagctaact tagcattaac cttttaagtt aaagattaag + 8341 agaaccaaca cctctttaca gtgaaatgcc ccaactaaat actaccgtat ggcccaccat + 8401 aattaccccc atactcctta cactattcct catcacccaa ctaaaaatat taaacacaaa + 8461 ctaccaccta cctccctcac caaagcccat aaaaataaaa aattataaca aaccctgaga + 8521 accaaaatga acgaaaatct gttcgcttca ttcattgccc ccacaatcct aggcctaccc + 8581 gccgcagtac tgatcattct atttccccct ctattgatcc ccacctccaa atatctcatc + 8641 aacaaccgac taatcaccac ccaacaatga ctaatcaaac taacctcaaa acaaatgata + 8701 accatacaca acactaaagg acgaacctga tctcttatac tagtatcctt aatcattttt + 8761 attgccacaa ctaacctcct cggactcctg cctcactcat ttacaccaac cacccaacta + 8821 tctataaacc tagccatggc catcccctta tgagcgggca cagtgattat aggctttcgc + 8881 tctaagatta aaaatgccct agcccacttc ttaccacaag gcacacctac accccttatc + 8941 cccatactag ttattatcga aaccatcagc ctactcattc aaccaatagc cctggccgta + 9001 cgcctaaccg ctaacattac tgcaggccac ctactcatgc acctaattgg aagcgccacc + 9061 ctagcaatat caaccattaa ccttccctct acacttatca tcttcacaat tctaattcta + 9121 ctgactatcc tagaaatcgc tgtcgcctta atccaagcct acgttttcac acttctagta + 9181 agcctctacc tgcacgacaa cacataatga cccaccaatc acatgcctat catatagtaa + 9241 aacccagccc atgaccccta acaggggccc tctcagccct cctaatgacc tccggcctag + 9301 ccatgtgatt tcacttccac tccataacgc tcctcatact aggcctacta accaacacac + 9361 taaccatata ccaatgatgg cgcgatgtaa cacgagaaag cacataccaa ggccaccaca + 9421 caccacctgt ccaaaaaggc cttcgatacg ggataatcct atttattacc tcagaagttt + 9481 ttttcttcgc aggatttttc tgagcctttt accactccag cctagcccct accccccaat + 9541 taggagggca ctggccccca acaggcatca ccccgctaaa tcccctagaa gtcccactcc + 9601 taaacacatc cgtattactc gcatcaggag tatcaatcac ctgagctcac catagtctaa + 9661 tagaaaacaa ccgaaaccaa ataattcaag cactgcttat tacaatttta ctgggtctct + 9721 attttaccct cctacaagcc tcagagtact tcgagtctcc cttcaccatt tccgacggca + 9781 tctacggctc aacatttttt gtagccacag gcttccacgg acttcacgtc attattggct + 9841 caactttcct cactatctgc ttcatccgcc aactaatatt tcactttaca tccaaacatc + 9901 actttggctt cgaagccgcc gcctgatact ggcattttgt agatgtggtt tgactatttc + 9961 tgtatgtctc catctattga tgagggtctt actcttttag tataaatagt accgttaact + 10021 tccaattaac tagttttgac aacattcaaa aaagagtaat aaacttcgcc ttaattttaa + 10081 taatcaacac cctcctagcc ttactactaa taattattac attttgacta ccacaactca + 10141 acggctacat agaaaaatcc accccttacg agtgcggctt cgaccctata tcccccgccc + 10201 gcgtcccttt ctccataaaa ttcttcttag tagctattac cttcttatta tttgatctag + 10261 aaattgccct ccttttaccc ctaccatgag ccctacaaac aactaacctg ccactaatag + 10321 ttatgtcatc cctcttatta atcatcatcc tagccctaag tctggcctat gagtgactac + 10381 aaaaaggatt agactgaacc gaattggtat atagtttaaa caaaacgaat gatttcgact + 10441 cattaaatta 
tgataatcat atttaccaaa tgcccctcat ttacataaat attatactag + 10501 catttaccat ctcacttcta ggaatactag tatatcgctc acacctcata tcctccctac + 10561 tatgcctaga aggaataata ctatcgctgt tcattatagc tactctcata accctcaaca + 10621 cccactccct cttagccaat attgtgccta ttgccatact agtctttgcc gcctgcgaag + 10681 cagcggtggg cctagcccta ctagtctcaa tctccaacac atatggccta gactacgtac + 10741 ataacctaaa cctactccaa tgctaaaact aatcgtccca acaattatat tactaccact + 10801 gacatgactt tccaaaaaac acataatttg aatcaacaca accacccaca gcctaattat + 10861 tagcatcatc cctctactat tttttaacca aatcaacaac aacctattta gctgttcccc + 10921 aaccttttcc tccgaccccc taacaacccc cctcctaata ctaactacct gactcctacc + 10981 cctcacaatc atggcaagcc aacgccactt atccagtgaa ccactatcac gaaaaaaact + 11041 ctacctctct atactaatct ccctacaaat ctccttaatt ataacattca cagccacaga + 11101 actaatcata ttttatatct tcttcgaaac cacacttatc cccaccttgg ctatcatcac + 11161 ccgatgaggc aaccagccag aacgcctgaa cgcaggcaca tacttcctat tctacaccct + 11221 agtaggctcc cttcccctac tcatcgcact aatttacact cacaacaccc taggctcact + 11281 aaacattcta ctactcactc tcactgccca agaactatca aactcctgag ccaacaactt + 11341 aatatgacta gcttacacaa tagcttttat agtaaagata cctctttacg gactccactt + 11401 atgactccct aaagcccatg tcgaagcccc catcgctggg tcaatagtac ttgccgcagt + 11461 actcttaaaa ctaggcggct atggtataat acgcctcaca ctcattctca accccctgac + 11521 aaaacacata gcctacccct tccttgtact atccctatga ggcataatta taacaagctc + 11581 catctgccta cgacaaacag acctaaaatc gctcattgca tactcttcaa tcagccacat + 11641 agccctcgta gtaacagcca ttctcatcca aaccccctga agcttcaccg gcgcagtcat + 11701 tctcataatc gcccacgggc ttacatcctc attactattc tgcctagcaa actcaaacta + 11761 cgaacgcact cacagtcgca tcataatcct ctctcaagga cttcaaactc tactcccact + 11821 aatagctttt tgatgacttc tagcaagcct cgctaacctc gccttacccc ccactattaa + 11881 cctactggga gaactctctg tgctagtaac cacgttctcc tgatcaaata tcactctcct + 11941 acttacagga ctcaacatac tagtcacagc cctatactcc ctctacatat ttaccacaac + 12001 acaatggggc tcactcaccc accacattaa caacataaaa ccctcattca cacgagaaaa + 12061 caccctcatg ttcatacacc tatcccccat tctcctccta tccctcaacc ccgacatcat + 12121 taccgggttt tcctcttgta aatatagttt aaccaaaaca tcagattgtg aatctgacaa + 12181 cagaggctta cgacccctta tttaccgaga aagctcacaa gaactgctaa ctcatgcccc + 12241 catgtctaac aacatggctt tctcaacttt taaaggataa cagctatcca ttggtcttag + 12301 gccccaaaaa ttttggtgca actccaaata aaagtaataa ccatgcacac tactataacc + 12361 accctaaccc tgacttccct aattcccccc atccttacca ccctcgttaa ccctaacaaa + 12421 aaaaactcat acccccatta tgtaaaatcc attgtcgcat ccacctttat tatcagtctc + 12481 ttccccacaa caatattcat gtgcctagac caagaagtta ttatctcgaa ctgacactga + 12541 gccacaaccc aaacaaccca gctctcccta agcttcaaac tagactactt ctccataata + 12601 ttcatccctg tagcattgtt cgttacatgg tccatcatag aattctcact gtgatatata + 12661 aactcagacc caaacattaa tcagttcttc aaatatctac tcatcttcct aattaccata + 12721 ctaatcttag ttaccgctaa caacctattc caactgttca tcggctgaga gggcgtagga + 12781 attatatcct tcttgctcat cagttgatga tacgcccgag cagatgccaa cacagcagcc + 12841 attcaagcaa tcctatacaa ccgtatcggc gatatcggtt tcatcctcgc cttagcatga + 12901 tttatcctac actccaactc atgagaccca caacaaatag cccttctaaa cgctaatcca + 12961 agcctcaccc cactactagg cctcctccta gcagcagcag gcaaatcagc ccaattaggt + 13021 ctccacccct gactcccctc agccatagaa ggccccaccc cagtctcagc cctactccac + 13081 tcaagcacta tagttgtagc aggaatcttc ttactcatcc gcttccaccc cctagcagaa + 13141 aatagcccac taatccaaac tctaacacta tgcttaggcg ctatcaccac tctgttcgca + 13201 gcagtctgcg cccttacaca aaatgacatc aaaaaaatcg tagccttctc cacttcaagt + 13261 caactaggac tcataatagt tacaatcggc atcaaccaac cacacctagc attcctgcac + 13321 atctgtaccc 
acgccttctt caaagccata ctatttatgt gctccgggtc catcatccac + 13381 aaccttaaca atgaacaaga tattcgaaaa ataggaggac tactcaaaac catacctctc + 13441 acttcaacct ccctcaccat tggcagccta gcattagcag gaataccttt cctcacaggt + 13501 ttctactcca aagaccacat catcgaaacc gcaaacatat catacacaaa cgcctgagcc + 13561 ctatctatta ctctcatcgc tacctccctg acaagcgcct atagcactcg aataattctt + 13621 ctcaccctaa caggtcaacc tcgcttcccc acccttacta acattaacga aaataacccc + 13681 accctactaa accccattaa acgcctggca gccggaagcc tattcgcagg atttctcatt + 13741 actaacaaca tttcccccgc atcccccttc caaacaacaa tccccctcta cctaaaactc + 13801 acagccctcg ctgtcacttt cctaggactt ctaacagccc tagacctcaa ctacctaacc + 13861 aacaaactta aaataaaatc cccactatgc acattttatt tctccaacat actcggattc + 13921 taccctagca tcacacaccg cacaatcccc tatctaggcc ttcttacgag ccaaaacctg + 13981 cccctactcc tcctagacct aacctgacta gaaaagctat tacctaaaac aatttcacag + 14041 caccaaatct ccacctccat catcacctca acccaaaaag gcataattaa actttacttc + 14101 ctctctttct tcttcccact catcctaacc ctactcctaa tcacataacc tattcccccg + 14161 agcaatctca attacaatat atacaccaac aaacaatgtt caaccagtaa ctactactaa + 14221 tcaacgccca taatcataca aagcccccgc accaatagga tcctcccgaa tcaaccctga + 14281 cccctctcct tcataaatta ttcagcttcc tacactatta aagtttacca caaccaccac + 14341 cccatcatac tctttcaccc acagcaccaa tcctacctcc atcgctaacc ccactaaaac + 14401 actcaccaag acctcaaccc ctgaccccca tgcctcagga tactcctcaa tagccatcgc + 14461 tgtagtatat ccaaagacaa ccatcattcc ccctaaataa attaaaaaaa ctattaaacc + 14521 catataacct cccccaaaat tcagaataat aacacacccg accacaccgc taacaatcaa + 14581 tactaaaccc ccataaatag gagaaggctt agaagaaaac cccacaaacc ccattactaa + 14641 acccacactc aacagaaaca aagcatacat cattattctc gcacggacta caaccacgac + 14701 caatgatatg aaaaaccatc gttgtatttc aactacaaga acaccaatga ccccaatacg + 14761 caaaactaac cccctaataa aattaattaa ccactcattc atcgacctcc ccaccccatc + 14821 caacatctcc gcatgatgaa acttcggctc actccttggc gcctgcctga tcctccaaat + 14881 caccacagga ctattcctag ccatgcacta ctcaccagac gcctcaaccg ccttttcatc + 14941 aatcgcccac atcactcgag acgtaaatta tggctgaatc atccgctacc ttcacgccaa + 15001 tggcgcctca atattcttta tctgcctctt cctacacatc gggcgaggcc tatattacgg + 15061 atcatttctc tactcagaaa cctgaaacat cggcattatc ctcctgcttg caactatagc + 15121 aacagccttc ataggctatg tcctcccgtg aggccaaata tcattctgag gggccacagt + 15181 aattacaaac ttactatccg ccatcccata cattgggaca gacctagttc aatgaatctg + 15241 aggaggctac tcagtagaca gtcccaccct cacacgattc tttacctttc acttcatctt + 15301 gcccttcatt attgcagccc tagcaacact ccacctccta ttcttgcacg aaacgggatc + 15361 aaacaacccc ctaggaatca cctcccattc cgataaaatc accttccacc cttactacac + 15421 aatcaaagac gccctcggct tacttctctt ccttctctcc ttaatgacat taacactatt + 15481 ctcaccagac ctcctaggcg acccagacaa ttatacccta gccaacccct taaacacccc + 15541 tccccacatc aagcccgaat gatatttcct attcgcctac acaattctcc gatccgtccc + 15601 taacaaacta ggaggcgtcc ttgccctatt actatccatc ctcatcctag caataatccc + 15661 catcctccat atatccaaac aacaaagcat aatatttcgc ccactaagcc aatcacttta + 15721 ttgactccta gccgcagacc tcctcattct aacctgaatc ggaggacaac cagtaagcta + 15781 cccttttacc atcattggac aagtagcatc cgtactatac ttcacaacaa tcctaatcct + 15841 aataccaact atctccctaa ttgaaaacaa aatactcaaa tgggcctgtc cttgtagtat + 15901 aaactaatac accagtcttg taaaccggag atgaaaacct ttttccaagg acaaatcaga + 15961 gaaaaagtct ttaactccac cattagcacc caaagctaag attctaattt aaactattct + 16021 ctgttctttc atggggaagc agatttgggt accacccaag tattgactca cccatcaaca + 16081 accgctatgt atttcgtaca ttactgccag ccaccatgaa tattgtacgg taccataaat + 16141 acttgaccac ctgtagtaca taaaaaccca atccacatca aaaccccctc cccatgctta + 16201 caagcaagta 
cagcaatcaa ccctcaacta tcacacatca actgcaactc caaagccacc + 16261 cctcacccac taggatacca acaaacctac ccacccttaa cagtacatag tacataaagc + 16321 catttaccgt acatagcaca ttacagtcaa atcccttctc gtccccatgg atgacccccc + 16381 tcagataggg gtcccttgac caccatcctc cgtgaaatca atatcccgca caagagtgct + 16441 actctcctcg ctccgggccc ataacacttg ggggtagcta aagtgaactg tatccgacat + 16501 ctggttccta cttcagggtc ataaagccta aatagcccac acgttcccct taaataagac + 16561 atcacgatg +// + diff --git a/tests/data/assocacs.gz b/tests/data/assocacs.gz new file mode 100644 index 0000000..10214c2 Binary files /dev/null and b/tests/data/assocacs.gz differ diff --git a/tests/data/exonsets.mm-exons.gz b/tests/data/exonsets.mm-exons.gz new file mode 100644 index 0000000..9cec37e Binary files /dev/null and b/tests/data/exonsets.mm-exons.gz differ diff --git a/tests/data/expected_genomic_100.exonset b/tests/data/expected_genomic_100.exonset new file mode 100644 index 0000000..172f7a0 --- /dev/null +++ b/tests/data/expected_genomic_100.exonset @@ -0,0 +1,12 @@ +tx_ac alt_ac method strand exons_se_i +NR_046018.2 NC_000001.10 splign 1 11873,12227;12612,12721;13220,14409 +NR_024540.1 NC_000001.10 splign -1 29320,29370;24737,24891;18267,18366;17914,18061;17605,17742;17232,17368;16857,17055;16606,16765;15795,15947;14969,15038;14361,14829 +NR_106918.1 NC_000001.10 splign -1 17368,17436 +NR_036051.1 NC_000001.10 splign 1 30365,30503 +NR_026818.1 NC_000001.10 splign -1 35720,36081;35276,35481;34610,35174 +NM_001005484.2 NC_000001.10 splign 1 65418,65433;65519,65573;69036,71585 +NR_039983.2 NC_000001.10 splign -1 140074,140566;139789,139847;134772,139696 +NR_028322.1 NC_000001.10 splign 1 323891,324060;324287,324345;324438,328581 +NM_001005221.2 NC_000001.10 splign 1 367658,368597 +NR_125957.1 NC_000001.10 splign -1 564298,564389;563340,563603;562759,563203 +NR_162149.1 NC_000001.10 splign -1 567994,568065 diff --git a/tests/data/genomic_100.gff.gz b/tests/data/genomic_100.gff.gz new file mode 100644 index 0000000..813ad8f Binary files /dev/null and b/tests/data/genomic_100.gff.gz differ diff --git a/tests/data/rna.NM_001396027.gbff b/tests/data/rna.NM_001396027.gbff new file mode 100644 index 0000000..da202f2 --- /dev/null +++ b/tests/data/rna.NM_001396027.gbff @@ -0,0 +1,67 @@ +LOCUS NM_001396027 696 bp mRNA linear PRI 16-APR-2022 +DEFINITION Homo sapiens family with sequence similarity 246 member C + (gene/pseudogene) (FAM246C), mRNA. +ACCESSION NM_001396027 +VERSION NM_001396027.1 +KEYWORDS RefSeq; RefSeq Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +COMMENT VALIDATED REFSEQ: This record has undergone validation or + preliminary review. The reference sequence was derived from + CP068256.2. + + Transcript Variant: This transcript contains two, common, single + nucleotide deletions relative to the GRCh38 reference assembly. + This transcript encodes a protein of 231 aa. 
+ + ##RefSeq-Attributes-START## + polymorphic pseudogene :: based on alignments, homology + RefSeq Select criteria :: based on single protein-coding transcript + ##RefSeq-Attributes-END## +PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP + 1-696 CP068256.2 19404891-19405586 +FEATURES Location/Qualifiers + source 1..696 + /organism="Homo sapiens" + /mol_type="mRNA" + /db_xref="taxon:9606" + /chromosome="22" + /map="22q11.21" + gene 1..696 + /gene="FAM246C" + /note="family with sequence similarity 246 member C + (gene/pseudogene)" + /db_xref="GeneID:117134596" + /db_xref="HGNC:HGNC:54842" + CDS 1..696 + /gene="FAM246C" + /codon_start=1 + /product="protein FAM246C" + /protein_id="NP_001382956.1" + /db_xref="GeneID:117134596" + /db_xref="HGNC:HGNC:54842" + /translation="MAESGRPWAQARSAYRASEVLRRGTGRRRDPGPQSNGPGQEDAR + APGRMARLRGQLRAEAASRSEVPRLLKLVERAGAGAAGAGERTGAHSRGSVCSVCGEP + RGGATYPAGVLEVSERRLQEGLAAVREELGAGIEALRAELRAELDALRALLPPPPSPP + ARREPRAVPRAAPRGPTLPRTLGTVSALVAASRPADDAPDGPAECGAHRAPARKNHKK + MPVPPGAPQGGGD" + exon 1..696 + /gene="FAM246C" + /inference="alignment:Splign:2.1.0" +ORIGIN + 1 atggcggagt ccggccgccc gtgggcccag gcgcgtagtg cgtacagagc cagcgaggtg + 61 ctgcggcgcg gcacgggccg ccggcgggat ccggggccgc aatccaatgg gccgggccag + 121 gaagacgccc gagccccggg ccggatggct cgcctgcgcg gccagctccg ggccgaagcg + 181 gcttcgcggt ccgaggtgcc gcggctgctg aagctggtgg agcgtgcggg ggccggggcg + 241 gcgggcgcgg gcgagaggac cggcgcgcac agccgcggct ccgtgtgctc ggtatgcggg + 301 gagccccgcg gcggggccac ctacccggcg ggggtcctgg aggtgagcga gcggcggctg + 361 caggagggcc tggcggcagt gcgcgaggag ctgggcgccg ggattgaggc gctgcgcgcg + 421 gagcttcgag cggagctgga tgccctgcgc gcgctgctgc cgccgccgcc gtccccgcct + 481 gcccgccgcg agccccgcgc cgtcccccgc gccgcgcccc gcggcccgac cctgccgcgg + 541 acgctcggca ccgtgagcgc cctggtcgcc gcctccaggc ccgcagacga cgccccggac + 601 ggcccagcag aatgcggagc gcaccgagcc ccggccagga agaaccacaa gaagatgcca + 661 gtgccgcctg gggccccgca aggtggcggg gactga +// diff --git a/tests/data/rna.NM_001996.gbff b/tests/data/rna.NM_001996.gbff new file mode 100644 index 0000000..79ff577 --- /dev/null +++ b/tests/data/rna.NM_001996.gbff @@ -0,0 +1,393 @@ +LOCUS NM_001996 2251 bp mRNA linear PRI 18-APR-2022 +DEFINITION Homo sapiens fibulin 1 (FBLN1), transcript variant C, mRNA. +ACCESSION NM_001996 +VERSION NM_001996.4 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +REFERENCE 1 (bases 1 to 2251) + AUTHORS Yang F, Shi J, Zhang L, Wang H and Li Y. + TITLE Role of fibulin-1 gene promoter methylation in the carcinogenesis + and development of tongue squamous cell carcinoma + JOURNAL Oral Surg Oral Med Oral Pathol Oral Radiol 133 (4), 432-440 (2022) + PUBMED 35153187 + REMARK GeneRIF: Role of fibulin-1 gene promoter methylation in the + carcinogenesis and development of tongue squamous cell carcinoma. +REFERENCE 2 (bases 1 to 2251) + AUTHORS Wan Y, Song Y, Chen J, Kong J, Gu C, Huang J and Zuo L. + TITLE Upregulated Fibulin-1 Increased Endometrial Stromal Cell Viability + and Migration by Repressing EFEMP1-Dependent Ferroptosis in + Endometriosis + JOURNAL Biomed Res Int 2022, 4809415 (2022) + PUBMED 35127942 + REMARK GeneRIF: Upregulated Fibulin-1 Increased Endometrial Stromal Cell + Viability and Migration by Repressing EFEMP1-Dependent Ferroptosis + in Endometriosis. 
+ Publication Status: Online-Only +REFERENCE 3 (bases 1 to 2251) + AUTHORS Xu G, Geng X, Yang F and Zhang H. + TITLE FBLN1 promotes chondrocyte proliferation by increasing + phosphorylation of Smad2 + JOURNAL J Orthop Sci 27 (1), 242-248 (2022) + PUBMED 33610427 + REMARK GeneRIF: FBLN1 promotes chondrocyte proliferation by increasing + phosphorylation of Smad2. +REFERENCE 4 (bases 1 to 2251) + AUTHORS Ustunyurt E, Dundar B, Simsek D and Temur M. + TITLE Act of fibulin-1 in preeclamptic patients: can it be a predictive + marker? + JOURNAL J Matern Fetal Neonatal Med 34 (22), 3775-3781 (2021) + PUBMED 34238097 + REMARK GeneRIF: Act of fibulin-1 in preeclamptic patients: can it be a + predictive marker? +REFERENCE 5 (bases 1 to 2251) + AUTHORS Liu XT, Liu TT, Wu MY, Chen QX, Zhuang JX and Wang Q. + TITLE Identifying FBLN1 (Gene ID: 2192) as a Potential Melanoma Biomarker + for Melanoma based on an Analysis of microRNA Expression Profiles + in the GEO and TCGA Databases + JOURNAL Genet Test Mol Biomarkers 25 (1), 68-78 (2021) + PUBMED 33470885 + REMARK GeneRIF: Identifying FBLN1 (Gene ID: 2192) as a Potential Melanoma + Biomarker for Melanoma based on an Analysis of microRNA Expression + Profiles in the GEO and TCGA Databases. +REFERENCE 6 (bases 1 to 2251) + AUTHORS Sasaki T, Gohring W, Pan TC, Chu ML and Timpl R. + TITLE Binding of mouse and human fibulin-2 to extracellular matrix + ligands + JOURNAL J Mol Biol 254 (5), 892-899 (1995) + PUBMED 7500359 +REFERENCE 7 (bases 1 to 2251) + AUTHORS Roark EF, Keene DR, Haudenschild CC, Godyna S, Little CD and + Argraves WS. + TITLE The association of human fibulin-1 with elastic fibers: an + immunohistological, ultrastructural, and RNA study + JOURNAL J Histochem Cytochem 43 (4), 401-411 (1995) + PUBMED 7534784 +REFERENCE 8 (bases 1 to 2251) + AUTHORS Balbona K, Tran H, Godyna S, Ingham KC, Strickland DK and Argraves + WS. + TITLE Fibulin binds to itself and to the carboxyl-terminal + heparin-binding region of fibronectin + JOURNAL J Biol Chem 267 (28), 20120-20125 (1992) + PUBMED 1400330 +REFERENCE 9 (bases 1 to 2251) + AUTHORS Argraves WS, Tran H, Burgess WH and Dickerson K. + TITLE Fibulin is an extracellular matrix and plasma glycoprotein with + repeated domain structure + JOURNAL J Cell Biol 111 (6 Pt 2), 3155-3164 (1990) + PUBMED 2269669 +REFERENCE 10 (bases 1 to 2251) + AUTHORS Argraves WS, Dickerson K, Burgess WH and Ruoslahti E. + TITLE Fibulin, a novel protein that interacts with the fibronectin + receptor beta subunit cytoplasmic domain + JOURNAL Cell 58 (4), 623-629 (1989) + PUBMED 2527614 +COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The + reference sequence was derived from AL021391.3, Z98047.1 and + Z95331.2. + + On May 31, 2019 this sequence version replaced NM_001996.3. + + Summary: Fibulin 1 is a secreted glycoprotein that becomes + incorporated into a fibrillar extracellular matrix. Calcium-binding + is apparently required to mediate its binding to laminin and + nidogen. It mediates platelet adhesion via binding fibrinogen. Four + splice variants which differ in the 3' end have been identified. + Each variant encodes a different isoform, but no functional + distinctions have been identified among the four variants. + [provided by RefSeq, Jul 2008]. + + Transcript Variant: This variant (C) has an alternate 3' sequence, + as compared to variant D. The encoded isoform C is shorter and has + a distinct C-terminus, as compared to isoform D. 
+ + Publication Note: This RefSeq record includes a subset of the + publications that are available for this gene. Please see the Gene + record to access additional publications. + + ##Evidence-Data-START## + Transcript exon combination :: SRR7346977.956579.1, + SRR7346977.1110405.1 [ECO:0000332] + RNAseq introns :: single sample supports all introns + SAMEA1965299, SAMEA1966682 + [ECO:0000348] + ##Evidence-Data-END## + COMPLETENESS: complete on the 3' end. +PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP + 1-182 AL021391.3 135806-135987 + 183-288 Z98047.1 13577-13682 + 289-424 Z98047.1 20438-20573 + 425-587 Z98047.1 22742-22904 + 588-647 Z98047.1 26160-26219 + 648-749 Z98047.1 27958-28059 + 750-887 Z98047.1 28656-28793 + 888-1025 Z98047.1 30095-30232 + 1026-1169 Z98047.1 36124-36267 + 1170-1298 Z98047.1 37050-37178 + 1299-1424 Z98047.1 38296-38421 + 1425-1544 Z98047.1 41980-42099 + 1545-1676 Z98047.1 43508-43639 + 1677-1800 Z98047.1 45387-45510 + 1801-2251 Z95331.2 10369-10819 +FEATURES Location/Qualifiers + source 1..2251 + /organism="Homo sapiens" + /mol_type="mRNA" + /db_xref="taxon:9606" + /chromosome="22" + /map="22q13.31" + gene 1..2251 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="fibulin 1" + /db_xref="GeneID:2192" + /db_xref="HGNC:HGNC:3600" + /db_xref="MIM:135820" + exon 1..182 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + CDS 104..2155 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="isoform C precursor is encoded by transcript + variant C" + /codon_start=1 + /product="fibulin-1 isoform C precursor" + /protein_id="NP_001987.3" + /db_xref="CCDS:CCDS14069.1" + /db_xref="GeneID:2192" + /db_xref="HGNC:HGNC:3600" + /db_xref="MIM:135820" + /translation="MERAAPSRRVPLPLLLLGGLALLAAGVDADVLLEACCADGHRMA + THQKDCSLPYATESKECRMVQEQCCHSQLEELHCATGISLANEQDRCATPHGDNASLE + ATFVKRCCHCCLLGRAAQAQGQSCEYSLMVGYQCGQVFQACCVKSQETGDLDVGGLQE + TDKIIEVEEEQEDPYLNDRCRGGGPCKQQCRDTGDEVVCSCFVGYQLLSDGVSCEDVN + ECITGSHSCRLGESCINTVGSFRCQRDSSCGTGYELTEDNSCKDIDECESGIHNCLPD + FICQNTLGSFRCRPKLQCKSGFIQDALGNCIDINECLSISAPCPIGHTCINTEGSYTC + QKNVPNCGRGYHLNEEGTRCVDVDECAPPAEPCGKGHRCVNSPGSFRCECKTGYYFDG + ISRMCVDVNECQRYPGRLCGHKCENTLGSYLCSCSVGFRLSVDGRSCEDINECSSSPC + SQECANVYGSYQCYCRRGYQLSDVDGVTCEDIDECALPTGGHICSYRCINIPGSFQCS + CPSSGYRLAPNGRNCQDIDECVTGIHNCSINETCFNIQGGFRCLAFECPENYRRSAAT + RCERLPCHENRECSKLPLRITYYHLSFPTNIQAPAVVFRMGPSSAVPGDSMQLAITGG + NEEGFFTTRKVSPHSGVVALTKPVPEPRDLLLTVKMDLSRHGTVSSFVAKLFIFVSAE + L" + sig_peptide 104..190 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="COORDINATES: ab initio prediction:SignalP:4.0" + mat_peptide 191..2152 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /product="fibulin-1 isoform C" + misc_feature 191..340 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Anaphylatoxin homologous domain; C3a, C4a and C5a + anaphylatoxins are protein fragments generated + enzymatically in serum during activation of complement + molecules C3, C4, and C5. They induce smooth muscle + contraction. These fragments are homologous to...; Region: + ANATO; cd00017" + /db_xref="CDD:237984" + misc_feature 392..559 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Anaphylatoxin homologous domain; C3a, C4a and C5a + anaphylatoxins are protein fragments generated + enzymatically in serum during activation of complement + molecules C3, C4, and C5. They induce smooth muscle + contraction. 
These fragments are homologous to...; Region: + ANATO; cd00017" + /db_xref="CDD:237984" + misc_feature 698..760 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Complement Clr-like EGF-like; Region: cEGF; + pfam12662" + /db_xref="CDD:432704" + misc_feature 749..832 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF domain; Region: EGF_CA; + pfam07645" + /db_xref="CDD:429571" + misc_feature 887..994 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF domain; Region: EGF_CA; + pfam07645" + /db_xref="CDD:429571" + misc_feature order(887..889,896..898,944..946) + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Ca2+ binding site [ion binding]; other site" + /db_xref="CDD:238011" + misc_feature 1025..1168 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF-like domain; Region: EGF_CA; + smart00179" + /db_xref="CDD:214542" + misc_feature order(1025..1027,1034..1036,1082..1084) + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Ca2+ binding site [ion binding]; other site" + /db_xref="CDD:238011" + misc_feature 1169..1270 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF-like domain; Region: EGF_CA; + smart00179" + /db_xref="CDD:214542" + misc_feature order(1169..1171,1178..1180,1226..1228) + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Ca2+ binding site [ion binding]; other site" + /db_xref="CDD:238011" + misc_feature order(1298..1300,1307..1309,1352..1354) + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Ca2+ binding site [ion binding]; other site" + /db_xref="CDD:238011" + misc_feature 1364..1435 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Complement Clr-like EGF-like; Region: cEGF; + pfam12662" + /db_xref="CDD:432704" + misc_feature 1481..1555 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Complement Clr-like EGF-like; Region: cEGF; + pfam12662" + /db_xref="CDD:432704" + misc_feature 1544..1636 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF-like domain, present in a large + number of membrane-bound and extracellular (mostly animal) + proteins. 
Many of these proteins require calcium for their + biological function and calcium-binding sites have been + found to be located at the...; Region: EGF_CA; cl21504" + /db_xref="CDD:451279" + misc_feature 1613..1687 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Complement Clr-like EGF-like; Region: cEGF; + pfam12662" + /db_xref="CDD:432704" + misc_feature 1676..1780 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF domain; Region: EGF_CA; + pfam07645" + /db_xref="CDD:429571" + exon 183..288 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 289..424 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 425..587 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 588..647 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 648..749 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 750..887 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 888..1025 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1026..1169 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1170..1298 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1299..1424 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1425..1544 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1545..1676 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1677..1800 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1801..2251 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + regulatory 2220..2225 + /regulatory_class="polyA_signal_sequence" + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="hexamer: AATAAA" + polyA_site 2251 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="major polyA site" +ORIGIN + 1 gttggctgcc gaggctcggc cggagcgtgg agcccgcgcc gctgccccag gaccgcgccc + 61 gcgcctttgt ccgccgccgc ccaccgcccg tcgcccgccg cccatggagc gcgccgcgcc + 121 gtcgcgccgg gtcccgcttc cgctgctgct gctcggcggc cttgcgctgc tggcggccgg + 181 agtggacgcg gatgtcctcc tggaggcctg ctgtgcggac ggacaccgga tggccactca + 241 tcagaaggac tgctcgctgc catatgctac ggaatccaaa gaatgcagga tggtgcagga + 301 gcagtgctgc cacagccagc tggaggagct gcactgtgcc acgggcatca gcctggccaa + 361 cgagcaggac cgctgtgcca cgccccacgg tgacaacgcc agcctggagg ccacatttgt + 421 gaagaggtgc tgccattgct gtctgctggg gagggcggcc caggcccagg gccagagctg + 481 cgagtacagc ctcatggttg gctaccagtg tggacaggtc ttccaggcat gctgtgtcaa + 541 gagccaggag accggagatt tggatgtcgg gggcctccaa gaaacggata agatcattga + 601 ggttgaggag gaacaagagg acccatatct gaatgaccgc tgccgaggag gcgggccctg + 661 caagcagcag tgccgagaca cgggtgacga ggtggtctgc tcctgcttcg tgggctacca + 721 gctgctgtct gatggtgtct cctgtgaaga tgtcaatgaa tgcatcacgg gcagccacag + 781 ctgccggctt ggagaatcct gcatcaacac agtgggctct ttccgctgcc agcgggacag + 841 cagctgcggg actggctatg agctcacaga ggacaatagc tgcaaagata ttgacgagtg + 901 tgagagtggt attcataact gcctccccga ttttatctgt cagaatactc tgggatcctt + 961 ccgctgccga cccaagctac agtgcaagag tggctttata caagatgctc taggcaactg + 1021 tattgatatc aatgagtgtt tgagtatcag tgccccgtgc cctatcgggc atacatgcat + 1081 caacacagag ggctcctaca cgtgccagaa gaacgtgccc aactgtggcc gtggctacca + 1141 tctcaacgag 
gagggaacgc gctgtgttga tgtggacgag tgcgcgccac ctgctgagcc + 1201 ctgtgggaag ggacatcgct gcgtgaactc tcccggcagt ttccgctgcg aatgcaagac + 1261 gggttactat tttgacggca tcagcaggat gtgtgtcgat gtcaacgagt gccagcgcta + 1321 ccccgggcgc ctgtgtggcc acaagtgcga gaacacgctg ggctcctacc tctgcagctg + 1381 ttccgtgggc ttccggctct ctgtggatgg caggtcatgt gaagacatca atgagtgcag + 1441 cagcagcccc tgtagccagg agtgtgccaa cgtctacggc tcctaccagt gttactgccg + 1501 gcgaggctac cagctcagcg atgtggatgg agtcacctgt gaagacatcg acgagtgcgc + 1561 cctgcccacc gggggccaca tctgctccta ccgctgcatc aacatccctg gaagcttcca + 1621 gtgcagctgc ccctcgtctg gctacaggct ggcccccaat ggccgcaact gccaagacat + 1681 tgatgagtgt gtgactggca tccacaactg ctccatcaac gagacctgct tcaacatcca + 1741 gggcggcttc cgctgcctgg ccttcgagtg ccctgagaac taccgccgct ccgcagccac + 1801 ccgctgtgag cgcttgcctt gccatgagaa tcgggagtgc tccaagctgc ctctgagaat + 1861 aacctactac cacctctctt tccccaccaa catccaagcg cccgcggtgg ttttccgcat + 1921 gggcccctcc agtgctgtcc ccggggacag catgcagctg gccatcaccg gcggcaatga + 1981 ggagggcttt ttcaccaccc ggaaggtgag cccccacagt ggggtggtgg ccctcaccaa + 2041 gcctgtcccc gagcccaggg acttgctcct gaccgtcaag atggatctct ctcgccacgg + 2101 caccgtcagc tcctttgtgg ccaagctttt catctttgtg tctgcagagc tctgagcact + 2161 cgcttcgcgt cgcggggtct ccctcctgtt gctttcctaa ccctgccctc cggggcgtta + 2221 ataaagtctt agcaagcgtc ccacacagtg a +// diff --git a/tests/data/rna.NR_173080.gbff b/tests/data/rna.NR_173080.gbff new file mode 100644 index 0000000..f6b2486 --- /dev/null +++ b/tests/data/rna.NR_173080.gbff @@ -0,0 +1,61 @@ +LOCUS NR_173080 1073 bp RNA linear PRI 20-JUL-2023 +DEFINITION Homo sapiens uncharacterized LOC122455341 (LOC122455341), + transcript variant 1, long non-coding RNA. +ACCESSION NR_173080 +VERSION NR_173080.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +COMMENT VALIDATED REFSEQ: This record has undergone validation or + preliminary review. The reference sequence was derived from + AC007326.28. + + ##Evidence-Data-START## + RNAseq introns :: single sample supports all introns SAMEA2159764 + [ECO:0000348] + ##Evidence-Data-END## + COMPLETENESS: full length. 
+PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP + 1-93 AC007326.28 35576-35668 + 94-170 AC007326.28 37874-37950 + 171-1073 AC007326.28 44992-45894 +FEATURES Location/Qualifiers + source 1..1073 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="22" + /map="22q11.21" + gene 1..1073 + /gene="LOC122455341" + /note="uncharacterized LOC122455341" + /db_xref="GeneID:122455341" + ncRNA 1..1073 + /ncRNA_class="lncRNA" + /gene="LOC122455341" + /product="uncharacterized LOC122455341, transcript variant + 1" + /db_xref="GeneID:122455341" +ORIGIN + 1 acaagttgta tggttcgttt tatatgaaga gttcagaata gacaaatcga tagagacaga + 61 agtcacacga agacaagccc atagtggagc ctgggtgaag gtacgctcga gcgtggtcat + 121 tgaggacaag tcgacgagag atcccgagta cgtctacagt cagccttacg tctgcaggtg + 181 tacccaacag ctccgaagag acagcgacca tcgagaacgg gccatgatga cgatggcggt + 241 tttgtcgaaa agaaaagggg gaaatgtggg gaaaagcaag agagatcaga ttgttactgt + 301 gtctgtgtag aaagaagtag acataggaga ctccattttg ttatgtgcta agaaaaattc + 361 ttctgccttg agattctgtt aatctataac cttaccccca accccgtgct ctctgaaacg + 421 tgtgctgtgt caactcagag ttaaatggat taagggcggt gcaggatgtg ctttgttaaa + 481 cagatgcttg aaggcagcat gctccttaag agtcatcacc actccctaat ctcaagtacc + 541 cagggacaca aaaactgcgg aaggccgcag ggacctctgc ctaggaaagc caggtattgt + 601 ccaaggtttc tccccatgtg atagtctgaa atatggcctc gtgggaaggg aaagacctga + 661 ccgtccccca gcccgacacc aagggtctgt gctgaggagg attagtaaaa gaggaaggaa + 721 tgcctcttgc agttgagaca agaggaaggc atctgtctcc tgcctgtccc tgggcaatgg + 781 aatgtctcgg tataaaaccc gattgtatgc tccatctact gagataggga aaaaccgcct + 841 tagggctgga ggtgggacct gcgggcagca atactgcttt gtaaagcatt gagatgttta + 901 tgtgtatgca tatctaaaag cacagcactt aatcctttac attgtctatg atgcaaagac + 961 ctttgttcac gtgtttgtct gctgaccctc tccccacaat tgtcttgtga ccctgacaca + 1021 tccccctctt cgagaaacac ccacaagtga tgaataaata ctaagggaac tca +// diff --git a/tests/data/rna.NR_173148.gbff b/tests/data/rna.NR_173148.gbff new file mode 100644 index 0000000..91312ca --- /dev/null +++ b/tests/data/rna.NR_173148.gbff @@ -0,0 +1,57 @@ +LOCUS NR_173148 698 bp RNA linear PRI 17-SEP-2021 +DEFINITION Homo sapiens family with sequence similarity 246 member C + (gene/pseudogene) (FAM246C), non-coding RNA. +ACCESSION NR_173148 +VERSION NR_173148.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +COMMENT VALIDATED REFSEQ: This record has undergone validation or + preliminary review. The reference sequence was derived from + AC000095.5. + + Transcript Variant: This transcript matches the GRCh38 reference + assembly. Compared to other members of the FAM246 gene family, it + has a premature stop codon which makes it a non-coding transcript. 
+ + ##RefSeq-Attributes-START## + polymorphic pseudogene :: based on alignments, homology + ##RefSeq-Attributes-END## +PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP + 1-698 AC000095.5 28241-28938 +FEATURES Location/Qualifiers + source 1..698 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="22" + /map="22q11.21" + gene 1..698 + /gene="FAM246C" + /note="family with sequence similarity 246 member C + (gene/pseudogene)" + /db_xref="GeneID:117134596" + /db_xref="HGNC:HGNC:54842" + misc_RNA 1..698 + /gene="FAM246C" + /product="family with sequence similarity 246 member C + (gene/pseudogene)" + /db_xref="GeneID:117134596" + /db_xref="HGNC:HGNC:54842" +ORIGIN + 1 atggcggagc ccggccgccc gtgggcccag gcgcgtagtg cgtacagagc cagcgaggtg + 61 ctgcggcgcg gcacgggccg ccggcgggat ccggggccgc aatccaatgg gccgggccag + 121 gaagacgccc gagccccggg ccggatggct cgcctgcgcg gccagctccg ggccgaagcg + 181 gcttcgcggt ccgaggtgcc gcggctgctg aagctggtgg agcgtgcggg ggccggggcg + 241 gccgggcgcg ggcgagagga ccggcgcgca cagccgcggg ctccgtgtgc tcggtatgcg + 301 gggagccccg cggcggggcc acctacccgg cgggggtcct ggaggtgagc gagcggcggc + 361 tgcaggaggg cctggcggca gtgcgcgagg agctgggcgc cgggattgag gcgctgcgcg + 421 cggagcttcg agcggagctg gatgccctgc gcgcgctgct gccgccgccg ccgtccccgc + 481 ctgcccgccg cgagccccgc gccgtccccc gcgccgcgcc ccgcggcccg accctgccgc + 541 ggacgctcgg caccgtgagc gccctggtcg ccgcctccag gcccgcagac gacgccccgg + 601 acggcccagc agaatgcgga gcgcaccgag ccccggccag gaagaaccac aagaagatgc + 661 cagtgccgcc tggggccccg caaggtggcg gggactga +// diff --git a/tests/data/txinfo.gz b/tests/data/txinfo.gz new file mode 100644 index 0000000..8b33153 Binary files /dev/null and b/tests/data/txinfo.gz differ diff --git a/tests/test_coalesce_exonsets.py b/tests/test_coalesce_exonsets.py new file mode 100644 index 0000000..757ec0b --- /dev/null +++ b/tests/test_coalesce_exonsets.py @@ -0,0 +1,48 @@ +import contextlib +import io +import sys +import unittest +from tempfile import NamedTemporaryFile +from unittest.mock import patch + +from sbin.coalesce_exonsets import coalesce_exonsets +from uta.formats.exonset import ExonSetWriter + + +class TestCoalesceExonsets(unittest.TestCase): + + def _create_temporary_file(self, lines): + with NamedTemporaryFile(delete=False) as temp_exonsets: + with open(temp_exonsets.name, "wt") as f: + for line in lines: + f.write(line) + temp_exonsets.seek(0) + return temp_exonsets.name + + @patch('sbin.coalesce_exonsets.logger') + def test_coalesce_exonsets(self, mock_logger): + lines_1 = [ + "tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n", + "NM_145660.2\tNC_000022.10\tsplign\t-1\t36600673,36600879;36598038,36598101;36595375,36595422;36591356,36591483;36585175,36587958\n", + "NM_000348.4\tNC_000002.11\tsplign\t-1\t31805689,31806007;31758672,31758836;31756440,31756542;31754376,31754527;31747549,31751332\n" + ] + lines_2 = [ + "tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n", + "NM_145660.2\tNC_000022.10\tsplign\t-1\t36600673,36600879;36598038,36598101;36595375,36595422;36591356,36591483;36587846,36587958;36585175,36587845\n", + "NM_145660.2\tNC_000022.11\tsplign\t-1\t36204627,36204833;36201992,36202055;36199329,36199376;36195310,36195437;36189127,36191912\n", + "NM_001005484.2\tNC_000001.10\tsplign\t1\t65418,65433;65519,65573;69036,71585\n" + ] + temp_exonsets_1_fn = self._create_temporary_file(lines_1) + temp_exonsets_2_fn = self._create_temporary_file(lines_2) + + # the first record in lines_2 (NM_145660.2, NC_000022.10) will be skipped, as it is already passed to the output + 
expected_output = lines_1 + lines_2[2:] + stdout = io.StringIO() + + with contextlib.redirect_stdout(stdout): + coalesce_exonsets([temp_exonsets_1_fn, temp_exonsets_2_fn]) + + output = stdout.getvalue() + self.assertEqual(output, ''.join(expected_output)) + + mock_logger.warning.assert_called_with(f" - exon set for transcript NM_145660.2/NC_000022.10 already seen in {temp_exonsets_1_fn}. Skipping.") diff --git a/tests/test_filter_exonset_transcripts.py b/tests/test_filter_exonset_transcripts.py new file mode 100644 index 0000000..8bc7995 --- /dev/null +++ b/tests/test_filter_exonset_transcripts.py @@ -0,0 +1,46 @@ +import contextlib +import io +import unittest +from tempfile import NamedTemporaryFile +from unittest.mock import patch + +from sbin.filter_exonset_transcripts import filter_exonset + + +class TestFilterExonsetTranscripts(unittest.TestCase): + + @patch('sbin.filter_exonset_transcripts.logger') + def test_filter_exonset(self, mock_logger): + # Test NR_046571.1 is filtered out + lines = [ + "tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n", + "NR_122113.1\tNC_000022.10\tsplign\t-1\t16192905,16193009;16190680,16190791;16189263,16189378;16189031,16189143;16187164,16187302;16186810,16186953;16162396,16162487;16150528,16151821\n", + "NR_133911.1\tNC_000022.10\tsplign\t1\t16157078,16157342;16164481,16164569;16171951,16172265\n", + "NR_046571.1\tNC_000022.10\tsplign\t1\t16274608,16275003;16276480,16277577\n" + ] + with NamedTemporaryFile(delete=False) as temp_exonsets: + with open(temp_exonsets.name, "wt") as f: + for line in lines: + f.write(line) + temp_exonsets.seek(0) + missing_ids_file = NamedTemporaryFile() + + transcript_ids = {"NR_122113.1", "NR_133911.1"} + stdout = io.StringIO() + with contextlib.redirect_stdout(stdout): + filter_exonset(temp_exonsets.name, transcript_ids, missing_ids_file.name) + + # Assert the record for NR_046571.1 is filtered out + self.assertEqual(stdout.getvalue(), ''.join(lines[0:3])) + + # Confirm filtered transcript is present in missing_ids_file + with open(missing_ids_file.name, 'r') as f: + contents = f.read() + self.assertEqual(contents, 'NR_046571.1\n') + + mock_logger.debug.assert_called_with('Exon set transcript NR_046571.1 not found in txinfo file. 
Filtering out.') + mock_logger.info.assert_called_with('Filtered out exon sets for 1 transcript(s)') + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_ncbi_parse_genomic_gff.py b/tests/test_ncbi_parse_genomic_gff.py new file mode 100644 index 0000000..4a0b638 --- /dev/null +++ b/tests/test_ncbi_parse_genomic_gff.py @@ -0,0 +1,247 @@ +import gzip +import os +import subprocess +import unittest +from tempfile import NamedTemporaryFile + +from sbin.ncbi_parse_genomic_gff import ( + get_zero_based_exon_ranges, + GFFRecord, + parse_gff_files, + parse_gff_record, +) + +CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) +BASE_DIR = os.path.dirname(CURRENT_DIR) + + +def sample_line(**params): + defaults = { + "seqid": "NC_000001.10", + "source": "BestRefSeq", + "feature": "exon", + "start": 11874, + "stop": 12227, + "score": ".", + "strand": "1", + "phase": ".", + "attributes_str": "ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=NR_046018.2", + } + defaults.update(params) + return "\t".join(map(str, defaults.values())) + "\n" + + +class TestGFFParsing(unittest.TestCase): + def setUp(self): + with NamedTemporaryFile(delete=False) as temp_gff: + with open(temp_gff.name, "wt") as f: + f.write( + "NC_000001.10\tBestRefSeq\texon\t11874\t12227\t.\t+\t.\tID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.10\tBestRefSeq\texon\t12613\t12721\t.\t+\t.\tID=exon-NR_046018.2-2;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.10\tBestRefSeq\texon\t13221\t14409\t.\t+\t.\tID=exon-NR_046018.2-3;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.11\tBestRefSeq\texon\t15874\t16227\t.\t+\t.\tID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.11\tBestRefSeq\texon\t16613\t16721\t.\t+\t.\tID=exon-NR_046018.2-2;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.11\tBestRefSeq\texon\t17221\t18409\t.\t+\t.\tID=exon-NR_046018.2-3;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + temp_gff.seek(0) + self.temp_gff = temp_gff + self.gff_records = [ + GFFRecord( + seqid="NC_000001.10", + start=11874, + end=12227, + strand="+", + exon_number=1, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.10", + start=12613, + end=12721, + strand="+", + exon_number=2, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.10", + start=13221, + end=14409, + strand="+", + exon_number=3, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.11", + start=15874, + end=16227, + strand="+", + exon_number=1, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.11", + start=16613, + end=16721, + strand="+", + exon_number=2, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.11", + start=17221, + end=18409, + strand="+", + exon_number=3, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + ] + + def tearDown(self): + os.remove(self.temp_gff.name) + + def test_parse_gff_record(self): + # Test parsing a single GFF record + line = sample_line() + expected_record = GFFRecord( + seqid="NC_000001.10", + start=11874, + end=12227, + strand="1", + exon_number=1, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ) + parsed_record = parse_gff_record(line) + 
self.assertEqual(parsed_record, expected_record) + self.assertEqual(parsed_record.key, f"{expected_record.transcript_id}:{expected_record.seqid}") + + def test_parse_gff_record_skips_non_exon_records(self): + # We exclude non-exon records + line = sample_line(feature="pseudogene") + expected_record = None + parsed_record = parse_gff_record(line) + self.assertEqual(parsed_record, expected_record) + + def test_parse_gff_record_skips_missing_parent_field(self): + # We exclude alignments missing a parent field + line = sample_line( + attributes_str="ID=exon-NR_046018.2-1;transcript_id=NR_046018.2" + ) # Parent missing from attributes + expected_record = None + parsed_record = parse_gff_record(line) + self.assertEqual(parsed_record, expected_record) + + def test_parse_gff_record_skips_missing_transcript_id(self): + # We exclude alignments missing transcript_id + line = sample_line( + attributes_str="ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2" + ) # transcript_id missing from attributes + expected_record = None + parsed_record = parse_gff_record(line) + self.assertEqual(parsed_record, expected_record) + + def test_parse_gff_record_skips_non_NM_NR_transcripts(self): + # We only care about transcripts that start with NM_ or NR_ + line = sample_line( + attributes_str="ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=somethingelse" + ) # transcript_id is not an NM_/NR_ accession + expected_record = None + parsed_record = parse_gff_record(line) + self.assertEqual(parsed_record, expected_record) + + def test_parse_gff_record_unexpected_number_of_fields(self): + # Raise an exception if there are not exactly 9 fields in a non-comment line + line = "NC_000001.10\tID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=NR_046018\n" # only 2 fields + with self.assertRaises(ValueError) as context: + parse_gff_record(line) + + self.assertEqual( + str(context.exception), "Expected 9 tab-separated fields, got 2" + ) + + def test_parse_gff_record_raises_non_int_start_stop(self): + # Raise an exception if either start or stop is not an integer + lines = [sample_line(start="a string"), sample_line(stop="another string")] + for line in lines: + with self.assertRaises(ValueError): + parse_gff_record(line) + + def test_parse_gff_record_raises_unparseable_id(self): + # Raise an exception if we cannot parse the exon number from the ID + line = sample_line( + attributes_str="ID=unexpected_id;Parent=rna-NR_046018.2;transcript_id=NR_046018" + ) + with self.assertRaises(ValueError) as context: + parse_gff_record(line) + + self.assertEqual( + str(context.exception), "Failed to parse exon number from unexpected_id" + ) + + def test_parse_gff_file(self): + # Test parsing the entire uncompressed GFF file + expected_result = { + "NR_046018.2:NC_000001.10": self.gff_records[:3], + "NR_046018.2:NC_000001.11": self.gff_records[3:], + } + parsed_result = parse_gff_files([self.temp_gff.name]) + self.assertEqual(parsed_result, expected_result) + + def test_parse_gff_file_accepts_gzipped_files(self): + # Create a gzipped version of the temp_gff file + with gzip.open(self.temp_gff.name + ".gz", "wb") as f_out: + with open(self.temp_gff.name, "rb") as f_in: + f_out.write(f_in.read()) + + # Test parsing the gzipped GFF file + expected_result = { + "NR_046018.2:NC_000001.10": self.gff_records[:3], + "NR_046018.2:NC_000001.11": self.gff_records[3:], + } + parsed_result = parse_gff_files([self.temp_gff.name + ".gz"]) + self.assertEqual(parsed_result, expected_result) + + def test_get_zero_based_exon_ranges(self): + # Test
converting exon ranges to 0-based half-open format yields expected values + exon_ranges = get_zero_based_exon_ranges(self.gff_records[:3]) + assert exon_ranges == "11873,12227;12612,12721;13220,14409" + + def test_script_output(self): + # Run the script from the command line + input_gff_file = os.path.join(CURRENT_DIR, "data", f"genomic_100.gff.gz") + script_path = os.path.join(BASE_DIR, "sbin", "ncbi_parse_genomic_gff.py") + + command = ["python", script_path, input_gff_file] + completed_process = subprocess.run( + command, check=True, capture_output=True, text=True + ) + stdout_content = completed_process.stdout + expected_file_path = os.path.join( + CURRENT_DIR, "data", "expected_genomic_100.exonset" + ) + with open(expected_file_path, "r") as expected_file: + expected_content = expected_file.read() + + assert ( + stdout_content == expected_content + ), "Output content doesn't match expected." + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_ncbi_process_mito.py b/tests/test_ncbi_process_mito.py new file mode 100644 index 0000000..6e8ef27 --- /dev/null +++ b/tests/test_ncbi_process_mito.py @@ -0,0 +1,279 @@ +import os +import unittest +from unittest.mock import MagicMock, patch + +from Bio.SeqRecord import SeqRecord + +from sbin.ncbi_process_mito import ( + download_mito_files, + get_mito_genes, + parse_db_xrefs, + parse_nomenclature_value, +) + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) + + +class TestNcbiProcessMito(unittest.TestCase): + def verify_mito_gene_attributes(self, mito_gene, expected_values): + for k, v in expected_values.items(): + try: + self.assertEqual(getattr(mito_gene, k), v) + except AssertionError: + print( + f"Test failure on mito gene {mito_gene.gene_symbol} ({mito_gene.gene_id}) " + f'attribute "{k}" with value "{v}" not equal to "{getattr(mito_gene, k)}"' + ) + raise + + @patch("sbin.ncbi_process_mito.download_from_eutils") + def test_download_mito_files(self, mock_download): + output_dir = "test_dir" + accession = "test_accession" + result = download_mito_files(output_dir, accession) + self.assertEqual( + result, + { + "gbff": "test_dir/test_accession.gbff", + "fna": "test_dir/test_accession.fna", + }, + ) + mock_download.assert_any_call(accession, "gb", f"{output_dir}/{accession}.gbff") + mock_download.assert_any_call( + accession, "fasta", f"{output_dir}/{accession}.fna" + ) + + def test_db_xrefs(self): + gb_feature_mock = MagicMock(spec=SeqRecord) + gb_feature_mock.qualifiers = { + "db_xref": ["GeneID:4558", "HGNC:HGNC:7481", "MIM:590070"] + } + + result = parse_db_xrefs(gb_feature_mock) + self.assertEqual( + result, {"GeneID": "4558", "HGNC": "HGNC:7481", "MIM": "590070"} + ) + + def test_db_xrefs_empty(self): + gb_feature_mock = MagicMock(spec=SeqRecord) + gb_feature_mock.qualifiers = {} + + result = parse_db_xrefs(gb_feature_mock) + self.assertEqual(result, {}) + + def test_parse_nomenclature_value(self): + gb_feature_mock = MagicMock(spec=SeqRecord) + gb_feature_mock.qualifiers = { + "nomenclature": [ + "Official Symbol: MT-TF | Name: mitochondrially encoded tRNA phenylalanine | Provided by: HGNC:HGNC:7481" + ] + } + + result = parse_nomenclature_value(gb_feature_mock) + self.assertEqual( + result, + { + "Official Symbol": "MT-TF", + "Name": "mitochondrially encoded tRNA phenylalanine", + "Provided by": "HGNC:HGNC:7481", + }, + ) + + def test_parse_nomenclature_value_empty(self): + gb_feature_mock = MagicMock(spec=SeqRecord) + gb_feature_mock.qualifiers = {} + + result = parse_nomenclature_value(gb_feature_mock) + 
self.assertEqual(result, {}) + + def test_get_mito_genes(self): + mito_genbank_filepath = f"{BASE_DIR}/data/NC_012920.1.gbff" + results = [_ for _ in get_mito_genes(mito_genbank_filepath)] + expected_gene_ids = [ + 4508, + 4509, + 4511, + 4512, + 4513, + 4514, + 4519, + 4535, + 4536, + 4537, + 4538, + 4539, + 4540, + 4541, + 4549, + 4550, + 4553, + 4555, + 4556, + 4558, + 4563, + 4564, + 4565, + 4566, + 4567, + 4568, + 4569, + 4570, + 4571, + 4572, + 4573, + 4574, + 4575, + 4576, + 4577, + 4578, + 4579, + ] + expected_gene_symbols = [ + "MT-ATP6", + "MT-ATP8", + "MT-CO1", + "MT-CO2", + "MT-CO3", + "MT-CYB", + "MT-ND1", + "MT-ND2", + "MT-ND3", + "MT-ND4", + "MT-ND4L", + "MT-ND5", + "MT-ND6", + "MT-RNR1", + "MT-RNR2", + "MT-TA", + "MT-TC", + "MT-TD", + "MT-TE", + "MT-TF", + "MT-TG", + "MT-TH", + "MT-TI", + "MT-TK", + "MT-TL1", + "MT-TL2", + "MT-TM", + "MT-TN", + "MT-TP", + "MT-TQ", + "MT-TR", + "MT-TS1", + "MT-TS2", + "MT-TT", + "MT-TV", + "MT-TW", + "MT-TY", + ] + expected_origin = "NCBI" + expected_aln_method = "splign" + + self.assertEqual(len(results), 37) + self.assertEqual(sorted([r.gene_id for r in results]), expected_gene_ids) + self.assertEqual( + sorted([r.gene_symbol for r in results]), expected_gene_symbols + ) + self.assertEqual([r.origin for r in results], [expected_origin] * 37) + self.assertEqual( + [r.alignment_method for r in results], [expected_aln_method] * 37 + ) + + results_by_gene = {mg.gene_id: mg for mg in results} + + # Expected results for "MT-TV" non-coding tRNA gene on the plus strand + expected_mg4577_values = { + "gene_symbol": "MT-TV", + "name": "mitochondrially encoded tRNA valine", + "tx_ac": "NC_012920.1_01601_01670", + "tx_seq": "CAGAGTGTAGCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCAACTTAACTTGACCGCTCTGA", + "tx_start": 0, + "tx_end": 69, + "alt_ac": "NC_012920.1", + "alt_start": 1601, + "alt_end": 1670, + "strand": 1, + "transl_table": None, + "transl_except": None, + "pro_ac": None, + "pro_seq": None, + } + self.verify_mito_gene_attributes(results_by_gene[4577], expected_mg4577_values) + + # Expected results for "MT-TQ" tRNA gene on the minus strand + expected_mg4572_values = { + "gene_symbol": "MT-TQ", + "name": "mitochondrially encoded tRNA glutamine", + "tx_ac": "NC_012920.1_04328_04400", + "tx_seq": "TAGGATGGGGTGTGATAGGTGGCACGGAGAATTTTGGATTCTCAGGGATGGGTTCGATTCTCATAGTCCTAG", + "tx_start": 0, + "tx_end": 72, + "alt_ac": "NC_012920.1", + "alt_start": 4328, + "alt_end": 4400, + "strand": -1, + "transl_table": None, + "transl_except": None, + "pro_ac": None, + "pro_seq": None, + } + self.verify_mito_gene_attributes(results_by_gene[4572], expected_mg4572_values) + + # Expected results for "MT-CO2" coding gene on the plus strand + expected_mg4513_values = { + "gene_symbol": "MT-CO2", + "name": "mitochondrially encoded cytochrome c oxidase II", + "tx_ac": "NC_012920.1_07585_08269", + "tx_seq": "ATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTT" + "TCCTTATCTGCTTCCTAGTCCTGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAAATAGAAACCGTCTGAACT" + "ATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGG" + "CCACCAATGGTACTGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCC" + "TTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTA" + "AAAACAGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAG" + 
"TTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAG", + "tx_start": 0, + "tx_end": 684, + "alt_ac": "NC_012920.1", + "alt_start": 7585, + "alt_end": 8269, + "strand": 1, + "transl_table": "2", + "transl_except": None, + "pro_ac": "YP_003024029.1", + "pro_seq": "MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFLTLTTKLTNTNISDAQEMETVWTILPAIILVLIALPSLRILYMTDEVNDP" + "SLTIKSIGHQWYWTYEYTDYGGLIFNSYMLPPLFLEPGDLRLLDVDNRVVLPIEAPIRMMITSQDVLHSWAVPTLGLKTDAIPGRLNQTTFTATRPGVYYGQCS" + "EICGANHSFMPIVLELIPLKIFEMGPVFTL", + } + self.verify_mito_gene_attributes(results_by_gene[4513], expected_mg4513_values) + + # Expected results for "MT-ND1" coding gene on the minus strand with a transl_except + expected_mg4535_values = { + "gene_symbol": "MT-ND1", + "name": "mitochondrially encoded NADH dehydrogenase 1", + "tx_ac": "NC_012920.1_03306_04262", + "tx_seq": "ATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAATCGCAATGGCATTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATAC" + "AACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGGCTACTACAACCCTTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCACATCT" + "ACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCGCTCTTCTACTATGAACCCCCCTCCCCATACCCAACCCCCTGGTCAACCTCAACCTAGG" + "CCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACTCAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCGCACTGCGAGCAG" + "TAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATCATTCTACTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCACCCTTATCACAACACAAGAA" + "CACCTCTGATTACTCCTGCCATCATGACCCTTGGCCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTCGACCTTGCCGAAGGGGAGTC" + "CGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATTATTATAATAAACACCCTCACCACTA" + "CAATCTTCCTAGGAACAACATATGACGCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCCTACTTCTAACCTCCCTGTTCTTATGAATTCGA" + "ACAGCATACCCCCGATTCCGCTACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTAGCATTACTTATATGATATGTCTCCATACCCAT" + "TACAATCTCCAGCATTCCCCCTCAAACCTA", + "tx_start": 0, + "tx_end": 956, + "alt_ac": "NC_012920.1", + "alt_start": 3306, + "alt_end": 4262, + "strand": 1, + "transl_table": "2", + "transl_except": ["(pos:4261..4262,aa:TERM)"], + "pro_ac": "YP_003024026.1", + "pro_seq": "MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPN" + "PLVNLNLGLLFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYEVTLAIILLSTLLMSGSFNLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTP" + "FDLAEGESELVSGFNIEYAAGPFALFFMAEYTNIIMMNTLTTTIFLGTTYDALSPELYTTYFVTKTLLLTSLFLWIRTAYPRFRYDQLMHLLWKNFLPLTLALL" + "MWYVSMPITISSIPPQT", + } + self.verify_mito_gene_attributes(results_by_gene[4535], expected_mg4535_values) diff --git a/tests/test_uta_formats_txinfo.py b/tests/test_uta_formats_txinfo.py new file mode 100644 index 0000000..b2e3e09 --- /dev/null +++ b/tests/test_uta_formats_txinfo.py @@ -0,0 +1,10 @@ +import unittest +from uta.formats.txinfo import TxInfo + + +class TestUtaFormats(unittest.TestCase): + + def test_txinfo_serialize_transl_except(self): + self.assertIsNone(TxInfo.serialize_transl_except(None)) + self.assertEqual(TxInfo.serialize_transl_except([]), '') + self.assertEqual(TxInfo.serialize_transl_except(['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)']), '(pos:333..335,aa:Sec);(pos:1017,aa:TERM)') diff --git a/tests/test_uta_loading.py b/tests/test_uta_loading.py new file mode 100644 index 0000000..210d2ab --- /dev/null +++ b/tests/test_uta_loading.py @@ -0,0 +1,445 @@ +import configparser +import signal +import unittest +from unittest.mock import Mock, patch + +import sqlalchemy as sa +import testing.postgresql + +import uta +import uta.loading as ul +import uta.models as usam + + +class 
TestUtaLoading(unittest.TestCase): + + def setUp(self): + # setup test database + self.db = testing.postgresql.Postgresql() + self.session = uta.connect(self.db.url()) + + admin_role = 'uta_admin' + self.session.execute(sa.text(f'create user {admin_role}')) + self.session.execute(sa.text(f'grant all privileges on database test to {admin_role}')) + + self.cf = configparser.ConfigParser() + self.cf.add_section('uta') + self.cf.set('uta', 'admin_role', 'uta_admin') + + ul.create_schema(self.session, {}, self.cf) + ul.grant_permissions(self.session, {}, self.cf) + + def tearDown(self): + self.session.close() + self.db.stop(_signal=signal.SIGKILL) + self.db.cleanup() + + def test_meta_data(self): + """ + Metadata should exist, then be updated when update_meta_data is called. + """ + # the schema_version should match existing values in UTA models + expected_schema_version = usam.schema_version + md_schema_version = self.session.query(usam.Meta).filter(usam.Meta.key == 'schema_version').one() + self.assertEqual(md_schema_version.value, expected_schema_version) + + new_schema_version = '9.9' + with patch('uta.models.schema_version', new_schema_version): + ul.update_meta_data(self.session, {}, self.cf) + + md_schema_version = self.session.query(usam.Meta).filter(usam.Meta.key == 'schema_version').one() + self.assertEqual(md_schema_version.value, new_schema_version) + + md_updated_at = self.session.query(usam.Meta).filter(usam.Meta.key == 'updated on').one_or_none() + self.assertIsNotNone(md_updated_at) + + def test_load_assoc_ac(self): + """ + Loading file tests/data/assocacs.gz should create associated_accessions records in the database. + A row will be created in associated_accessions even when the transcript or origin does not exist in the database. + This is only the case until tx_ac and origin are converted to foreign keys.
+ """ + + # insert origins referenced in data file + o1 = usam.Origin( + name='NCBI', + url='http://bogus.com/ncbi', + url_ac_fmt='http://bogus.com/ncbi/{ac}', + ) + self.session.add(o1) + + # insert genes required for transcripts + g1 = usam.Gene( + gene_id='49', + hgnc='ACR', + symbol='ACR', + maploc='22q13.33', + descr='acrosin', + summary='acrosin', + aliases='SPGF87', + type='protein-coding', + xrefs='MIM:102480,HGNC:HGNC:126,Ensembl:ENSG00000100312,AllianceGenome:HGNC:126', + ) + g2 = usam.Gene( + gene_id=50, + hgnc='ACO2', + symbol='ACO2', + maploc='22q13.2', + descr='aconitase 2', + summary='aconitase 2', + aliases='ACONM,HEL-S-284,ICRD,OCA8,OPA9', + type='protein-coding', + xrefs='MIM:100850,HGNC:HGNC:118,Ensembl:ENSG00000100412,AllianceGenome:HGNC:118', + ) + self.session.add(g1) + self.session.add(g2) + + # insert transcripts referenced in data file + t1 = usam.Transcript( + ac='NM_001097.3', + origin=o1, + gene_id=g1.gene_id, + cds_start_i=0, + cds_end_i=1, + cds_md5='a', + ) + t2 = usam.Transcript( + ac='NM_001098.3', + origin=o1, + gene_id=g2.gene_id, + cds_start_i=2, + cds_end_i=3, + cds_md5='b', + ) + self.session.add(t1) + self.session.add(t2) + + # pre-add one of the associated_acessions from the test data file + # to demonstrate get-or-insert behavior + p = usam.AssociatedAccessions( + tx_ac='NM_001097.3', + pro_ac='NP_001088.2', + origin='NCBI', + ) + self.session.add(p) + + self.session.commit() + + cf = configparser.ConfigParser() + cf.add_section('uta') + cf.set('uta', 'admin_role', 'uta_admin') + + ul.load_assoc_ac(self.session, {'FILE': 'tests/data/assocacs.gz'}, cf) + + # associated_accessions table should contain one record per line in file + aa = self.session.query(usam.AssociatedAccessions).order_by(usam.AssociatedAccessions.tx_ac).all() + aa_list = [{'tx_ac': aa.tx_ac, 'pro_ac': aa.pro_ac, 'origin_name': aa.origin} for aa in aa] + expected_aa_list = [ + { + 'tx_ac': 'DummyTx', + 'pro_ac': 'DummyProtein', + 'origin_name': 'DummyOrigin', + }, + { + 'tx_ac': 'NM_001097.3', + 'pro_ac': 'NP_001088.2', + 'origin_name': 'NCBI', + }, + { + 'tx_ac': 'NM_001098.3', + 'pro_ac': 'NP_001089.1', + 'origin_name': 'NCBI', + }, + ] + self.assertEqual(aa_list, expected_aa_list) + + def test_load_txinfo(self): + """ + Loading file tests/data/txinfo.gz should create transcript, exon_set, exon, and translation_exception records in the database. 
+ """ + + # insert origins referenced in data file + o1 = usam.Origin( + name='NCBI', + url='http://bogus.com/ncbi', + url_ac_fmt='http://bogus.com/ncbi/{ac}', + ) + self.session.add(o1) + + # insert genes required for transcripts + g1 = usam.Gene( + gene_id='140606', + hgnc='SELENOM', + symbol='SELENOM', + maploc='22q12.2', + descr='selenoprotein M', + summary='selenoprotein M', + aliases='SELM,SEPM', + type='protein-coding', + xrefs='MIM:610918,HGNC:HGNC:30397,Ensembl:ENSG00000198832,AllianceGenome:HGNC:30397', + ) + g2 = usam.Gene( + gene_id='4514', + hgnc='MT-CO3', + symbol='MT-CO3', + maploc=None, + descr='mitochondrially encoded cytochrome c oxidase III', + summary='mitochondrially encoded cytochrome c oxidase III', + aliases='COIII,MTCO3', + type='protein-coding', + xrefs='GeneID:4514,HGNC:HGNC:7422,MIM:516050', + ) + self.session.add(g1) + self.session.add(g2) + self.session.commit() + + cf = configparser.ConfigParser() + cf.add_section('uta') + cf.set('uta', 'admin_role', 'uta_admin') + + with patch('uta.loading._get_seqfetcher', return_value=Mock(fetch=Mock(return_value='FAKESEQUENCE'))): + ul.load_txinfo(self.session, {'FILE': 'tests/data/txinfo.gz'}, cf) + + transcript = self.session.query(usam.Transcript).filter(usam.Transcript.ac == 'NM_080430.4').one() + self.assertEqual( + { + 'ac': transcript.ac, + 'gene_id': transcript.gene_id, + 'cds_start_i': transcript.cds_start_i, + 'cds_end_i': transcript.cds_end_i, + 'codon_table': transcript.codon_table, + }, + { + 'ac': 'NM_080430.4', + 'gene_id': '140606', + 'cds_start_i': 63, + 'cds_end_i': 501, + 'codon_table': '1', + }, + ) + + transcript = self.session.query(usam.Transcript).filter(usam.Transcript.ac == 'NC_012920.1_09206_09990').one() + self.assertEqual( + { + 'ac': transcript.ac, + 'gene_id': transcript.gene_id, + 'cds_start_i': transcript.cds_start_i, + 'cds_end_i': transcript.cds_end_i, + 'codon_table': transcript.codon_table, + }, + { + 'ac': 'NC_012920.1_09206_09990', + 'gene_id': '4514', + 'cds_start_i': 0, + 'cds_end_i': 784, + 'codon_table': '2', + }, + ) + + exon_set = self.session.query(usam.ExonSet).filter(usam.ExonSet.tx_ac == 'NM_080430.4').one() + exons = self.session.query(usam.Exon).filter(usam.Exon.exon_set_id == exon_set.exon_set_id).all() + self.assertEqual(len(exons), 5) + + translation_exception = self.session.query(usam.TranslationException).filter(usam.TranslationException.tx_ac == 'NM_080430.4').one() + self.assertEqual( + { + 'tx_ac': translation_exception.tx_ac, + 'start_position': translation_exception.start_position, + 'end_position': translation_exception.end_position, + 'amino_acid': translation_exception.amino_acid, + }, + { + 'tx_ac': 'NM_080430.4', + 'start_position': 204, + 'end_position': 207, + 'amino_acid': 'Sec', + }, + ) + + def test_load_exonset_with_exon_structure_mismatch(self): + """ + Loading the test file tests/data/exonsets-mm-exons.gz should not raise an exception, exon alignments without + a mismatch should load, those with a mismatch should be skipped and logged as such. The input file has + alignments for 4 transcripts against NC_000001.11, but only 2 of them have the correct number of exons. + We only expect the alignmets for NM_000911.4 and NM_001005277.1 to be loaded. 
+ """ + # setup + # insert origins referenced in data file + o1 = usam.Origin( + name="NCBI", + url="http://bogus.com/ncbi", + url_ac_fmt="http://bogus.com/ncbi/{ac}", + ) + self.session.add(o1) + self.session.flush() + + for gene_data in [ + { + "gene_id": "3352", + "hgnc": "HTR1D", + "symbol": "HTR1D", + "type": "protein-coding", + }, + { + "gene_id": "4985", + "hgnc": "OPRD1", + "symbol": "OPRD1", + "type": "protein-coding", + }, + { + "gene_id": "81399", + "hgnc": "OR4F16", + "symbol": "OR4F16", + "type": "protein-coding", + }, + { + "gene_id": "79501", + "hgnc": "OR4F5", + "symbol": "OR4F5", + "type": "protein-coding", + }, + ]: + gene = usam.Gene(**gene_data) + self.session.add(gene) + + for tx_data in [ + { + "ac": "NM_000864.5", + "origin_id": o1.origin_id, + "gene_id": "3352", + "cds_start_i": 994, + "cds_end_i": 2128, + "cds_md5": "a", + }, + { + "ac": "NM_000911.4", + "origin_id": o1.origin_id, + "gene_id": "4985", + "cds_start_i": 214, + "cds_end_i": 1333, + "cds_md5": "b", + }, + { + "ac": "NM_001005277.1", + "origin_id": o1.origin_id, + "gene_id": "81399", + "cds_start_i": 0, + "cds_end_i": 939, + "cds_md5": "c", + }, + { + "ac": "NM_001005484.2", + "origin_id": o1.origin_id, + "gene_id": "79501", + "cds_start_i": 60, + "cds_end_i": 1041, + "cds_md5": "d", + }, + ]: + tx = usam.Transcript(**tx_data) + self.session.add(tx) + es = usam.ExonSet( + tx_ac=tx.ac, + alt_ac=tx.ac, + alt_strand=1, + alt_aln_method="transcript", + ) + self.session.add(es) + self.session.flush() + + for exon_data in [ + ("NM_000864.5", 1, 0, 3319), # exons for NM_000864.5 are 0,212;212,3319 + ("NM_000911.4", 1, 0, 441), + ("NM_000911.4", 2, 441, 791), + ("NM_000911.4", 3, 791, 9317), + ("NM_001005277.1", 1, 0, 939), + ("NM_001005484.2", 1, 0, 15), + ("NM_001005484.2", 2, 15, 69), + ( + "NM_001005484.2", + 3, + 69, + 1041, + ), # exons for NM_001005484.2 are 0,15;15,69;69,2618 + ("NM_001005484.2", 4, 1041, 2618), + ]: + es = ( + self.session.query(usam.ExonSet) + .filter( + usam.ExonSet.tx_ac == exon_data[0], usam.ExonSet.alt_ac == exon_data[0] + ) + .one() + ) + exon = usam.Exon( + exon_set_id=es.exon_set_id, + start_i=exon_data[2], + end_i=exon_data[3], + ord=exon_data[1], + ) + self.session.add(exon) + self.session.commit() + + cf = configparser.ConfigParser() + cf.add_section("uta") + cf.set("uta", "admin_role", "uta_admin") + + # load data from test exonsets file. 
+ with patch( + "uta.loading._get_seqfetcher", + return_value=Mock(fetch=Mock(return_value="FAKESEQUENCE")), + ), patch("uta.loading.logger") as mock_logger: + ul.load_exonset(self.session, {"FILE": "tests/data/exonsets.mm-exons.gz"}, cf) + + mock_logger.warning.assert_any_call( + "Exon structure mismatch: 4 exons in transcript NM_001005484.2; 3 in alignment NC_000001.11" + ) + mock_logger.warning.assert_any_call( + "Exon structure mismatch: 1 exons in transcript NM_000864.5; 2 in alignment NC_000001.11" + ) + + # check that the exons for NM_000864.5 and NM_001005484.2 were not loaded, + # and NM_000911.4 and NM_001005277.1 were loaded + for tx_ac, expected_exon_count in [("NM_000911.4", 3), ("NM_001005277.1", 1)]: + exon_set = ( + self.session.query(usam.ExonSet) + .filter( + usam.ExonSet.tx_ac == tx_ac, + usam.ExonSet.alt_ac == "NC_000001.11", + usam.ExonSet.alt_aln_method == "splign", + ) + .one() + ) + exons = ( + self.session.query(usam.Exon) + .filter(usam.Exon.exon_set_id == exon_set.exon_set_id) + .all() + ) + self.assertEqual(len(exons), expected_exon_count) + + for tx_ac in ["NM_000864.5", "NM_001005484.2"]: + with self.assertRaises(sa.orm.exc.NoResultFound): + self.session.query(usam.ExonSet).filter( + usam.ExonSet.tx_ac == tx_ac, + usam.ExonSet.alt_ac == "NC_000001.11", + usam.ExonSet.alt_aln_method == "splign", + ).one() + + +class TestUtaLoadingFunctions(unittest.TestCase): + def test__create_translation_exceptions(self): + transl_except_list = ['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)'] + translation_exceptions = ul._create_translation_exceptions(transcript='dummy_tx', transl_except_list=transl_except_list) + self.assertEqual(translation_exceptions, [ + { + 'tx_ac': 'dummy_tx', + 'start_position': 332, + 'end_position': 335, + 'amino_acid': 'Sec', + }, + { + 'tx_ac': 'dummy_tx', + 'start_position': 1016, + 'end_position': 1017, + 'amino_acid': 'TERM', + }, + ]) diff --git a/tests/test_uta_models.py b/tests/test_uta_models.py index 86eafea..d619dfc 100644 --- a/tests/test_uta_models.py +++ b/tests/test_uta_models.py @@ -1,7 +1,8 @@ -import os +import datetime import unittest import sqlalchemy +from sqlalchemy import text import testing.postgresql import uta @@ -16,6 +17,11 @@ 'g_strand': -1, 'g_starts_i': [26721603, 26627221], 'g_ends_i': [26722922, 26628183], 'g_cds_start_i': 26627665, 'g_cds_end_i': 26722486, + 'pro_ac': 'NP_000671.2', + 'translation_exceptions': [ + {'start_position': 333, 'end_position': 335, 'amino_acid': 'Sec'}, + {'start_position': 589, 'end_position': 589, 'amino_acid': 'TERM'}, + ], }, 'NM_033302.2': { 'seq':
'gaattccgaatcatgtgcagaatgctgaatcttcccccagccaggacgaataagacagcgcggaaaagcagattctcgtaattctggaattgcatgttgcaaggagtctcctggatcttcgcacccagcttcgggtagggagggagtccgggtcccgggctaggccagcccggcaggtggagagggtccccggcagccccgcgcgcccctggccatgtctttaatgccctgccccttcatgtggccttctgagggttcccagggctggccagggttgtttcccacccgcgcgcgcgctctcacccccagccaaacccacctggcagggctccctccagccgagaccttttgattcccggctcccgcgctcccgcctccgcgccagcccgggaggtggccctggacagccggacctcgcccggccccggctgggaccatggtgtttctctcgggaaatgcttccgacagctccaactgcacccaaccgccggcaccggtgaacatttccaaggccattctgctcggggtgatcttggggggcctcattcttttcggggtgctgggtaacatcctagtgatcctctccgtagcctgtcaccgacacctgcactcagtcacgcactactacatcgtcaacctggcggtggccgacctcctgctcacctccacggtgctgcccttctccgccatcttcgaggtcctaggctactgggccttcggcagggtcttctgcaacatctgggcggcagtggatgtgctgtgctgcaccgcgtccatcatgggcctctgcatcatctccatcgaccgctacatcggcgtgagctacccgctgcgctacccaaccatcgtcacccagaggaggggtctcatggctctgctctgcgtctgggcactctccctggtcatatccattggacccctgttcggctggaggcagccggcccccgaggacgagaccatctgccagatcaacgaggagccgggctacgtgctcttctcagcgctgggctccttctacctgcctctggccatcatcctggtcatgtactgccgcgtctacgtggtggccaagagggagagccggggcctcaagtctggcctcaagaccgacaagtcggactcggagcaagtgacgctccgcatccatcggaaaaacgccccggcaggaggcagcgggatggccagcgccaagaccaagacgcacttctcagtgaggctcctcaagttctcccgggagaagaaagcggccaaaacgctgggcatcgtggtcggctgcttcgtcctctgctggctgccttttttcttagtcatgcccattgggtctttcttccctgatttcaagccctctgaaacagtttttaaaatagtattttggctcggatatctaaacagctgcatcaaccccatcatatacccatgctccagccaagagttcaaaaaggcctttcagaatgtcttgagaatccagtgtctctgcagaaagcagtcttccaaacatgccctgggctacaccctgcacccgcccagccaggccgtggaagggcaacacaaggacatggtgcgcatccccgtgggatcaagagagaccttctacaggatctccaagacggatggcgtttgtgaatggaaatttttctcttccatgccccgtggatctgccaggattacagtgtccaaagaccaatcctcctgtaccacagcccggggacacacacccatgacatgaagccagcttcccgtccacgactgttgtccttactgcccaaggaaggggagcatgaaacccaccactggtcctgcgacccactgtctttggaatccaccccaggagcccaggagccttgcctgacacttggatttacttctttatcaagcatccatctgactaaggcacaaatccaacatgttactgttactgatacaggaaaaacagtaacttaaggaatgatcatgaatgcaaagggaaagaggaaaagagccttcagggacaaatagctcgattttttgtaaatcagtttcatacaacctccctcccccatttcattcttaaaagttaattgagaatcatcagccacgtgtagggtgtgag', @@ -24,6 +30,7 @@ 'g_strand': -1, 'g_starts_i': [26721603, 26627797, 26613912], 'g_ends_i': [26722922, 26628183, 26614296], 'g_cds_start_i': 26614275, 'g_cds_end_i': 26722486, + 'pro_ac': 'NP_150645.2', }, 'NM_033303.3': { 'seq': 
'gaattccgaatcatgtgcagaatgctgaatcttcccccagccaggacgaataagacagcgcggaaaagcagattctcgtaattctggaattgcatgttgcaaggagtctcctggatcttcgcacccagcttcgggtagggagggagtccgggtcccgggctaggccagcccggcaggtggagagggtccccggcagccccgcgcgcccctggccatgtctttaatgccctgccccttcatgtggccttctgagggttcccagggctggccagggttgtttcccacccgcgcgcgcgctctcacccccagccaaacccacctggcagggctccctccagccgagaccttttgattcccggctcccgcgctcccgcctccgcgccagcccgggaggtggccctggacagccggacctcgcccggccccggctgggaccatggtgtttctctcgggaaatgcttccgacagctccaactgcacccaaccgccggcaccggtgaacatttccaaggccattctgctcggggtgatcttggggggcctcattcttttcggggtgctgggtaacatcctagtgatcctctccgtagcctgtcaccgacacctgcactcagtcacgcactactacatcgtcaacctggcggtggccgacctcctgctcacctccacggtgctgcccttctccgccatcttcgaggtcctaggctactgggccttcggcagggtcttctgcaacatctgggcggcagtggatgtgctgtgctgcaccgcgtccatcatgggcctctgcatcatctccatcgaccgctacatcggcgtgagctacccgctgcgctacccaaccatcgtcacccagaggaggggtctcatggctctgctctgcgtctgggcactctccctggtcatatccattggacccctgttcggctggaggcagccggcccccgaggacgagaccatctgccagatcaacgaggagccgggctacgtgctcttctcagcgctgggctccttctacctgcctctggccatcatcctggtcatgtactgccgcgtctacgtggtggccaagagggagagccggggcctcaagtctggcctcaagaccgacaagtcggactcggagcaagtgacgctccgcatccatcggaaaaacgccccggcaggaggcagcgggatggccagcgccaagaccaagacgcacttctcagtgaggctcctcaagttctcccgggagaagaaagcggccaaaacgctgggcatcgtggtcggctgcttcgtcctctgctggctgccttttttcttagtcatgcccattgggtctttcttccctgatttcaagccctctgaaacagtttttaaaatagtattttggctcggatatctaaacagctgcatcaaccccatcatatacccatgctccagccaagagttcaaaaaggcctttcagaatgtcttgagaatccagtgtctctgcagaaagcagtcttccaaacatgccctgggctacaccctgcacccgcccagccaggccgtggaagggcaacacaaggacatggtgcgcatccccgtgggatcaagagagaccttctacaggatctccaagacggatggcgtttgtgaatggaaatttttctcttccatgccccgtggatctgccaggattacagtgtccaaagaccaatcctcctgtaccacagcccggacgaagtctcgctctgtcaccaggctggagtgcagtggcatgatcttggctcactgcaacctccgcctcccgggttcaagagattctcctgcctcagcctcccaagcagctgggactacagggatgtgccaccaggccgacgccaccaggcccagctaatttttgtatttttagtagagacggggtttcaccatgttggccaggatgatctcgatctcttgacctcatgatctgcctgcctcagcctcccaaagtgctgggattacaggcgtgagccaccgtgcccggcccaactattttttttttttatcttttttaacagtgcaatcctttctgtggatgaaatcttgctcagaagctcaatatgcaaaagaaagaaaaacagcagggctggacggatgttgggagtggggtaagaccccaaccactcagaaccacccccccaacacacacacacattctctccatggtgactggtgaggggcctctagagggtacatagtacaccatggagcacggtttaagcaccactggactacacattcttctgtggcagttatcttaccttcccatagacacccagcccatagccattggtt', @@ -32,6 +39,7 @@ 'g_strand': -1, 'g_starts_i': [26721603, 26627797, 26605666], 'g_ends_i': [26722922, 26628183, 26606265], 'g_cds_start_i': 26606106, 'g_cds_end_i': 26722486, + 'pro_ac': 'NP_150646.3', }, 'NM_033304.2': { 'seq': 
'gaattccgaatcatgtgcagaatgctgaatcttcccccagccaggacgaataagacagcgcggaaaagcagattctcgtaattctggaattgcatgttgcaaggagtctcctggatcttcgcacccagcttcgggtagggagggagtccgggtcccgggctaggccagcccggcaggtggagagggtccccggcagccccgcgcgcccctggccatgtctttaatgccctgccccttcatgtggccttctgagggttcccagggctggccagggttgtttcccacccgcgcgcgcgctctcacccccagccaaacccacctggcagggctccctccagccgagaccttttgattcccggctcccgcgctcccgcctccgcgccagcccgggaggtggccctggacagccggacctcgcccggccccggctgggaccatggtgtttctctcgggaaatgcttccgacagctccaactgcacccaaccgccggcaccggtgaacatttccaaggccattctgctcggggtgatcttggggggcctcattcttttcggggtgctgggtaacatcctagtgatcctctccgtagcctgtcaccgacacctgcactcagtcacgcactactacatcgtcaacctggcggtggccgacctcctgctcacctccacggtgctgcccttctccgccatcttcgaggtcctaggctactgggccttcggcagggtcttctgcaacatctgggcggcagtggatgtgctgtgctgcaccgcgtccatcatgggcctctgcatcatctccatcgaccgctacatcggcgtgagctacccgctgcgctacccaaccatcgtcacccagaggaggggtctcatggctctgctctgcgtctgggcactctccctggtcatatccattggacccctgttcggctggaggcagccggcccccgaggacgagaccatctgccagatcaacgaggagccgggctacgtgctcttctcagcgctgggctccttctacctgcctctggccatcatcctggtcatgtactgccgcgtctacgtggtggccaagagggagagccggggcctcaagtctggcctcaagaccgacaagtcggactcggagcaagtgacgctccgcatccatcggaaaaacgccccggcaggaggcagcgggatggccagcgccaagaccaagacgcacttctcagtgaggctcctcaagttctcccgggagaagaaagcggccaaaacgctgggcatcgtggtcggctgcttcgtcctctgctggctgccttttttcttagtcatgcccattgggtctttcttccctgatttcaagccctctgaaacagtttttaaaatagtattttggctcggatatctaaacagctgcatcaaccccatcatatacccatgctccagccaagagttcaaaaaggcctttcagaatgtcttgagaatccagtgtctctgcagaaagcagtcttccaaacatgccctgggctacaccctgcacccgcccagccaggccgtggaagggcaacacaaggacatggtgcgcatccccgtgggatcaagagagaccttctacaggatctccaagacggatggcgtttgtgaatggaaatttttctcttccatgccccgtggatctgccaggattacagtgtccaaagaccaatcctcctgtaccacagcccggaggggaatggattgtagatatttcaccaagaattgcagagagcatatcaagcatgtgaattttatgatgccaccgtggagaaagggttcagaatgctgatctccaggtagctggagacctaggcagtctgcaaatgaggagtcagctggaagctatggctatgtattatgtgacatcgcttgttcctaagtgaaaactggatatcccaaccttctggcccagtaggtttcatggttaagacctggtagtgagaacattttaggaactatttgcttgggcaggcaatttttcactct', @@ -40,19 +48,22 @@ 'g_strand': -1, 'g_starts_i': [26721603, 26627797, 26623370], 'g_ends_i': [26722922, 26628183, 26623666], 'g_cds_start_i': 26623567, 'g_cds_end_i': 26722486, + 'pro_ac': 'NP_150647.2', }, } -class Test_uta_models(unittest.TestCase): +class TestUtaModels(unittest.TestCase): @classmethod def setUpClass(cls): cls._postgresql = testing.postgresql.Postgresql() engine = sqlalchemy.create_engine(cls._postgresql.url()) - engine.execute('drop schema if exists {schema} cascade'.format(schema=usam.schema_name)) - engine.execute('create schema {schema}'.format(schema=usam.schema_name)) + with engine.connect() as connection: + connection.execute(text('drop schema if exists {schema} cascade'.format(schema=usam.schema_name))) + connection.execute(text('create schema {schema}'.format(schema=usam.schema_name))) + connection.commit() engine.dispose() cls.session = uta.connect(cls._postgresql.url()) @@ -62,14 +73,17 @@ def setUpClass(cls): # http://www.ncbi.nlm.nih.gov/nuccore/NM_033304.2 o = usam.Origin( - name='Testing (originally NCBI, via Eutils)', + name='NCBI', + descr='Testing (originally NCBI, via Eutils)', url='http://bogus.com/', url_ac_fmt='http://bogus.com/{ac}', ) cls.session.add(o) g = usam.Gene( + gene_id='148', hgnc='ADRA1A', + symbol='ADRA1A', maploc='8p21.2', descr='adrenoceptor alpha 1A', summary='''Alpha-1-adrenergic receptors (alpha-1-ARs) are @@ -113,13 +127,25 @@ def setUpClass(cls): t = usam.Transcript( ac=ac, origin=o, - hgnc=g.hgnc, + gene_id=g.gene_id, cds_start_i=tx_info['t_cds_start_i'], 
cds_end_i=tx_info['t_cds_end_i'], cds_md5='d41d8cd98f00b204e9800998ecf8427e', ) cls.session.add(t) + if 'translation_exceptions' in tx_info: + for te in tx_info['translation_exceptions']: + te = usam.TranslationException(tx_ac=ac, **te) + cls.session.add(te) + + p = usam.AssociatedAccessions( + tx_ac=ac, + pro_ac=tx_info['pro_ac'], + origin=o.name, + ) + cls.session.add(p) + # ExonSet and Exons on Transcript seq t_es = usam.ExonSet( tx_ac=ac, @@ -161,7 +187,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - # sqlalchemy is keeping connections open and I can't figure out where + cls.session.close() # kill the database (we started it) import signal cls._postgresql.stop(_signal=signal.SIGKILL) @@ -172,9 +198,9 @@ def test_origin(self): self.assertEqual(len(all_origins), 1) o = all_origins[0] - self.assertRegexpMatches(o.name, 'Testing') - self.assertEquals(o.url, 'http://bogus.com/') - self.assertEquals(o.url_ac_fmt, 'http://bogus.com/{ac}') + self.assertEqual(o.name, 'NCBI') + self.assertEqual(o.url, 'http://bogus.com/') + self.assertEqual(o.url_ac_fmt, 'http://bogus.com/{ac}') # NM_000680.2, NM_033302.2, NM_033303.3, NM_033304.2 self.assertEqual(len(o.transcripts), 4) @@ -201,23 +227,23 @@ def test_dnaseq(self): n = self.session.query(usam.SeqAnno).filter( usam.SeqAnno.ac == 'NC_000008.10').one() - self.assertEquals(n.ac, u'NC_000008.10') + self.assertEqual(n.ac, u'NC_000008.10') # self.assertTrue(len(n.exon_sets),2) - self.assertRegexpMatches(n.origin.name, '^Testing') - #self.assertEquals(len(n.transcripts), 0) + self.assertEqual(n.origin.name, 'NCBI') + #self.assertEqual(len(n.transcripts), 0) n = self.session.query(usam.SeqAnno).filter( usam.SeqAnno.ac == 'NM_000680.2').one() - self.assertEquals(n.ac, u'NM_000680.2') + self.assertEqual(n.ac, u'NM_000680.2') # self.assertTrue(len(n.exon_sets),1) - self.assertRegexpMatches(n.origin.name, '^Testing') + self.assertEqual(n.origin.name, 'NCBI') n = self.session.query(usam.Seq).join(usam.Seq.aliases).filter( usam.SeqAnno.ac == 'NM_000680.2').one() - self.assertEquals(len(n.seq), 2281) + self.assertEqual(len(n.seq), 2281) self.assertTrue(n.seq.startswith('gaattccgaa')) self.assertTrue(n.seq.endswith('gacatttatg')) - #self.assertEquals(len(n.transcripts), 1) + #self.assertEqual(len(n.transcripts), 1) def test_exon_set(self): all_exon_sets = self.session.query(usam.Seq).all() @@ -228,12 +254,12 @@ def test_exon_set(self): # http://www.ncbi.nlm.nih.gov/nuccore/NM_000680.2 ## es = [ es for es in exon_sets if es.is_primary ][0] - ## self.assertEquals( (es.cds_start_i,es.cds_end_i), (436, 1837) ) - ## self.assertEquals( len(es.exons), 2 ) - ## self.assertEquals( es.is_primary, True ) - ## self.assertEquals( es.ref_dnaseq.ac, 'NM_000680.2' ) - ## self.assertEquals( es.strand, 1 ) - ## self.assertEquals( es.transcript.ac, 'NM_000680.2' ) + ## self.assertEqual( (es.cds_start_i,es.cds_end_i), (436, 1837) ) + ## self.assertEqual( len(es.exons), 2 ) + ## self.assertEqual( es.is_primary, True ) + ## self.assertEqual( es.ref_dnaseq.ac, 'NM_000680.2' ) + ## self.assertEqual( es.strand, 1 ) + ## self.assertEqual( es.transcript.ac, 'NM_000680.2' ) # seq_gene.md.gz: # 9606 8 26627222 26627665 - NT_167187.1 14485368 14485811 - NM_000680.2 GeneID:148 UTR GRCh37.p10-Primary Assembly NM_000680.2 - @@ -241,12 +267,12 @@ def test_exon_set(self): # 9606 8 26721604 26722486 - NT_167187.1 14579750 14580632 - NP_000671.2 GeneID:148 CDS GRCh37.p10-Primary Assembly NM_000680.2 - # 9606 8 26722487 26722922 - NT_167187.1 14580633 14581068 - NM_000680.2 
GeneID:148 UTR GRCh37.p10-Primary Assembly NM_000680.2 - ## es = [ es for es in exon_sets if not es.is_primary ][0] - ## self.assertEquals( (es.cds_start_i,es.cds_end_i), (26627665, 26722486) ) - ## self.assertEquals( len(es.exons), 2 ) - ## self.assertEquals( es.is_primary, False ) - ## self.assertEquals( es.ref_dnaseq.ac, 'NC_000008.10' ) - ## self.assertEquals( es.strand, -1 ) - ## self.assertEquals( es.transcript.ac, 'NM_000680.2' ) + ## self.assertEqual( (es.cds_start_i,es.cds_end_i), (26627665, 26722486) ) + ## self.assertEqual( len(es.exons), 2 ) + ## self.assertEqual( es.is_primary, False ) + ## self.assertEqual( es.ref_dnaseq.ac, 'NC_000008.10' ) + ## self.assertEqual( es.strand, -1 ) + ## self.assertEqual( es.transcript.ac, 'NM_000680.2' ) def test_exon(self): t = self.session.query(usam.Transcript).filter( @@ -255,6 +281,63 @@ def test_exon(self): #self.assertEqual( (es.exons[0].start_i,es.exons[0].end_i) , (0,1319) ) #self.assertEqual( (es.exons[1].start_i,es.exons[1].end_i) , (1319,2281) ) + def test_associated_accessions(self): + all_aa = self.session.query(usam.AssociatedAccessions).all() + self.assertEqual(len(all_aa), 4) + # check values in one row: + aa = self.session.query(usam.AssociatedAccessions).filter_by(tx_ac='NM_000680.2').one() + self.assertIsInstance(aa.associated_accession_id, int) + self.assertIsInstance(aa.added, datetime.datetime) + self.assertEqual(aa.tx_ac, 'NM_000680.2') + self.assertEqual(aa.pro_ac, 'NP_000671.2') + self.assertEqual(aa.origin, 'NCBI') + + def test_associated_accessions_transcript_not_in_database(self): + """ + Should create row in associated_accessions even for transcripts not in database. + This is only the case until associated_accessions.tx_ac is converted to a transcript foreign key. + """ + p = usam.AssociatedAccessions( + tx_ac='dummy_transcript', + pro_ac='dummy_protein', + origin='dummy_origin', + ) + self.session.add(p) + self.session.commit() + aa = self.session.query(usam.AssociatedAccessions).filter_by(tx_ac='dummy_transcript').one() + self.assertEqual(aa.tx_ac, 'dummy_transcript') + self.assertEqual(aa.pro_ac, 'dummy_protein') + self.assertEqual(aa.origin, 'dummy_origin') + + def test_translation_exception(self): + """ + Should create rows in translation_exception table. + """ + translation_exceptions = self.session.query(usam.TranslationException).filter_by(tx_ac='NM_000680.2').all() + self.assertEqual(len(translation_exceptions), 2) + + def test_translation_exception_start_not_greater_than_end(self): + """ + Should not create row in translation_exception table if start is greater than end. 
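+ The commit is expected to fail with an IntegrityError, after which the session is rolled back.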
+ """ + te = usam.TranslationException( + tx_ac='NM_033302.2', + start_position=100, + end_position=99, + amino_acid='dummy_aa', + ) + self.session.add(te) + + with self.assertRaises(sqlalchemy.exc.IntegrityError): + self.session.commit() + + # allow session to be used after failure + self.session.rollback() + + # translation exception should not exist because transaction failed + translation_exceptions = self.session.query(usam.TranslationException).filter_by(tx_ac='NM_033302.2').all() + self.assertEqual(translation_exceptions, []) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_uta_parsers_seqrecord.py b/tests/test_uta_parsers_seqrecord.py new file mode 100644 index 0000000..469e1c5 --- /dev/null +++ b/tests/test_uta_parsers_seqrecord.py @@ -0,0 +1,202 @@ +import os +import unittest +from unittest.mock import Mock, PropertyMock, patch + +from Bio import SeqIO +from parameterized import param, parameterized + +from uta.parsers.seqrecord import SeqRecordFacade, SeqRecordFeatureError + + +class TestSeqRecordFacade(unittest.TestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), 'data') + + @parameterized.expand([ + param( + 'NM_001396027 - single exon feature', + file_name='rna.NM_001396027.gbff', + expected_id='NM_001396027.1', + expected_gene_symbol='FAM246C', + expected_gene_synonyms=[], + expected_gene_type="protein-coding", + expected_gene_id='117134596', + expected_db_xrefs=["GeneID:117134596", "HGNC:HGNC:54842"], + expected_cds_se_i=(0, 696), + expected_cds_product="protein FAM246C", + expected_cds_protein_id="NP_001382956.1", + expected_cds_translation="MAESGRPWAQARSAYRASEVLRRGTGRRRDPGPQSNGPGQEDARAPGRMARLRGQLRAEAASRSEVPRLLKLVERAGAG" \ + "AAGAGERTGAHSRGSVCSVCGEPRGGATYPAGVLEVSERRLQEGLAAVREELGAGIEALRAELRAELDALRALLPPPPSPPARREPRAVPRAAPRGPTLP" \ + "RTLGTVSALVAASRPADDAPDGPAECGAHRAPARKNHKKMPVPPGAPQGGGD", + expected_codon_table="1", + expected_exons_se_i=[(0, 696)], + ), + param( + 'NM_001396027 - multiple exon features', + file_name='rna.NM_001996.gbff', + expected_id='NM_001996.4', + expected_gene_symbol='FBLN1', + expected_gene_synonyms=["FBLN","FIBL1"], + expected_gene_type="protein-coding", + expected_gene_id="2192", + expected_db_xrefs=["GeneID:2192", "HGNC:HGNC:3600", "MIM:135820"], + expected_cds_se_i=(103, 2155), + expected_cds_product="fibulin-1 isoform C precursor", + expected_cds_protein_id="NP_001987.3", + expected_cds_translation="MERAAPSRRVPLPLLLLGGLALLAAGVDADVLLEACCADGHRMATHQKDCSLPYATESKECRMVQEQCCHSQLEELHCA" \ + "TGISLANEQDRCATPHGDNASLEATFVKRCCHCCLLGRAAQAQGQSCEYSLMVGYQCGQVFQACCVKSQETGDLDVGGLQETDKIIEVEEEQEDPYLNDR" \ + "CRGGGPCKQQCRDTGDEVVCSCFVGYQLLSDGVSCEDVNECITGSHSCRLGESCINTVGSFRCQRDSSCGTGYELTEDNSCKDIDECESGIHNCLPDFIC" \ + "QNTLGSFRCRPKLQCKSGFIQDALGNCIDINECLSISAPCPIGHTCINTEGSYTCQKNVPNCGRGYHLNEEGTRCVDVDECAPPAEPCGKGHRCVNSPGS" \ + "FRCECKTGYYFDGISRMCVDVNECQRYPGRLCGHKCENTLGSYLCSCSVGFRLSVDGRSCEDINECSSSPCSQECANVYGSYQCYCRRGYQLSDVDGVTC" \ + "EDIDECALPTGGHICSYRCINIPGSFQCSCPSSGYRLAPNGRNCQDIDECVTGIHNCSINETCFNIQGGFRCLAFECPENYRRSAATRCERLPCHENREC" \ + "SKLPLRITYYHLSFPTNIQAPAVVFRMGPSSAVPGDSMQLAITGGNEEGFFTTRKVSPHSGVVALTKPVPEPRDLLLTVKMDLSRHGTVSSFVAKLFIFV" \ + "SAEL", + expected_codon_table="1", + expected_exons_se_i=[ + (0, 182), + (182, 288), + (288, 424), + (424, 587), + (587, 647), + (647, 749), + (749, 887), + (887, 1025), + (1025, 1169), + (1169, 1298), + (1298, 1424), + (1424, 1544), + (1544, 1676), + (1676, 1800), + (1800, 2251), + ], + ), + param( + 'NR_173080 - no exon features, ncRNA', + file_name='rna.NR_173080.gbff', + expected_id='NR_173080.1', + 
expected_gene_symbol='LOC122455341', + expected_gene_synonyms=[], + expected_gene_type="ncRNA", + expected_gene_id='122455341', + expected_db_xrefs=["GeneID:122455341"], + expected_cds_se_i=None, + expected_cds_product=None, + expected_cds_protein_id=None, + expected_cds_translation=None, + expected_codon_table=None, + expected_exons_se_i=[], + ), + param( + 'NR_173148 - no exon features, misc_RNA', + file_name='rna.NR_173148.gbff', + expected_id='NR_173148.1', + expected_gene_symbol='FAM246C', + expected_gene_synonyms=[], + expected_gene_type="misc_RNA", + expected_gene_id='117134596', + expected_db_xrefs=["GeneID:117134596", "HGNC:HGNC:54842"], + expected_cds_se_i=None, + expected_cds_product=None, + expected_cds_protein_id=None, + expected_cds_translation=None, + expected_codon_table=None, + expected_exons_se_i=[], + ), + ]) + def test_seq_record_facade( + self, + test_name, + file_name, + expected_id, + expected_gene_symbol, + expected_gene_synonyms, + expected_gene_type, + expected_gene_id, + expected_db_xrefs, + expected_cds_se_i, + expected_cds_product, + expected_cds_protein_id, + expected_cds_translation, + expected_codon_table, + expected_exons_se_i, + ): + gbff_file = os.path.join(self.test_data_dir, file_name) + seq_record = [sr for sr in SeqIO.parse(gbff_file, 'gb')][0] + self.seq_record_facade = SeqRecordFacade(seq_record) + assert self.seq_record_facade.id == expected_id + assert self.seq_record_facade.gene_symbol == expected_gene_symbol + assert self.seq_record_facade.gene_synonyms == expected_gene_synonyms + assert self.seq_record_facade.gene_type == expected_gene_type + assert self.seq_record_facade.gene_id == expected_gene_id + assert self.seq_record_facade.db_xrefs == expected_db_xrefs + assert self.seq_record_facade.cds_se_i == expected_cds_se_i + assert self.seq_record_facade.cds_product == expected_cds_product + assert self.seq_record_facade.cds_protein_id == expected_cds_protein_id + assert self.seq_record_facade.cds_translation == expected_cds_translation + assert self.seq_record_facade.codon_table == expected_codon_table + assert self.seq_record_facade.exons_se_i == expected_exons_se_i + + @parameterized.expand([ + param("no gene feature", gene_feature_mock={}), + param("gene feature is None", gene_feature_mock={"gene": None}), + param("gene feature is empty", gene_feature_mock={"gene": []}), + param("gene feature has more than one", gene_feature_mock={"gene": [Mock(), Mock()]}), + ]) + def test_validate_gene_feature(self, test_name, gene_feature_mock): + with patch('uta.parsers.seqrecord.SeqRecordFacade.features_by_type', + new_callable=PropertyMock) as mock_features_by_type: + mock_features_by_type.return_value = gene_feature_mock + srf = SeqRecordFacade(seqrecord=Mock()) + with self.assertRaises(SeqRecordFeatureError): + _ = srf.gene_feature + + def test_cds_feature_validation_error(self): + with patch('uta.parsers.seqrecord.SeqRecordFacade.features_by_type', + new_callable=PropertyMock) as mock_features_by_type: + mock_cds_feature = Mock() + mock_cds_feature.qualifiers = {"protein_id": "NP_fake", "translation": "MNBVCXZ"} + mock_features_by_type.return_value = {'CDS': [mock_cds_feature, mock_cds_feature]} + srf = SeqRecordFacade(seqrecord=Mock()) + with self.assertRaises(SeqRecordFeatureError): + _ = srf.cds_feature + + def test_cds_feature(self): + with patch('uta.parsers.seqrecord.SeqRecordFacade.features_by_type', new_callable=PropertyMock) as mock_features_by_type: + # no CDS feature + mock_features_by_type.return_value = {} + srf = 
SeqRecordFacade(seqrecord=Mock()) + self.assertIsNone(srf.cds_feature) + + # one CDS feature + desired_cds_feature = Mock() + desired_cds_feature.qualifiers = {"protein_id": "NP_fake", "translation": "MNBVCXZ"} + mock_features_by_type.return_value = {'CDS': [desired_cds_feature]} + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertIs(srf.cds_feature, desired_cds_feature) + + # more than one CDS feature, but only one is returned + extra_cds_feature = Mock() + extra_cds_feature.qualifiers = {"other_key": "NP_fake", "second_key": "MNBVCXZ"} + mock_features_by_type.return_value = {'CDS': [desired_cds_feature, extra_cds_feature]} + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertIs(srf.cds_feature, desired_cds_feature) + + def test_transl_except(self): + with patch('uta.parsers.seqrecord.SeqRecordFacade.cds_feature', new_callable=PropertyMock) as mock_cds_feature: + # no CDS feature + mock_cds_feature.return_value = None + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertIsNone(srf.transl_except) + + # one CDS feature without transl_except + mock_cds_feature.return_value = Mock(qualifiers={}) + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertIsNone(srf.transl_except) + + # one CDS feature with transl_except + mock_cds_feature.return_value = Mock(qualifiers={'transl_except': ['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)']}) + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertEqual(srf.transl_except, ['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)']) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_uta_tools_eutils.py b/tests/test_uta_tools_eutils.py new file mode 100644 index 0000000..c875a7f --- /dev/null +++ b/tests/test_uta_tools_eutils.py @@ -0,0 +1,47 @@ +import os +import unittest +from unittest.mock import Mock, patch + +from uta import EutilsDownloadError +from uta.tools.eutils import download_from_eutils, NcbiFileFormatEnum + + +class TestEutils(unittest.TestCase): + URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi' + + def setUp(self): + self.output_file = 'test_output.fa' + + def tearDown(self): + if os.path.exists(self.output_file): + os.remove(self.output_file) + + @patch('requests.get') + def test_successful_download(self, mock_get): + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = 'file content' + mock_get.return_value = mock_response + + download_from_eutils('accession', NcbiFileFormatEnum.FASTA, self.output_file) + + mock_get.assert_called_once_with( + self.URL, + params={ + 'db': 'nuccore', + 'id': 'accession', + 'retmode': 'text', + 'rettype': 'fasta' + } + ) + + with open(self.output_file, 'r') as file: + self.assertEqual(file.read(), 'file content') + + @patch('requests.get') + def test_unsuccessful_download(self, mock_get): + mock_response = Mock() + mock_response.status_code = 404 + mock_get.return_value = mock_response + with self.assertRaises(EutilsDownloadError): + download_from_eutils('accession', NcbiFileFormatEnum.FASTA, self.output_file)