diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..409225a --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,25 @@ +name: Continuous Integration + +on: + push: + branches: + - main + pull_request: + branches: + - main + merge_group: + types: + - checks_requested + +jobs: + test: + name: Run tests + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Build image + run: docker build --target uta-test -t uta-test . + - name: Run tests + run: docker run --rm uta-test python -m unittest diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..34d9a05 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +FROM ubuntu:22.04 as uta + +# set python version and define arguments +ARG python_version="3.10" + +# list and install dependencies +ARG dependencies="python${python_version} python3-dev python3-pip rsync git postgresql-client-14 tabix" + +RUN apt-get update && apt-get install -y $dependencies && apt-get clean + +# install pysam, copy code, and run pip install +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python -m pip install --upgrade pip +RUN pip install --upgrade setuptools +RUN pip install pysam + +WORKDIR /opt/repos/uta/ +COPY pyproject.toml ./ +COPY etc ./etc +COPY misc ./misc +COPY sbin ./sbin +COPY src ./src +RUN pip install -e .[dev] + + +# UTA test image +FROM uta as uta-test +RUN DEBIAN_FRONTEND=noninteractive apt-get -yq install postgresql +COPY tests ./tests +RUN pip install -e .[test] +RUN useradd uta-tester +RUN chown -R uta-tester . +USER uta-tester diff --git a/README.md b/README.md index 70c7e92..f15f0d4 100644 --- a/README.md +++ b/README.md @@ -203,8 +203,8 @@ you will not need to install PostgreSQL or any of its dependencies. (code) version used to build the instance. $ psql -h localhost -U anonymous -d uta -c "select * from $uta_v.meta" - - key | value + + key | value ----------------+-------------------------------------------------------------------- schema_version | 1.1 created on | 2015-08-21T10:53:50.666152 @@ -213,7 +213,7 @@ you will not need to install PostgreSQL or any of its dependencies. (4 rows) 6. (Optional) To configure [hgvs](https://github.com/biocommons/hgvs) - to use this local installation, consult the + to use this local installation, consult the [hgvs documentation](https://hgvs.readthedocs.io/en/latest/installation.html#local-installation-of-uta-optional) ### Installing from database dumps @@ -253,6 +253,7 @@ the installation environment.* ## Developer Setup +### Virtual Environment To develop UTA, follow these steps. 1. Set up a virtual environment using your preferred method. @@ -272,3 +273,110 @@ To develop UTA, follow these steps. 4. To run the tests: $ python3 -m unittest + +### Docker + +1. Clone UTA and build docker image: + + $ git clone git@github.com:biocommons/uta.git + $ cd uta + $ docker build -t uta . + +2. Restore a database or load a new one using the instructions [above](#installing-from-database-dumps). + +3. Run container and tests + + $ docker run -it --rm uta bash + +4. Testing + + $ docker build --target uta-test -t uta-test . + $ docker run --rm uta-test python -m unittest + +## UTA update procedure + +Requires docker. + +### 0. 
Setup

+Make directories:
+```
+mkdir -p $(pwd)/ncbi-data
+mkdir -p $(pwd)/output/artifacts
+mkdir -p $(pwd)/output/logs
+```
+
+Set variables:
+```
+export UTA_ETL_OLD_UTA_IMAGE_TAG=uta_20210129b
+export UTA_ETL_OLD_UTA_VERSION=$UTA_ETL_OLD_UTA_IMAGE_TAG
+export UTA_ETL_NEW_UTA_VERSION=uta_20240512
+export UTA_ETL_NCBI_DIR=./ncbi-data
+export UTA_ETL_WORK_DIR=./output/artifacts
+export UTA_ETL_LOG_DIR=./output/logs
+```
+
+Build the UTA image:
+```
+docker build --target uta -t uta-update .
+```
+
+### 1. Download SeqRepo data
+```
+docker compose run seqrepo-pull
+```
+
+Note: pulling the data takes ~30 minutes and requires ~13 GB of disk space.
+Note: a container named `seqrepo` will be left behind.
+
+### 2. Extract and transform data from NCBI
+
+Download files from NCBI, extract them into intermediate files, and load them into UTA and SeqRepo.
+
+See 2A for nuclear transcripts, 2B for mitochondrial transcripts, and 2C for manually aligned (splign-manual) transcripts.
+
+#### 2A. Nuclear transcripts
+```
+docker compose run ncbi-download
+docker compose run uta-extract
+docker compose run seqrepo-load
+docker compose run uta-load
+```
+
+#### 2B. Mitochondrial transcripts
+```
+docker compose -f docker-compose.yml -f misc/mito-transcripts/docker-compose-mito-extract.yml run mito-extract
+docker compose run seqrepo-load
+docker compose run uta-load
+```
+
+#### 2C. Manual splign transcripts
+The splign-manual workflow expects an input txdata.yaml file and splign alignment files. Point the
+environment variable $UTA_SPLIGN_MANUAL_DIR at the directory containing them. These paths must exist:
+- `$UTA_SPLIGN_MANUAL_DIR/txdata.yaml`
+- `$UTA_SPLIGN_MANUAL_DIR/alignments/*.splign`
+
+[txdata.yaml](loading/data/splign-manual/txdata.yaml) defines the transcripts and their metadata; the [alignments dir](loading/data/splign-manual/alignments) contains the splign alignments.
+To run the workflow:
+```
+export UTA_SPLIGN_MANUAL_DIR=$(pwd)/loading/data/splign-manual/
+docker compose run splign-manual
+```
+
+At this point UTA has been updated and the database has been dumped to a pgd file in `$UTA_ETL_WORK_DIR`. SeqRepo has been updated in place.
+
+
+## Migrations
+UTA uses alembic to manage database migrations. To auto-generate a migration:
+```
+alembic -c etc/alembic.ini revision --autogenerate -m "description of the migration"
+```
+This creates a migration script in the versions directory under `src/alembic`.
+Review and adjust the generated upgrade and downgrade functions. To apply the migration:
+```
+alembic -c etc/alembic.ini upgrade head
+```
+To reverse a migration, use `downgrade` with the number of steps to reverse.
For example, to reverse the last: +``` +alembic -c etc/alembic.ini downgrade -1 +``` diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..84b043a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,62 @@ +# docker compose file for the UTA update procedure + +version: '3' + +services: + seqrepo-pull: + user: root + image: uta-update + command: sbin/seqrepo-pull + volumes: + - seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo + network_mode: host + ncbi-download: + image: uta-update + command: sbin/ncbi-download etc/ncbi-files.txt /ncbi-dir + volumes: + - .:/opt/repos/uta + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + working_dir: /opt/repos/uta + network_mode: host + uta-extract: + image: uta-update + command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs + volumes: + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_WORK_DIR}:/uta-extract/work + - ${UTA_ETL_LOG_DIR}:/uta-extract/logs + working_dir: /opt/repos/uta + network_mode: host + seqrepo-load: + image: uta-update + command: sbin/seqrepo-load /seqrepo-load/work /seqrepo-load/logs + volumes: + - seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo + - ${UTA_ETL_WORK_DIR}:/seqrepo-load/work + - ${UTA_ETL_LOG_DIR}:/seqrepo-load/logs + working_dir: /opt/repos/uta + network_mode: host + uta: + container_name: uta + image: biocommons/uta:${UTA_ETL_OLD_UTA_IMAGE_TAG} + environment: + - POSTGRES_HOST_AUTH_METHOD=trust + healthcheck: + test: psql -h localhost -U anonymous -d uta -c "select * from ${UTA_ETL_OLD_UTA_IMAGE_TAG}.meta" + interval: 10s + retries: 80 + network_mode: host + uta-load: + image: uta-update + command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} ${UTA_ETL_NEW_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs + depends_on: + uta: + condition: service_healthy + volumes: + - seqrepo-volume:/biocommons/dl.biocommons.org/seqrepo + - ${UTA_ETL_WORK_DIR}:/uta-load/work + - ${UTA_ETL_LOG_DIR}:/uta-load/logs + network_mode: host + +volumes: + seqrepo-volume: diff --git a/etc/alembic.ini b/etc/alembic.ini new file mode 100644 index 0000000..6777380 --- /dev/null +++ b/etc/alembic.ini @@ -0,0 +1,116 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = src/alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. 
+# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = postgresql://uta_admin:@localhost/uta + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/etc/global.conf b/etc/global.conf index c929b28..0d3cfcc 100644 --- a/etc/global.conf +++ b/etc/global.conf @@ -16,7 +16,7 @@ aligner = utaaa fasta_directories = aux/sequences2 aux/sequences -seqrepo = /usr/local/share/seqrepo/latest +seqrepo = /biocommons/dl.biocommons.org/seqrepo/master #data/manual #data/bic/sequences.fasta.bgz diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt new file mode 100644 index 0000000..0a70efc --- /dev/null +++ b/etc/ncbi-files.txt @@ -0,0 +1,48 @@ +# This configuration file contains the paths to the NCBI data files needed by the SeqRepo/UTA load pipelines. 
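+# Paths are relative to the NCBI FTP root (ftp.ncbi.nlm.nih.gov) and may use shell-style wildcards;
+# after download they are mirrored under the local NCBI data directory in the layout sketched below: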
+# +# ├── gene +# │ └── DATA +# │ ├── GENE_INFO +# │ │ └── Mammalia +# │ │ └── Homo_sapiens.gene_info.gz +# │ └── gene2refseq.gz +# ├── genomes +# │ └── refseq +# │ └── vertebrate_mammalian +# │ └── Homo_sapiens +# │ └── all_assembly_versions +# │ └── GCF_000001405.25_GRCh37.p13 +# │ ├── GCF_000001405.25_GRCh37.p13_genomic.fna.gz +# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz +# └── refseq +# └── H_sapiens +# └── mRNA_Prot +# ├── human.1.protein.faa.gz +# ├── human.1.rna.fna.gz +# ├── human.1.rna.gbff.gz +# ├── ... + +## Gene Data +gene/DATA/gene2refseq.gz +gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz + +## RefSeq Data +refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz +refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz +refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz + +## Genome build and alignment data +# Build 37 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz + +# Build 38 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz + +# T2Tv2.0 +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz +genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz diff --git a/etc/scripts/create-new-schema.sh b/etc/scripts/create-new-schema.sh new file mode 100755 index 0000000..d13100a --- /dev/null +++ b/etc/scripts/create-new-schema.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +if [ "$#" -lt 2 ] +then + echo "error: too few arguments, you provided $#, 1 required" + echo "usage: create-new-schema.sh " + exit 1 +fi + +set -euxo pipefail + +source_uta_v=$1 +dest_uta_v=$2 +dumps_dir=/temp/dumps +mkdir -p $dumps_dir + +# dump current version +pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \ + gzip -c > $dumps_dir/"$source_uta_v".pgd.gz + +# create new schema +gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \ + sbin/pg-dump-schema-rename "$source_uta_v" "$dest_uta_v" | \ + psql -U uta_admin -h localhost -d uta -aeE diff --git a/etc/scripts/delete-schema.sh b/etc/scripts/delete-schema.sh new file mode 100755 index 0000000..e64eb3b --- /dev/null +++ b/etc/scripts/delete-schema.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +if [ "$#" -lt 1 ] +then + echo "error: too few arguments, you provided $#, 1 required" + echo "usage: delete-schema.sh " + exit 1 +fi + +set -euxo pipefail + +source_uta_v=$1 + +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $source_uta_v CASCADE" \ No newline at end of file diff --git a/etc/uta_dev@localhost.conf b/etc/uta_dev@localhost.conf index 2d1e508..bfb5c66 100644 --- 
a/etc/uta_dev@localhost.conf +++ b/etc/uta_dev@localhost.conf @@ -1,4 +1,4 @@ [uta] hostport = localhost cluster = %(user)s@%(hostport)s -database = uta_dev +database = uta diff --git a/etc/uta_dev@uic.com b/etc/uta_dev@uic.com index e61a48b..cbf2e77 100644 --- a/etc/uta_dev@uic.com +++ b/etc/uta_dev@uic.com @@ -1,4 +1,4 @@ [uta] -hostport = uta.invitae.com +hostport = uta-int-02.cj7o8ef9mt4v.us-east-1.rds.amazonaws.com user = uta_admin -database = uta_dev +database = uta diff --git a/loading/data/splign-manual/README.md b/loading/data/splign-manual/README.md index 3a4d62b..f364281 100644 --- a/loading/data/splign-manual/README.md +++ b/loading/data/splign-manual/README.md @@ -50,7 +50,7 @@ For a given RefSeq transcript (e.g., NM_000996.3), do the following: - Click on the gene id to go to the gene page (e.g., `6165`) - N.B. Strand is inferred from the orientation of aligned exons. -1. Enter the gene and CDS info in txdata.yaml +1. Enter the gene, geneID, and CDS info in txdata.yaml 1. Get the chromosome and coordinates from the gene page - From the "Genomic Context" section, note the chromosomal diff --git a/loading/data/splign-manual/txdata.yaml b/loading/data/splign-manual/txdata.yaml index 1e7498d..64ef14c 100644 --- a/loading/data/splign-manual/txdata.yaml +++ b/loading/data/splign-manual/txdata.yaml @@ -7,6 +7,7 @@ NM_000000.0: # transcript_accession cds: # CDS start and end, 1-based inclusive hgnc: # HGNC *symbol* genomic_region: # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/259291 + gene_id: # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/259291 NM_001025190.1: @@ -14,67 +15,80 @@ NM_001025190.1: cds: 1,3162 hgnc: MSLNL genomic_region: NC_000016.9 (819428..831996, complement) + gene_id: 401827 NM_006060.6: cds: 222,1781 hgnc: IKZF1 genomic_region: NC_000007.13 (50344378..50367358) , (50444231..50472799) + gene_id: 10320 NM_000769.4: cds: 26,1498 hgnc: CYP2C19 genomic_region: NC_000010.10 (96522463..96612671) + gene_id: 1557 NM_001807.4: cds: 17,2287 hgnc: CEL genomic_region: NC_000009.11 (135936741..135947250) + gene_id: 1056 NM_002116.7: cds: 85,1182 hgnc: HLA-A genomic_region: NC_000006.11 (29910247..29913661) + gene_id: 3105 NM_002122.3: cds: 54,821 hgnc: HLA-DQA1 genomic_region: NC_000006.11 (32605169..32612152) + gene_id: 3117 NM_006060.5: cds: 269,1828 hgnc: IKZF1 genomic_region: NC_000007.13 (50344378..50367358) , (50444231..50472799) + gene_id: 10320 NM_000996.3: cds: 65,397 hgnc: RPL35A genomic_region: NC_000003.11 (197677023..197682722) + gene_id: 6165 NM_001261826.2: cds: 293,3940 hgnc: AP3D1 genomic_region: NC_000019.9 (2100987..2151556, complement) + gene_id: 8943 NM_001355436.1: cds: 144,7130 hgnc: SPTB genomic_region: NC_000014.8 (65213001..65346604, complement) + gene_id: 6710 NM_001428.4: cds: 117,1421 hgnc: ENO1 genomic_region: NC_000001.10 (8921059..8939151, complement) + gene_id: 2023 NM_032589.2: - genomic_region: NM_032589.2 was permanently suppressed because currently there is support for the transcript but not for the protein. + # NM_032589.2 was permanently suppressed because currently there is support for the transcript but not for the protein. cds: 150,425 hgnc: DSCR8 genomic_region: NC_000021.8 (39493545..39528605) + gene_id: 84677 NM_176886.1: cds: 1,900 hgnc: TAS2R45 genomic_region: NW_003571050.1 (327525..328424, complement) + gene_id: 259291 @@ -90,6 +104,7 @@ NM_002457.4: cds: 28,15897 hgnc: MUC2 genomic_region: NC_000011.9 (1074875..1104417) + gene_id: 4583 # Case 2: overall low coverage and/or identity. 
@@ -99,6 +114,7 @@ NM_001277444.1: cds: 76,3411 hgnc: NBPF9 genomic_region: NC_000001.10 (144811743..144830407) + gene_id: 400818 # Case 3: high identity alignments but with large gaps. These @@ -110,18 +126,21 @@ NM_031421.4: cds: 131,2149 hgnc: TTC25 genomic_region: NC_000017.10 (40086888..40117669) + gene_id: 83538 NM_001349168.1: # Splign alignment has 159 nt unaligned exonic sequence. This is unusable. -Reece 2020-04-08 cds: 239,4762 hgnc: DCAF1 genomic_region: NC_000003.11 (51433298..51534018, complement) + gene_id: 9730 NM_001733.5: # Splign alignment has 232 nt unaligned exonic sequence. This is unusable. -Reece 2020-04-08 cds: 220,2337 hgnc: C1R genomic_region: NC_000012.11 (7241205..7245043, complement) , (7187513..7189412, complement) + gene_id: 715 # Transcript, gene, and genomic alignment info # cds start,end (in human, 1-based coordinates) and hgnc symbol @@ -132,53 +151,64 @@ NM_001038633.3: # transcript_accession cds: 893,1684 # CDS start and end, 1-based inclusive hgnc: RSPO1 # HGNC *symbol* genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/284654 + gene_id: 284654 NM_005363.3: # transcript_accession cds: 208,1152 # CDS start and end, 1-based inclusive hgnc: MAGEA6 # HGNC *symbol* genomic_region: NC_000023.10 (151867245..151870814) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/4105 + gene_id: 4105 NM_006561.3: # transcript_accession cds: 161,1726 # CDS start and end, 1-based inclusive hgnc: CELF2 # HGNC *symbol* genomic_region: NC_000010.10 (10838851..11378674) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/10659 + gene_id: 10659 NM_001242908.1: # transcript_accession cds: 714,1505 # CDS start and end, 1-based inclusive hgnc: RSPO1 # HGNC *symbol* genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/284654 + gene_id: 284654 NM_001242909.1: # transcript_accession cds: 474,1184 # CDS start and end, 1-based inclusive hgnc: RSPO1 # HGNC *symbol* genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/259291 + gene_id: 284654 NM_001242910.1: # transcript_accession cds: 714,1316 # CDS start and end, 1-based inclusive hgnc: RSPO1 # HGNC *symbol* genomic_region: NC_000001.10 (38076821..38100595, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/284654 + gene_id: 284654 NM_001012709.1: # transcript_accession cds: 46,912 # CDS start and end, 1-based inclusive hgnc: KRTAP5-4 # HGNC *symbol* genomic_region: NC_000011.9 (1642188..1643368, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/387267 + gene_id: 387267 NM_001123068.1: # transcript_accession cds: 34,528 # CDS start and end, 1-based inclusive hgnc: COAS-2 # HGNC *symbol* genomic_region: NC_000001.10 (143767144..143767881, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/644591 + gene_id: 644591 NM_130797.2: # transcript_accession cds: 130,2727 # CDS start and end, 1-based inclusive hgnc: DPPX # HGNC *symbol* genomic_region: NC_000007.13 (153584419..154264025) , (154400205..154685995) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/1804 + gene_id: 1804 NM_033060.2: # transcript_accession cds: 42,425 # CDS start and end, 1-based inclusive hgnc: KRTAP4-1 # HGNC *symbol* genomic_region: NC_000017.10 (39340352..39341147, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/1804 + gene_id: 85285 NM_033060.3: # transcript_accession cds: 
58,441 # CDS start and end, 1-based inclusive hgnc: KRTAP4-1 # HGNC *symbol* genomic_region: NC_000017.10 (39340352..39341163, complement) # from gene page e.g., https://www.ncbi.nlm.nih.gov/gene/1804 + gene_id: 85285 diff --git a/misc/export_mappings.py b/misc/export_mappings.py new file mode 100644 index 0000000..ded3ddb --- /dev/null +++ b/misc/export_mappings.py @@ -0,0 +1,184 @@ +import argparse +import logging +import re +from bioutils.assemblies import make_name_ac_map +from contextlib import ExitStack +from dataclasses import dataclass, field + +import psycopg2 +import six + +import uta + +logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger("export_mappings") + + +UTA_SCHEMA_VERSION_SQL = """ +select value as schema_version +from meta +where key='schema_version'; +""" + +ASSOCIATED_ACCESSIONS_SQL = """ +select aa.tx_ac, aa.pro_ac +from associated_accessions as aa +where aa.tx_ac='{}'; +""" + +# get_tx_mapping_options +TX_MAPPING_OPTIONS_SQL = """ +select distinct tx_ac,alt_ac,alt_aln_method +from tx_exon_aln_v where tx_ac='{}' and exon_aln_id is not NULL +order by alt_ac,alt_aln_method; +""" + +# get_tx_info +TX_V1_INFO_SQL = """ +select hgnc, cds_start_i, cds_end_i, tx_ac, alt_ac, alt_aln_method +from transcript T +join exon_set ES on T.ac=ES.tx_ac +where tx_ac='{}' and alt_ac='{}' and alt_aln_method='{}'; +""" + +TX_V2_INFO_SQL = """ +select G.hgnc, T.cds_start_i, T.cds_end_i, ES.tx_ac, ES.alt_ac, ES.alt_aln_method +from gene G +join transcript T on G.gene_id=T.gene_id +join exon_set ES on T.ac=ES.tx_ac +where tx_ac='{}' and alt_ac='{}' and alt_aln_method='{}'; +""" + +EXON_SET_SQL = """ +select * +from tx_exon_aln_v +where tx_ac='{}' and alt_ac='{}' and alt_aln_method='{}' +order by alt_start_i; +""" + +TX_INDENTITY_SQL = """ +select distinct(tx_ac), alt_ac, alt_aln_method, cds_start_i, cds_end_i, lengths, hgnc +from tx_def_summary_v +where tx_ac='{}'; +""" + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Export transcript alignments for a given genome build from UTA database." 
+ ) + parser.add_argument("transcripts_file", type=str) + parser.add_argument("--genome-build", type=str, default="GRCh37.p13") + parser.add_argument("--db-url", default="postgresql://uta_admin@localhost/uta") + parser.add_argument("--schema-name", default="uta_20210129") + return parser.parse_args() + + +def _get_cursor(con, schema_name): + cur = con.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor) + cur.execute(f"set search_path = {schema_name}") + return cur + + +def _get_rows(cur, sql): + cur.execute(sql) + return cur.fetchall() + + +def _get_chr_ac_map(genome_build): + filtered_chr_ac_map = {} + + def_name_ac_map = make_name_ac_map(assy_name=genome_build, primary_only=True) + for chr_name in list(map(str, range(1, 23))) + ["X", "Y"]: + filtered_chr_ac_map[chr_name] = def_name_ac_map.get(chr_name) + + return filtered_chr_ac_map + + +def main(transcripts_file, genome_build, db_url, schema_name): + logger.info(f"connecting to {db_url}") + session = uta.connect(db_url) + + con = session.bind.pool.connect() + cur = _get_cursor(con, schema_name) + + chr_to_acc_dict = _get_chr_ac_map(genome_build=genome_build) + + schema_version = _get_rows(cur, UTA_SCHEMA_VERSION_SQL)[0].schema_version + if schema_version == "1.1": + TX_INFO_SQL = TX_V1_INFO_SQL + else: + TX_INFO_SQL = TX_V2_INFO_SQL + + # read in transcripts + transcripts = [] + with open(transcripts_file, "r") as f: + for line in f: + if line.startswith("accession"): + continue + accession, chrom = line.rstrip("\r\n").split("\t") + transcripts.append((accession, chrom)) + + # setup context managers for file writers + with ExitStack() as stack: + # associated acccessions + assocacs_fh = stack.enter_context( + open(f"{schema_name}_associated_accessions.tsv", "w") + ) + assocacs_fh.write("tx_ac\tpro_ac\n") + + # transcript info + txinfo_fh = stack.enter_context(open(f"{schema_name}_transcript_info.tsv", "w")) + txinfo_fh.write("hgnc\ttx_ac\tcds_start_i\tcds_end_i\talt_ac\talt_aln_method\n") + + # exon sets + exons_fh = stack.enter_context(open(f"{schema_name}_exon_sets.tsv", "w")) + exons_fh.write("hgnc\ttx_ac\talt_ac\talt_aln_method\talt_strand\tords\ttx_ac_se_i\talt_ac_se_i\tcigars\n") + + # transcript identity + tx_identity_fh = stack.enter_context( + open(f"{schema_name}_transcript_identity.tsv", "w") + ) + + logger.info("querying database for transcript mappings...") + for i, (tx_ac, chrom) in enumerate(transcripts): + assocacs_rows = _get_rows(cur, ASSOCIATED_ACCESSIONS_SQL.format(tx_ac)) + for row in assocacs_rows: + assocacs_fh.write(f"{row.tx_ac}\t{row.pro_ac}\n") + + alt_ac = chr_to_acc_dict.get(chrom) + for alt_aln_method in ("splign", "splign-manual"): + txinfo_rows = _get_rows( + cur, TX_INFO_SQL.format(tx_ac, alt_ac, alt_aln_method) + ) + if txinfo_rows: + for row in txinfo_rows: + txinfo_fh.write( + f"{row.hgnc}\t{row.tx_ac}\t{row.cds_start_i}\t{row.cds_end_i}\t{row.alt_ac}\t{row.alt_aln_method}\n" + ) + exons_rows = _get_rows( + cur, EXON_SET_SQL.format(tx_ac, alt_ac, alt_aln_method) + ) + if exons_rows: + hgnc = exons_rows[0].hgnc + tx_ac = exons_rows[0].tx_ac + alt_ac = exons_rows[0].alt_ac + alt_aln_method = exons_rows[0].alt_aln_method + alt_strand = exons_rows[0].alt_strand + ords, tx_ac_se_i, alt_ac_se_i, cigars = [], [], [], [] + for row in sorted(exons_rows, key=lambda x: x.ord): + ords.append(str(row.ord)) + tx_ac_se_i.append(f"{row.tx_start_i},{row.tx_end_i}") + alt_ac_se_i.append(f"{row.alt_start_i},{row.alt_end_i}") + cigars.append(row.cigar) + exons_fh.write( + 
f"{hgnc}\t{tx_ac}\t{alt_ac}\t{alt_aln_method}\t{alt_strand}\t{';'.join(ords)}\t{';'.join(tx_ac_se_i)}\t{';'.join(alt_ac_se_i)}\t{';'.join(cigars)}\n" + ) + + if i % 2500 == 0 and i > 0: + logger.info(f"processed {i} transcripts") + + +if __name__ == '__main__': + arguments = parse_args() + main(arguments.transcripts_file, arguments.genome_build, arguments.db_url, arguments.schema_name) diff --git a/misc/gene-update/backfill_gene_id.py b/misc/gene-update/backfill_gene_id.py new file mode 100644 index 0000000..350ef46 --- /dev/null +++ b/misc/gene-update/backfill_gene_id.py @@ -0,0 +1,117 @@ +import argparse +import logging + +from datetime import datetime +from sqlalchemy.orm import Session +from sqlalchemy import text + +import uta +from uta.models import Gene, Transcript +from uta.tools.file_utils import open_file + + +logger = None +n = 50000 + + +def backfill_gene(uta_session: Session, gene_update_file: str) -> None: + logger.info("Dropping gene table contents") + uta_session.execute(text("DELETE FROM uta.gene;")) + uta_session.commit() + + logger.info(f"Back filling gene table from {gene_update_file}") + now_ts = datetime.now() + i = 0 + new_genes = [] + with open_file(gene_update_file) as f: + for line in f: + if line.startswith("gene_id"): + continue + + if i % n == 0: + if i > 0: + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Processing chunk {int(i/n) + 1}") + new_genes = [] + + gene_id, hgnc, maploc, desc, summary, aliases, added = line.rstrip("\r\n").split("\t") + # set timestamp from file string, if empty set to now. + if added == "": + added_ts = now_ts + else: + added_ts = datetime.strptime(added, "%Y-%m-%d %H:%M:%S.%f") + + # clean up aliases + aliases = aliases.replace("{", "").replace("}", "") + if aliases == "-": + aliases = None + + gene = Gene( + gene_id=gene_id, + hgnc=hgnc, + maploc=maploc if maploc else None, + descr=desc if desc else None, + summary=summary if desc else None, + aliases=aliases if aliases else None, + added=added_ts, + ) + i += 1 + new_genes.append(gene) + + logger.info(f"Bulk inserting {len(new_genes)} genes") + uta_session.bulk_save_objects(new_genes) + uta_session.commit() + logger.info(f"Inserted {i} total genes") + + +def backfill_transcript(uta_session: Session, transcript_update_file: str) -> None: + logger.info("Backfilling gene_id in transcript table") + tx_ac_to_gene_id = {} + + logger.info(f"Reading transcript to gene id mappings from {transcript_update_file}") + with open_file(transcript_update_file) as f: + for line in f: + if line.startswith("origin"): + continue + _, tx_ac, gene_id, _ = line.rstrip("\r\n").split("\t") + tx_ac_to_gene_id[tx_ac] = gene_id + logger.info(f" - {len(tx_ac_to_gene_id)} mappings read") + + i = 0 + txs = [] + for tx_ac, gene_id in tx_ac_to_gene_id.items(): + if i % n == 0: + if i > 0: + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + + logger.info(f"Processing chunk {int(i/n) + 1}") + txs = [] + + tx = uta_session.query(Transcript).filter(Transcript.ac == tx_ac).one() + tx.gene_id = gene_id + txs.append(tx) + i += 1 + + logger.info(f"Updating {len(txs)} transcripts") + uta_session.flush() + uta_session.commit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Backfill gene_id in gene and transcript tables") + parser.add_argument("db_url", help="URL of the UTA database") + parser.add_argument("gene_update_file", type=str, help="File containing gene_id updates for gene 
table") + parser.add_argument("transcript_update_file", type=str, help="File containing gene_id updates for transcript table") + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + logger = logging.getLogger("backfill_gene_id") + + session = uta.connect(args.db_url) + + backfill_gene(session, args.gene_update_file) + backfill_transcript(session, args.transcript_update_file) + session.close() diff --git a/misc/gene-update/docker-compose-gene-update.yml b/misc/gene-update/docker-compose-gene-update.yml new file mode 100644 index 0000000..2245f44 --- /dev/null +++ b/misc/gene-update/docker-compose-gene-update.yml @@ -0,0 +1,17 @@ +# docker compose file for the NCBI gene_id update and backfill procedure + +version: '3' + +services: + uta-gene-update: + image: uta-update + command: misc/gene-update/upgrade-uta-schema.sh ${UTA_ETL_NEW_UTA_VERSION} + depends_on: + uta: + condition: service_healthy + volumes: + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + - ${UTA_ETL_WORK_DIR}:/uta-gene-update/work + - ${UTA_ETL_LOG_DIR}:/uta-gene-update/logs + working_dir: /opt/repos/uta + network_mode: host diff --git a/misc/gene-update/gene_update.tsv.gz b/misc/gene-update/gene_update.tsv.gz new file mode 100644 index 0000000..9c44bab Binary files /dev/null and b/misc/gene-update/gene_update.tsv.gz differ diff --git a/misc/gene-update/transcript_update.tsv.gz b/misc/gene-update/transcript_update.tsv.gz new file mode 100644 index 0000000..b5d5da5 Binary files /dev/null and b/misc/gene-update/transcript_update.tsv.gz differ diff --git a/misc/gene-update/upgrade-uta-schema.sh b/misc/gene-update/upgrade-uta-schema.sh new file mode 100755 index 0000000..ba66c0b --- /dev/null +++ b/misc/gene-update/upgrade-uta-schema.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# This script is used to upgrade older UTA schemas (specifically uta_20210129b) to a newer version. +# Part of this upgrade is to introduce gene_id to the gene and transcript tables. The columns are +# added with a Alembic migration. Then a data migration to back fill the new columns. Then a second +# Alembic migration to add the constraints to the columns and update primary and foreign keys. + +if [ "$#" -lt 1 ] +then + echo "error: too few arguments, you provided $#, 1 required" + echo "usage: upgrade-uta-schema.sh " + exit 1 +fi + +set -euxo pipefail + +source_uta_v="uta_20210129b" +working_uta_v="uta" +dest_uta_v=$1 +tmp_dumps_dir="/tmp/dumps" +mkdir -p $tmp_dumps_dir + +## setup working uta schema +# delete schema if exists +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $working_uta_v CASCADE;" + +# dump source version +pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \ + gzip -c > $tmp_dumps_dir/"$source_uta_v".pgd.gz + +# create new schema +gzip -cdq $tmp_dumps_dir/"$source_uta_v".pgd.gz | \ + sbin/pg-dump-schema-rename "$source_uta_v" "$working_uta_v" | \ + sbin/pg-dump-schema-rename "uta_1_1" "$working_uta_v" | \ + psql -U uta_admin -h localhost -d uta -aeE + +## upgrade working uta schema +# set initial Alembic migration so it is not ran. 
+alembic -c etc/alembic.ini stamp edadb97f6502 + +# run Alembic migration to add gene_id to gene and transcript tables +alembic -c etc/alembic.ini upgrade 595a586e6de7 + +# run data migration to back fill gene_id +python misc/gene-update/backfill_gene_id.py \ + postgresql://uta_admin:@localhost/uta \ + misc/gene-update/gene_update.tsv.gz \ + misc/gene-update/transcript_update.tsv.gz + +# run Alembic migrations to add constraints and update existing views +alembic -c etc/alembic.ini upgrade head + +## Rename schema to destination schema name +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $dest_uta_v CASCADE;" +psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO $dest_uta_v"; +pg_dump -h localhost -U uta_admin -d uta -n "$dest_uta_v" | \ + gzip -c > "/uta-gene-update/work/$dest_uta_v.pgd.gz" diff --git a/misc/generate_alignment_metrics.py b/misc/generate_alignment_metrics.py new file mode 100644 index 0000000..d0e0e7f --- /dev/null +++ b/misc/generate_alignment_metrics.py @@ -0,0 +1,300 @@ +""" +To determine the quality of an alignment in UTA this script will compute metrics that can be used to evaluate +the quality of the alignment. + +The metrics are: + seq_length: max(exon.ends_i) + exon_count: count of exons linked to transcript (from genbank file) + aligned_exon_count: count of blocks from GFF file + exon_structure_mismatch: True if exon_count != aligned_exon_count + matches: count of matching bases between chromosome and transcript within alignment bounds + mismatches: count of mismatched bases between chromosome and transcript within alignment bounds + gap_count: count of gaps (indels) between chromosome and transcript sequenceswithin alignment bounds + aln_length: total number of alignment blocks, includes counts of indel positions + identity_gapped: matches / aln_length ## not gap compressed identity calculation + identity_ungapped: matches / (matches + mismatches) + coverage: (matches + mismatches + deletions) / seq_length + +Example: + + 1 11 21 31 41 + Chromo+: 1 CCAGTGTGGC CGATACCCCA GGTTGGC-AC GCATCGTTGC CTTGGTAAGC 49 + |||||||||| |||| ||| || || || |||||||||| |||||||||| + Refseq+: 1 CCAGTGTGGC CGATGCCC-- -GT--GCTAC GCATCGTTGC CTTGGTAAGC 45 + + seq_length: 45 + exon_count: 1 + aligned_exon_count: 1 + matches: 43 + mismatches: 1 + gap_count: 3 + aln_length: 43 matches + 1 mismatch + 3bp insertion + 2bp insertion + 1bp deletion = 50 + identity_gapped: 43 / 50 = 0.86 + identity_ungapped: 43 / (43 + 1) = 0.9772 + coverage: (43 + 1 + 1) / 45 = 1.0 + +Usage: + python generate_alignment_metrics.py --db-url --schema-name +""" + +import argparse +import logging +import re +from dataclasses import dataclass, field + +import psycopg2 +import six + +import uta + + +@dataclass +class CigarAln: + op: str + length: int + + +@dataclass +class ExonAln: + tx_start_i: int + tx_end_i: int + alt_start_i: int + alt_end_i: int + cigar: str + cigar_alns: list[CigarAln] = field(default_factory=list) + + +@dataclass +class TxAln: + hgnc: str + tx_ac: str + seq_length: int + exon_count: int + alt_ac: str + alt_aln_method: str + alt_strand: int + aligned_exon_count: int + exon_alignments: list[ExonAln] = field(default_factory=list) + + @staticmethod + def metrics_header(): + return "{}\n".format("\t".join([ + "hgnc", + "tx_ac", + "seq_length", + "exon_count", + "alt_ac", + "alt_aln_method", + "alt_strand", + "aligned_exon_count", + "exon_structure_mismatch", + "matches_bps", + "mismatches_bps", + "gap_count", + "deletions_bps", + "aln_length", + "identity_gap", + 
"identity_ungap", + "coverage", + ])) + + def to_metric_output_row(self): + return "{}\n".format("\t".join(map(str, [ + self.hgnc, + self.tx_ac, + self.seq_length, + self.exon_count, + self.alt_ac, + self.alt_aln_method, + self.alt_strand, + self.aligned_exon_count, + not self.exon_count == self.aligned_exon_count, + self.matches(), + self.mismatches(), + self.gap_count(), + self.deletions(), + self.aln_length(), + self.identity_gap(), + self.identity_ungap(), + self.coverage(), + ]))) + + def matches(self): + matches = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + if cigar_aln.op == MATCH: + matches += cigar_aln.length + return matches + + def mismatches(self): + mismatches = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + if cigar_aln.op == MM: + mismatches += cigar_aln.length + return mismatches + + def deletions(self): + deletions = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + if cigar_aln.op == DEL: + deletions += cigar_aln.length + return deletions + + def gap_count(self): + gaps = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + if cigar_aln.op == DEL or cigar_aln.op == INS: + gaps += 1 + return gaps + + def aln_length(self): + length = 0 + for exon_aln in self.exon_alignments: + for cigar_aln in exon_aln.cigar_alns: + length += cigar_aln.length + return length + + def identity_gap(self): + return f"{self.matches() / float(self.aln_length()):.6f}" + + def identity_ungap(self): + return f"{self.matches() / float(self.matches() + self.mismatches()):.6f}" + + def coverage(self): + return f"{(self.matches() + self.mismatches() + self.deletions()) / float(self.seq_length):.6f}" + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("alignment_metrics") + +p = re.compile("\d+[=DIX]") +MATCH = "=" +INS = "I" +DEL = "D" +MM = "X" + +TX_EXON_SET_SUMMARY_ALL_BUILD37_SQL = """ +select * +from tx_exon_set_summary_mv as mv +where mv.alt_aln_method in ('splign', 'splign-manual') and mv.tx_ac ~ 'N[MR]_*' and mv.tx_ac !~ '/' + and mv.alt_ac in ('NC_000001.10', 'NC_000002.11', 'NC_000003.11', 'NC_000004.11', 'NC_000005.9', 'NC_000006.11', + 'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10', 'NC_000011.9', 'NC_000012.11', + 'NC_000013.10', 'NC_000014.8', 'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9', + 'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10', 'NC_000023.10', 'NC_000024.9' + ); +""" + +TX_EXON_SET_SUMMARY_SQL = """ +select mv.ends_i[mv.n_exons] as tx_length, * +from tx_exon_set_summary_mv as mv +where mv.tx_ac='{tx_ac}' and mv.alt_ac='{alt_ac}' and mv.alt_aln_method='{alt_aln_method}'; +""" + +TX_EXON_ALN_SQL = """ +select v.hgnc, v.tx_ac, v.alt_ac, v.alt_aln_method, v.alt_strand, v.ord, v.tx_start_i, v.tx_end_i, + v.alt_start_i, v.alt_end_i, v.cigar +from tx_exon_aln_v as v +where tx_ac='{tx_ac}' and alt_ac='{alt_ac}' and alt_aln_method='{alt_aln_method}' +order by ord; +""" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Generate alignment metrics for transcript alignments") + parser.add_argument("output_file", type=str) + parser.add_argument("--db-url", default="postgresql://uta_admin@localhost/uta") + parser.add_argument("--schema-name", default="uta_20210129b") + return parser.parse_args() + + +def _get_cursor(con, schema_name): + cur = con.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor) + cur.execute(f"set search_path = {schema_name}") + return cur + + 
+def _get_alignment(cur, tx_ac, alt_ac, alt_aln_method, aligned_exon_count): + # get transcript exon count + cur.execute( + TX_EXON_SET_SUMMARY_SQL.format(tx_ac=tx_ac, alt_ac=tx_ac, alt_aln_method="transcript") + ) + row = cur.fetchone() + + if row is None: + logger.warn(f"no transcript alignment found for {tx_ac} {alt_ac} {alt_aln_method}") + tx_exon_count = None + else: + tx_exon_count = row.n_exons + tx_seq_length = row.tx_length + + cur.execute( + TX_EXON_ALN_SQL.format( + tx_ac=tx_ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method + ) + ) + rows = cur.fetchall() + tx_aln = None + + for row in rows: + if tx_aln is None: + tx_aln = TxAln( + hgnc=row.hgnc, + tx_ac=row.tx_ac, + seq_length=tx_seq_length, + exon_count=tx_exon_count, + alt_ac=row.alt_ac, + alt_aln_method=row.alt_aln_method, + alt_strand=row.alt_strand, + aligned_exon_count=aligned_exon_count, + ) + exon_aln = ExonAln( + tx_start_i=row.tx_start_i, + tx_end_i=row.tx_end_i, + alt_start_i=row.alt_start_i, + alt_end_i=row.alt_end_i, + cigar=row.cigar, + ) + for match in p.finditer(exon_aln.cigar): + cigar_aln = CigarAln(op=match.group()[-1], length=int(match.group()[:-1])) + exon_aln.cigar_alns.append(cigar_aln) + tx_aln.exon_alignments.append(exon_aln) + return tx_aln + + +def main(db_url, schema_name, output_file): + logger.info(f"connecting to {db_url}") + session = uta.connect(db_url) + + con = session.bind.pool.connect() + cur = _get_cursor(con, schema_name) + + # get tx_ac/alt_ac pairs + tx_alt_ac_pairs = [] + cur.execute(TX_EXON_SET_SUMMARY_ALL_BUILD37_SQL) + rows = cur.fetchall() + + for row in rows: + tx_alt_ac_pairs.append((row.tx_ac, row.alt_ac, row.alt_aln_method, row.n_exons)) + + logger.info(f"writing metrics to {output_file} for {len(tx_alt_ac_pairs)} transcript alignments") + with open(output_file, "w") as f_out: + f_out.write(TxAln.metrics_header()) + i = 0 + for tx_ac, alt_ac, alt_aln_method, aligned_exon_count in tx_alt_ac_pairs: + tx_aln = _get_alignment( + cur, tx_ac, alt_ac, alt_aln_method, aligned_exon_count + ) + f_out.write(tx_aln.to_metric_output_row()) + f_out.flush() + i += 1 + if i % 500 == 0: + logger.info(f" - {i} transcript alignments processed") + + +if __name__ == "__main__": + arguments = parse_args() + main(arguments.db_url, arguments.schema_name, arguments.output_file) diff --git a/misc/mito-transcripts/docker-compose-mito-extract.yml b/misc/mito-transcripts/docker-compose-mito-extract.yml new file mode 100644 index 0000000..c5a8a0d --- /dev/null +++ b/misc/mito-transcripts/docker-compose-mito-extract.yml @@ -0,0 +1,13 @@ +# docker compose file for the mito transcript extraction for the UTA update procedure + +version: '3' + +services: + mito-extract: + image: uta-update + command: sbin/ncbi_process_mito.py NC_012920.1 --output-dir /mito-extract/work | tee /mito-extract/logs/mito.log + volumes: + - ${UTA_ETL_WORK_DIR}:/mito-extract/work + - ${UTA_ETL_LOG_DIR}:/mito-extract/logs + working_dir: /opt/repos/uta + network_mode: host diff --git a/misc/refseq-historical-backfill/docker-compose-backfill.yml b/misc/refseq-historical-backfill/docker-compose-backfill.yml new file mode 100644 index 0000000..6e773b0 --- /dev/null +++ b/misc/refseq-historical-backfill/docker-compose-backfill.yml @@ -0,0 +1,14 @@ +# docker compose file for the RefSeq historical backfill procedure + +version: '3' + +services: + uta-extract-historical: + image: uta-update + command: misc/refseq-historical-backfill/uta-extract-historical /ncbi-dir /uta-extract/work /uta-extract/logs + volumes: + - ${UTA_ETL_NCBI_DIR}:/ncbi-dir + 
- ${UTA_ETL_WORK_DIR}:/uta-extract/work + - ${UTA_ETL_LOG_DIR}:/uta-extract/logs + working_dir: /opt/repos/uta + network_mode: host diff --git a/misc/refseq-historical-backfill/ncbi_extract_gbff.py b/misc/refseq-historical-backfill/ncbi_extract_gbff.py new file mode 100755 index 0000000..cab3277 --- /dev/null +++ b/misc/refseq-historical-backfill/ncbi_extract_gbff.py @@ -0,0 +1,197 @@ +""" +Extract and write all files needed by UTA load, except alt accession exonsets (aka, alignments). From a single +GBFF file we can create dna fasta, protein fasta, associated accessions, geneinfo, and txinfo files. +""" +import argparse +import gzip +import importlib_resources +import io +import logging +import logging.config +from collections import Counter +from contextlib import ExitStack +from typing import Iterable + +from Bio.Seq import Seq +import Bio.SeqIO +from Bio.SeqRecord import SeqRecord + +from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter +from uta.formats.geneinfo import GeneInfo, GeneInfoWriter +from uta.formats.txinfo import TxInfo, TxInfoWriter +from uta.parsers.seqrecord import SeqRecordFacade, SeqRecordFeatureError +from uta.tools.file_utils import open_file + + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + ap = argparse.ArgumentParser( + description=__doc__, + ) + ap.add_argument("GBFF_FILES", nargs="+") + ap.add_argument("--origin", "-o", default="NCBI") + ap.add_argument("--prefix", "-p", default="") + ap.add_argument("--output_dir", "-d", default=".", type=str) + opts = ap.parse_args() + return opts + + +def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> None: + if prefix: + prefix = f"{prefix}." 
+ + # setup context managers for file writers + with ExitStack() as stack: + # DNA fasta file + dna_fasta_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}rna.fna.gz", "wb"), encoding="utf-8" + ) + ) + + # Protein fasta file + protein_fasta_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}protein.faa.gz", "wb"), + encoding="utf-8", + ) + ) + + geneinfo_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}geneinfo.gz", "wb"), encoding="utf-8" + ) + ) + geneinfo_writer = GeneInfoWriter(geneinfo_fh) + + txinfo_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}txinfo.gz", "w"), encoding="utf-8" + ) + ) + txinfo_writer = TxInfoWriter(txinfo_fh) + + assocacs_fh = stack.enter_context( + io.TextIOWrapper( + gzip.open(f"{output_dir}/{prefix}assocacs.gz", "w"), encoding="utf-8" + ) + ) + assocacs_writer = GeneAccessionsWriter(assocacs_fh) + + total_genes = set() + total_skipped = set() + all_prefixes = Counter() + for gbff_fn in gbff_files: + logger.info(f"Processing {gbff_fn}") + gbff_file_handler = stack.enter_context(open_file(gbff_fn)) + i = 0 + genes = set() + skipped = set() + prefixes = Counter() + for r in Bio.SeqIO.parse(gbff_file_handler, "gb"): + srf = SeqRecordFacade(r) + + # skip transcripts where the exon structure is unknown + if not srf.exons_se_i: + skipped.add(srf.id) + continue + + prefixes.update([srf.id[:2]]) + try: + fna_record = SeqRecord( + Seq(srf.feature_seq), id=srf.id, description="" + ) + dna_fasta_fh.write(fna_record.format("fasta")) + + if srf.gene_id not in genes: + geneinfo_writer.write( + GeneInfo( + gene_id=srf.gene_id, + gene_symbol=srf.gene_symbol, + tax_id="9606", + hgnc=srf.gene_symbol, + maploc="", + aliases=srf.gene_synonyms, + type=srf.gene_type, + summary="", + descr="", + xrefs=srf.db_xrefs, + ) + ) + + txinfo_writer.write( + TxInfo( + origin=origin, + ac=srf.id, + gene_id=srf.gene_id, + gene_symbol=srf.gene_symbol, + cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i), + exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i), + codon_table=srf.codon_table, + transl_except=TxInfo.serialize_transl_except( + srf.transl_except + ), + ) + ) + + # only write cds features for protein-coding transcripts + if srf.cds_feature is not None: + pro_record = SeqRecord( + Seq(srf.cds_translation), + id=srf.cds_protein_id, + description=srf.cds_product, + ) + protein_fasta_fh.write(pro_record.format("fasta")) + + assocacs_writer.write( + GeneAccessions( + origin=origin, + gene_id=srf.gene_id, + gene_symbol=srf.gene_symbol, + tx_ac=srf.id, + pro_ac=srf.cds_protein_id, + ) + ) + + genes.add(srf.gene_id) + i += 1 + if i % 5000 == 0: + logger.info( + " - {ng} genes in {fn} ({c}); {s} transcripts skipped".format( + ng=len(genes), + fn=gbff_fn, + c=prefixes, + s=len(skipped), + ) + ) + except SeqRecordFeatureError as e: + logger.error(f"SeqRecordFeatureError processing {r.id}: {e}") + raise + except ValueError as e: + logger.error(f"ValueError processing {r.id}: {e}") + raise + + + logger.info( + "{ng} genes in {fn} ({c}); {s} transcripts skipped".format( + ng=len(genes), fn=gbff_fn, c=prefixes, s=len(skipped) + ) + ) + total_genes ^= genes + total_skipped ^= skipped + all_prefixes += prefixes + logger.info( + "{ng} genes in {nf} ({c}); {s} transcripts skipped".format( + ng=len(total_genes), nf=len(gbff_files), c=all_prefixes, s=len(total_skipped) + ) + ) + + +if __name__ == "__main__": + args = parse_args() + main(args.GBFF_FILES, args.origin, args.prefix, 
args.output_dir) diff --git a/misc/refseq-historical-backfill/uta-extract-historical b/misc/refseq-historical-backfill/uta-extract-historical new file mode 100755 index 0000000..1586cfe --- /dev/null +++ b/misc/refseq-historical-backfill/uta-extract-historical @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# Download, then extract intermediate files out of the NCBI historical alignment files. + +set -e + +ncbi_dir=$1 +working_dir=$2 +log_dir=$3 + +if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/uta-extract-historical ' + exit 1 +fi + +download_ncbi_file () { + download_path=$1 + download_dir=$2 + + download_module="${download_path%%/*}" + download_source="ftp.ncbi.nlm.nih.gov::$download_path" + download_destination="$download_dir/$download_module" + + mkdir -p $download_destination + echo "Downloading $download_source to $download_destination" + rsync --no-motd -DHPRprtv "$download_source" "$download_destination" +} + +relative_path="refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical" + +# download historical genbank file +file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz" +download_ncbi_file $file_path $ncbi_dir + +# download historical gff file +file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz" +download_ncbi_file $file_path $ncbi_dir + +# extract intermediate files from genbank file +python misc/refseq-historical-backfill/ncbi_extract_gbff.py \ + "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz" \ + --output_dir "$working_dir" 2>&1 | tee "$log_dir/ncbi-parse-historical-ggbb.log" + +# extract exonset intermediate file from gff file +python sbin/ncbi_parse_genomic_gff.py "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz" | \ + gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | tee "$log_dir/ncbi-parse-historical-gff.log" + +# filter exonset alignments by txinfo +sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \ + --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \ + tee "$log_dir/filter_exonset_transcripts.log" diff --git a/misc/splign-manual/docker-compose-splign-manual.yml b/misc/splign-manual/docker-compose-splign-manual.yml new file mode 100644 index 0000000..7ef0e87 --- /dev/null +++ b/misc/splign-manual/docker-compose-splign-manual.yml @@ -0,0 +1,16 @@ +# docker compose file for the splign-manual uta update procedure + +version: '3' + +services: + splign-manual: + image: uta-update + command: sbin/uta-splign-manual ${UTA_ETL_OLD_UTA_VERSION} /uta-splign-manual/input /uta-splign-manual/work /uta-splign-manual/logs + depends_on: + uta: + condition: service_healthy + volumes: + - ${UTA_SPLIGN_MANUAL_DIR}:/uta-splign-manual/input + - ${UTA_ETL_WORK_DIR}:/uta-splign-manual/work + - ${UTA_ETL_LOG_DIR}:/uta-splign-manual/logs + network_mode: host diff --git a/misc/splign-manual/uta-splign-manual b/misc/splign-manual/uta-splign-manual new file mode 100755 index 0000000..cb1e9ee --- /dev/null +++ b/misc/splign-manual/uta-splign-manual @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +# Process splign-manual alignments + +set -euxo pipefail + +source_uta_v=$1 +input_dir=$2 +working_dir=$3 +log_dir=$4 + +if [ -z "$source_uta_v" ] || [ -z "$input_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: misc/uta-splign-manual ' + exit 1 +fi + +# set local variables and create working directories 
+loading_uta_v="uta" +working_dir="$working_dir/splign-manual" +log_dir="$log_dir/splign-manual" +mkdir -p "$log_dir" +mkdir -p "$working_dir" + +# Generate txinfo.gz and exonset.gz files +python sbin/generate-loading-data $input_dir/alignments/*.splign --txdata $input_dir/txdata.yaml \ + --output-dir $working_dir 2>&1 | tee "$log_dir/generate-loading-data.log" + +# Generate fasta files +seqrepo --root-directory "/biocommons/dl.biocommons.org/seqrepo" \ + export $(gzip -cdq $working_dir/txinfo.gz | cut -f2 | tail +2) \ + --instance-name "master" | gzip -c > $working_dir/seqs.fa.gz 2>&1 | tee "$log_dir/seqrepo-export.log" + +# Generate seqinfo.gz file +sbin/fasta-to-seqinfo -o NCBI $working_dir/seqs.fa.gz | gzip -c > $working_dir/seqinfo.gz 2>&1 | \ + tee "$log_dir/fasta-to-seqinfo.log" + +# Load seqinfo +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-seqinfo $working_dir/seqinfo.gz 2>&1 | \ + tee "$log_dir/load-seqinfo.log" + +# Load txinfo +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo $working_dir/txinfo.gz 2>&1 | \ + tee "$log_dir/load-txinfo.log" + +# Load exonset +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset $working_dir/exonset.gz 2>&1 | \ + tee "$log_dir/load-exonset.log" + +# Align exons +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | tee "$log_dir/align-exons.log" + +### run diff +sbin/uta-diff "$source_uta_v" "$loading_uta_v" 2>&1 | tee "$log_dir/uta-diff.log" + +### psql_dump +pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$working_dir/uta.pgd.gz" diff --git a/pyproject.toml b/pyproject.toml index 7bfc17d..af4e7e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "uta" dynamic = ["version"] description = "Universal Transcript Archive" readme = "README.md" -requires-python = ">=3.5" +requires-python = ">=3.9" license = {text = "Apache-2.0"} keywords = [ @@ -39,6 +39,7 @@ classifiers = [ ] dependencies = [ + "alembic", "attrs", "biocommons.seqrepo", "biopython>=1.69", @@ -47,17 +48,24 @@ dependencies = [ "configparser", "docopt", "eutils>=0.3.2", + "importlib_resources", + "more_itertools", "nose", "prettytable", "psycopg2-binary", "pytz", "recordtype", + "retry", "sqlalchemy", - "uta-align", + "uta-align>=0.3", ] [project.optional-dependencies] -test = ["coverage", "testing.postgresql"] +test = [ + "coverage", + "parameterized", + "testing.postgresql", +] [project.urls] # Optional "Homepage" = "https://github.com/biocommons/uta" diff --git a/sbin/assoc-acs-merge b/sbin/assoc-acs-merge index 7c7ec00..1cfbff9 100755 --- a/sbin/assoc-acs-merge +++ b/sbin/assoc-acs-merge @@ -15,7 +15,6 @@ duplicates""" import csv import gzip import io -import os import sys import attr @@ -43,15 +42,15 @@ if __name__ == "__main__": aars = dict() for fn in sys.argv[1:]: ifh = csv.DictReader(anyopen(fn), delimiter="\t") - + if ofh is None: ofh = csv.DictWriter(sys.stdout, fieldnames=out_header, delimiter="\t") ofh.writeheader() - + for r in ifh: if r["tx_ac"] in ("","-") or r["pro_ac"] in ("","-"): continue - + aar = AssAccRec(tx_ac=r["tx_ac"], pro_ac=r["pro_ac"], origin=r["origin"]) k = (aar.tx_ac, aar.origin) if k in aars: diff --git a/sbin/coalesce_exonsets.py b/sbin/coalesce_exonsets.py new file mode 100755 index 0000000..2ba0df8 --- /dev/null +++ b/sbin/coalesce_exonsets.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +""" +This script coalesces exonsets from multiple input files. It builds a cache of tx_ac/alt_ac pairs. 
If a mapping is +seen in a later input file, the exonset is skipped. The output is written to stdout. +""" + +import argparse +import logging.config +import sys +from typing import Dict, List, Tuple + +import importlib_resources + +from uta.formats.exonset import ExonSetReader, ExonSetWriter +from uta.tools.file_utils import open_file + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def coalesce_exonsets(exonset_files: List[str]) -> None: + skipped = 0 + esw = ExonSetWriter(sys.stdout) + seen_ess: Dict[Tuple[str, str], str] = {} + + for exonset_fn in exonset_files: + logger.info(f" - processing exonset file {exonset_fn}") + with open_file(exonset_fn) as f: + exonsets = ExonSetReader(f) + for exonset in exonsets: + key = (exonset.tx_ac, exonset.alt_ac) + if key in seen_ess: + logger.warning(f" - exon set for transcript {exonset.tx_ac}/{exonset.alt_ac} already " + f"seen in {seen_ess[(exonset.tx_ac, exonset.alt_ac)]}. Skipping.") + skipped += 1 + else: + seen_ess[key] = exonset_fn + esw.write(exonset) + + logger.info(f"Coalesced {len(seen_ess)} exonsets from {len(exonset_files)} files, skipped {skipped} duplicates.") + return seen_ess + + +def main(): + parser = argparse.ArgumentParser(description='Coalesce exonsets.') + parser.add_argument('exonsets', nargs="+", help='Path to the exonset file') + args = parser.parse_args() + + logger.info(f"Coalescing exonsets from {len(args.exonsets)} files") + coalesce_exonsets(args.exonsets) + + +if __name__ == '__main__': + main() diff --git a/sbin/exonset-to-seqinfo b/sbin/exonset-to-seqinfo index 1ccbf95..21544f3 100755 --- a/sbin/exonset-to-seqinfo +++ b/sbin/exonset-to-seqinfo @@ -5,18 +5,17 @@ import argparse import configparser as ConfigParser import gzip +import importlib_resources import itertools import logging import logging.config -import pkg_resources import re import sys from bioutils.digests import seq_md5 from biocommons.seqrepo import SeqRepo -# from multifastadb import MultiFastaDB -from uta.formats.exonset import ExonSet, ExonSetReader +from uta.formats.exonset import ExonSetReader from uta.formats.seqinfo import SeqInfo, SeqInfoWriter @@ -32,16 +31,15 @@ def parse_args(argv): required=True) ap.add_argument("--conf", default=[ - pkg_resources.resource_filename("uta", "../etc/global.conf")] - ) + importlib_resources.files("uta").joinpath("../../etc/global.conf") + ]) opts = ap.parse_args(argv) return opts if __name__ == "__main__": - logging_conf_fn = pkg_resources.resource_filename( - "uta", "etc/logging.conf") + logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") logging.config.fileConfig(logging_conf_fn) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -50,18 +48,16 @@ if __name__ == "__main__": opts = parse_args(sys.argv[1:]) - cf = ConfigParser.SafeConfigParser() + cf = ConfigParser.ConfigParser() for conf_fn in opts.conf: - cf.readfp(open(conf_fn)) - logger.info("loaded " + conf_fn) + cf.read_file(open(conf_fn)) + logger.info("loaded " + str(conf_fn)) in_fn = opts.FILES[0] in_fh = gzip.open(in_fn, 'rt') if in_fn.endswith(".gz") else open(in_fn) esr = ExonSetReader(in_fh) logger.info("opened " + in_fn) - #fa_dirs = cf.get("sequences", "fasta_directories").strip().splitlines() - #mfdb = MultiFastaDB(fa_dirs, use_meta_index=True) sr_dir = cf.get("sequences", "seqrepo") sr = SeqRepo(root_dir=sr_dir) logger.info("Opened 
sequence directories: " + sr_dir) diff --git a/sbin/filter_exonset_transcripts.py b/sbin/filter_exonset_transcripts.py new file mode 100755 index 0000000..1d97cc4 --- /dev/null +++ b/sbin/filter_exonset_transcripts.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import argparse +import csv +import logging.config +import sys + +import importlib_resources + +from uta.formats.exonset import ExonSetReader, ExonSetWriter +from uta.formats.txinfo import TxInfoReader +from uta.tools.file_utils import open_file + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def filter_exonset(exonset_file, transcript_ids, missing_ids_file): + with open_file(exonset_file) as es_f, open(missing_ids_file, 'w') as missing_f: + exonsets = ExonSetReader(es_f) + esw = ExonSetWriter(sys.stdout) + writer_missing = csv.writer(missing_f) + missing_acs = set() + + for exonset in exonsets: + if exonset.tx_ac in transcript_ids: + esw.write(exonset) + else: + logger.debug(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.") + writer_missing.writerow([exonset.tx_ac]) + missing_acs.add(exonset.tx_ac) + logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s)") + + +def main(): + parser = argparse.ArgumentParser(description='Filter exonset data.') + parser.add_argument('--tx-info', help='Path to the transcript info file') + parser.add_argument('--exonsets', help='Path to the exonset file') + parser.add_argument('--missing-ids', help='Path to the missing transcript ids file') + args = parser.parse_args() + + with open_file(args.tx_info) as f: + tx_reader = TxInfoReader(f) + transcript_ids = {row.ac for row in tx_reader} + filter_exonset(args.exonsets, transcript_ids, args.missing_ids) + + +if __name__ == '__main__': + main() diff --git a/loading/data/splign-manual/generate-loading-data b/sbin/generate-loading-data similarity index 70% rename from loading/data/splign-manual/generate-loading-data rename to sbin/generate-loading-data index 86744ed..0b2fad3 100755 --- a/loading/data/splign-manual/generate-loading-data +++ b/sbin/generate-loading-data @@ -6,6 +6,7 @@ import argparse import csv import gzip import logging +import os import sys import yaml @@ -20,7 +21,6 @@ method = "splign-manual" txinfo_fn = "txinfo.gz" exonset_fn = "exonset.gz" -txdata_fn = "txdata.yaml" def parse_args(argv): @@ -31,6 +31,16 @@ def parse_args(argv): "FILES", nargs="*" ) + ap.add_argument( + "--txdata", + required=True, + help="Path to txdata.yaml" + ) + ap.add_argument( + "--output-dir", + required=True, + help="Path to output directory" + ) opts = ap.parse_args(argv) return opts @@ -61,15 +71,19 @@ def parse_splign(fn, txdata): try: txd = txdata[tx_ac] except KeyError: - raise KeyError(f"{tx_ac}: no cds or hgnc info in {txdata_fn}") + raise KeyError(f"{tx_ac}: no cds or gene_symbol info in txdata") + + gene_symbol = txd["hgnc"] - hgnc = txd["hgnc"] - if hgnc is None: - _logger.warn(f"No HGNC symbol in {txdata_fn} for {tx_ac}") + gene_id = txd["gene_id"] + if gene_id is None: + msg = f"No gene id in txdata for {tx_ac}" + _logger.error(msg) + raise ValueError(msg) cds = txd["cds"] if cds is None: - _logger.warning(f"No CDS info {txdata_fn} for {tx_ac}; will be non-coding transcript") + _logger.warning(f"No CDS info txdata for {tx_ac}; will be non-coding transcript") cds_se_i = None else: cds = [int(i) for i in txd["cds"].split(",")] @@ -79,9 +93,12 
@@ def parse_splign(fn, txdata): txinfo = uta.formats.txinfo.TxInfo( origin=origin, ac=tx_ac, - hgnc=hgnc, + gene_id=gene_id, + gene_symbol=gene_symbol, cds_se_i=cds_se_i, - exons_se_i=tx_exons_str) + exons_se_i=tx_exons_str, + transl_except=None, + ) exonset = uta.formats.exonset.ExonSet( tx_ac=tx_ac, alt_ac=alt_ac, @@ -97,11 +114,11 @@ if __name__ == "__main__": opts = parse_args(sys.argv[1:]) - txdata = yaml.load(open(txdata_fn), Loader=yaml.SafeLoader) + txdata = yaml.load(open(opts.txdata), Loader=yaml.SafeLoader) + + txinfo_out = uta.formats.txinfo.TxInfoWriter(gzip.open(os.path.join(opts.output_dir, txinfo_fn), "wt")) + exonset_out = uta.formats.exonset.ExonSetWriter(gzip.open(os.path.join(opts.output_dir, exonset_fn), "wt")) - txinfo_out = uta.formats.txinfo.TxInfoWriter(gzip.open(txinfo_fn, "wt")) - exonset_out = uta.formats.exonset.ExonSetWriter(gzip.open(exonset_fn, "wt")) - for fn in opts.FILES: _logger.info("# " + fn) txinfo, exonset = parse_splign(fn, txdata) diff --git a/sbin/ncbi-download b/sbin/ncbi-download new file mode 100755 index 0000000..09ad4c2 --- /dev/null +++ b/sbin/ncbi-download @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# This script downloads the files needed for a UTA+SeqRepo update into to the given directory. +# +# DESTINATION_DIR will have a directory structure matching the source. + +set -e + +FILE_PATH_CONFIG=$1 +DOWNLOAD_DIR=$2 + +if [ -z "$FILE_PATH_CONFIG" ] || [ -z "$DOWNLOAD_DIR" ] +then + echo 'Usage: sbin/ncbi-download ' + exit 1 +else + echo "Downloading files to $DOWNLOAD_DIR" +fi + +grep -v -e '^#' -e '^$' "$FILE_PATH_CONFIG" | while read -r DOWNLOAD_PATH; do + # each top-level directory in NCBI is an rsync module. + # bash parameter expansion removes all content after first slash. + DOWNLOAD_MODULE="${DOWNLOAD_PATH%%/*}" + DOWNLOAD_SRC="ftp.ncbi.nlm.nih.gov::$DOWNLOAD_PATH" + DOWNLOAD_DST="$DOWNLOAD_DIR/$DOWNLOAD_MODULE" + mkdir -p $DOWNLOAD_DST + echo "Downloading $DOWNLOAD_SRC to $DOWNLOAD_DST" + rsync --no-motd -DHPRprtv "$DOWNLOAD_SRC" "$DOWNLOAD_DST" +done diff --git a/sbin/ncbi-parse-gbff b/sbin/ncbi-parse-gbff index 45897c6..110f01a 100755 --- a/sbin/ncbi-parse-gbff +++ b/sbin/ncbi-parse-gbff @@ -9,8 +9,8 @@ See uta.formats for a description of those file formats. 
In a nutshell, this means that you'll get data like this: ncbi.txinfo.gz: -origin ac hgnc cds_se_i exons_se_i -NCBI RefSeq NM_053283.2 DCD 62,395 0,120;120,159;159,261;261,351;351,517 +origin ac gene_id gene_symbol cds_se_i exons_se_i +NCBI RefSeq NM_053283.2 117159 DCD 62,395 0,120;120,159;159,261;261,351;351,517 ncbi.exonsets.gz: tx_ac alt_ac method strand exons_se_i @@ -27,23 +27,19 @@ from __future__ import division, unicode_literals import argparse from collections import Counter import gzip +import importlib_resources import io -import itertools import logging import logging.config -import os -import pprint -import pkg_resources import re import sys import Bio.SeqIO -import Bio.SeqRecord from bioutils.digests import seq_md5 -from uta.formats.exonset import ExonSet, ExonSetWriter from uta.formats.txinfo import TxInfo, TxInfoWriter -from uta.formats.geneaccessions import GeneAccessionsReader +from uta.parsers.seqrecord import SeqRecordFacade, SeqRecordFeatureError + origin = "NCBI" @@ -62,38 +58,6 @@ def parse_args(argv): return opts -class SeqRecordFacade(Bio.SeqRecord.SeqRecord): - - def __init__(self, seqrecord): - self._sr = seqrecord - - @property - def id(self): - return self._sr.id - - @property - def hgnc(self): - genes = [f for f in self._sr.features if f.type == "gene"][ - 0].qualifiers["gene"] - assert len(genes) == 1 - return genes[0] - - @property - def cds_se_i(self): - try: - cds = [f for f in self._sr.features if f.type == "CDS"][0] - except IndexError: - return None - return (cds.location.start.real, cds.location.end.real) - - @property - def exons_se_i(self): - # ,"misc_feature"]] - exons = [f for f in self._sr.features if f.type in ["exon"]] - se = [(f.location.start.real, f.location.end.real) for f in exons] - return se - - def gbff_filter(it): """pre-filter genbank file stream for records that match a specific LOCUS pattern""" delim = "//" @@ -107,6 +71,7 @@ def gbff_filter(it): if line.startswith(delim): emit = False + def gbff_block_reader(it): """yield strings, each representing a full genbank record""" delim = "//" @@ -122,9 +87,9 @@ def gbff_block_reader(it): yield SeqRecordFacade(Bio.SeqIO.read(io.StringIO(emit), "gb")) emit = None + if __name__ == "__main__": - logging_conf_fn = pkg_resources.resource_filename( - "uta", "etc/logging.conf") + logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") logging.config.fileConfig(logging_conf_fn) logging.getLogger().setLevel(logging.INFO) logger = logging.getLogger(__name__) @@ -146,17 +111,18 @@ if __name__ == "__main__": if srf.id.partition("_")[0] not in ["NM", "NR"]: skipped_ids.add(srf.id) continue - cds_se_i = srf.cds_se_i - ti = TxInfo(ac=srf.id, - origin=opts.origin, - hgnc=srf.hgnc, - cds_se_i=None if cds_se_i is None else "{},{}".format( - *cds_se_i), - exons_se_i=";".join( - ["{},{}".format(*ese) for ese in srf.exons_se_i]) - ) + ti = TxInfo( + ac=srf.id, + origin=opts.origin, + gene_id=srf.gene_id, + gene_symbol=srf.gene_symbol, + cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i), + exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i), + codon_table=srf.codon_table, + transl_except=TxInfo.serialize_transl_except(srf.transl_except), + ) tiw.write(ti) - genes.add(srf.hgnc) + genes.add(srf.gene_symbol) logger.info("{ng} genes in {fn} ({c})".format(ng=len(genes), fn=fn, c=prefixes)) total_genes ^= genes all_prefixes += prefixes diff --git a/sbin/ncbi-parse-gene2refseq b/sbin/ncbi-parse-gene2refseq index 5d11206..1f2f6cb 100755 --- a/sbin/ncbi-parse-gene2refseq +++ 
b/sbin/ncbi-parse-gene2refseq @@ -10,8 +10,6 @@ ftp://ftp.ncbi.nih.gov/gene/DATA/gene2refseq.gz import io import sys -from csv import DictReader - from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter from uta.formats.ncbitsv import NCBITSVReader @@ -38,16 +36,16 @@ if __name__ == "__main__": if rec["rna_nucleotide_accession.version"] == "-" and rec["protein_accession.version"] == "-": continue - ga = GeneAccessions(hgnc=rec["symbol"], + ga = GeneAccessions(gene_symbol=rec["symbol"], tx_ac=rec["rna_nucleotide_accession.version"], gene_id=rec["geneid"], pro_ac=rec["protein_accession.version"], origin="NCBI", ) - key = (ga.hgnc, ga.tx_ac, ga.gene_id, ga.pro_ac) + key = (ga.gene_symbol, ga.tx_ac, ga.gene_id, ga.pro_ac) if key in seen: - continue + continue seen.add(key) gaw.write(ga) diff --git a/sbin/ncbi-parse-geneinfo b/sbin/ncbi-parse-geneinfo index 73966b8..bc33aad 100755 --- a/sbin/ncbi-parse-geneinfo +++ b/sbin/ncbi-parse-geneinfo @@ -30,10 +30,9 @@ if __name__ == "__main__": giw = GeneInfoWriter(sys.stdout) for rec in gi_in: - if rec["symbol_from_nomenclature_authority"] == "-": - continue gi = GeneInfo( tax_id=rec["tax_id"], + gene_symbol=rec["symbol"], gene_id=rec["geneid"], hgnc=rec["symbol_from_nomenclature_authority"], maploc=rec["map_location"], diff --git a/sbin/ncbi-parse-gff b/sbin/ncbi-parse-gff index 4a34ed2..cdc546a 100755 --- a/sbin/ncbi-parse-gff +++ b/sbin/ncbi-parse-gff @@ -1,17 +1,13 @@ #!/usr/bin/env python -"""Write exonsets and txinfo files from NCBI GFF alignments, as obtained from -ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/alignments/ +"""Write exonsets files from NCBI GFF alignments, as obtained from +ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_*/ This service appeared in April 2015 and is due to update weekly. See uta.formats for a description of those file formats. In a nutshell, this means that you'll get data like this: -ncbi.txinfo.gz: -origin ac hgnc cds_se_i exons_se_i -NCBI RefSeq NM_053283.2 DCD 62,395 0,120;120,159;159,261;261,351;351,517 - ncbi.exonsets.gz: tx_ac alt_ac method strand exons_se_i NM_130786.3 NC_000019.9 splign -1 58864769,58864865;588646... 
@@ -35,12 +31,12 @@ from __future__ import division import argparse import collections import gzip +import importlib_resources import io import itertools import logging.config import os import pprint -import pkg_resources import re import sys @@ -50,6 +46,7 @@ import prettytable from uta.formats.exonset import ExonSet, ExonSetWriter from uta.formats.txinfo import TxInfo, TxInfoWriter, TxInfoReader from uta.formats.geneaccessions import GeneAccessionsReader +from uta.tools.file_utils import open_file origin = "NCBI" @@ -138,23 +135,22 @@ class TranscriptAlignment(object): return self.exon_alignments[0].pct_identity_ungap -def parse_args(argv): +def parse_args(): ap = argparse.ArgumentParser( description=__doc__, ) - ap.add_argument("in_fn") + ap.add_argument("GFF_files", nargs="+", + help="NCBI GFF files to process") ap.add_argument("--origin", "-o", default=origin) ap.add_argument("--prefix", "-p", default="ncbi-gff") - ap.add_argument("--geneacs", "-G") - ap.add_argument("--txinfo", "-T", required=False) ap.add_argument("--strict-coverage", "-C", type=float, default=95.0) ap.add_argument("--min-coverage", "-c", type=float, default=85.0) ap.add_argument("--strict-pct-identity-gap", "-I", type=float, default=95.0) ap.add_argument("--min-pct-identity-gap", "-i", type=float, default=85.0) - opts = ap.parse_args(argv) + opts = ap.parse_args() assert opts.strict_coverage > opts.min_coverage assert opts.strict_pct_identity_gap > opts.min_pct_identity_gap @@ -165,24 +161,28 @@ def parse_args(argv): def read_exon_alignments(fn): """read lines of NCBI's alignment gff file, fn, returning ExonAlignment records""" - # NC_000007.13 RefSeq cDNA_match 50344265 50344518 254 + . ID=aln58042;Target=NM_001220765.2 1 254 +;gap_count=0;identity=0.0691326;idty=1;num_ident=428;num_mismatch=0;pct_coverage=6.91326;pct_identity_gap=100;pct_identity_ungap=100;score=254 - # NC_000002.11 RefSeq cDNA_match 179671939 179672150 212 - . ID=ed951d46-194c-477a-a480-4bc64530c5ba;Target=NM_001267550.2 1 212 +;gap_count=0;identity=0.999991;idty=1;num_ident=109223;num_mismatch=1;pct_coverage=100;pct_identity_gap=99.9991;pct_identity_ungap=99.9991 + # NC_000022.10 RefSeq cDNA_match 20783512 20783627 116 - . ID=7b8c7a437b92bf9dee20d81acadadd8e;Target=NM_182895.5 1496 1611 +;consensus_splices=20;exon_identity=0.99769;for_remapping=2;gap_count=3;identity=0.99769;idty=1;matches=3455;num_ident=3455;num_mismatch=5;pct_coverage=99.9134;pct_coverage_hiqual=99.9134;pct_identity_gap=99.769;pct_identity_ungap=99.8555;product_coverage=1;rank=1;splices=20;weighted_identity=0.996898 + # NC_000022.10 RefSeq cDNA_match 20781685 20781837 153 - . ID=7b8c7a437b92bf9dee20d81acadadd8e;Target=NM_182895.5 1612 1764 +;consensus_splices=20;exon_identity=0.99769;for_remapping=2;gap_count=3;identity=0.99769;idty=1;matches=3455;num_ident=3455;num_mismatch=5;pct_coverage=99.9134;pct_coverage_hiqual=99.9134;pct_identity_gap=99.769;pct_identity_ungap=99.8555;product_coverage=1;rank=1;splices=20;weighted_identity=0.996898 + # NC_000022.10 RefSeq cDNA_match 20778874 20780569 1676.05 - . 
ID=7b8c7a437b92bf9dee20d81acadadd8e;Target=NM_182895.5 1765 3463 +;consensus_splices=20;exon_identity=0.99769;for_remapping=2;gap_count=3;identity=0.99769;idty=0.995291;matches=3455;num_ident=3455;num_mismatch=5;pct_coverage=99.9134;pct_coverage_hiqual=99.9134;pct_identity_gap=99.769;pct_identity_ungap=99.8555;product_coverage=1;rank=1;splices=20;weighted_identity=0.996898;Gap=M540 I1 M5 I1 M51 I1 M1100 line_re = re.compile( "(?P\S+)\s+(?P\S+)\s+(?P\S+)\s+" "(?P\d+)\s+(?P\d+)\s+(?P\S+)\s+" "(?P[-+])\s+\.\s+ID=(?P[^;]+);Target=(?P\S+)" "\s+(?P\d+)\s+(?P\d+).+?" - "pct_coverage=(?P[^;]+);" + "pct_coverage=(?P[^;]+);.+?" "pct_identity_gap=(?P[^;]+);" "pct_identity_ungap=(?P[^;]+)" ) - fh = io.open(fn, "rt") - for line in fh.readlines(): - if not line.startswith('#'): - try: - yield ExonAlignment(**line_re.match(line).groupdict()) - except (AttributeError, ValueError): - raise Exception("Failed at", line) + + with open_file(fn) as fh: + for line in fh: + if not line.startswith('#'): + try: + re_match = line_re.match(line) + if re_match and re_match["match_type"] == "cDNA_match": + yield ExonAlignment(**line_re.match(line).groupdict()) + except (AttributeError, ValueError): + raise Exception("Failed at", line) def read_transcript_alignments(fn): @@ -217,14 +217,8 @@ def group_transcript_alignments(transcript_alignments): for key, alns_i in itertools.groupby(transcript_alignments, key=_key)) -def convert_exon_data(opts, transcript_alignment): +def convert_exon_data(transcript_alignment): """return (TxInfo,ExonSet) tuple for given exon record data""" - ti = TxInfo(ac=transcript_alignment.tx_ac, - origin=opts.origin, - hgnc=None, - cds_se_i=None, - exons_se_i=transcript_alignment.tx_exons_se_i - ) es = ExonSet( tx_ac=transcript_alignment.tx_ac, alt_ac=transcript_alignment.ref_ac, @@ -232,67 +226,20 @@ def convert_exon_data(opts, transcript_alignment): strand=-1 if transcript_alignment.strand == "-" else 1, exons_se_i=transcript_alignment.ref_exons_se_i ) - return (ti, es) - + return es -if __name__ == "__main__": - logging_conf_fn = pkg_resources.resource_filename( - "uta", "etc/logging.conf") - logging.config.fileConfig(logging_conf_fn) - logging.getLogger().setLevel(logging.INFO) - logger = logging.getLogger(__name__) - - opts = parse_args(sys.argv[1:]) - - if opts.geneacs: - gar = GeneAccessionsReader(gzip.open(opts.geneacs, "rt")) - tx2gene = {ga.tx_ac: ga.hgnc for ga in gar} - logger.info( - "read {} gene-accession mappings from {}".format(len(tx2gene), opts.geneacs)) - else: - tx2gene = None - logger.info("No geneacs (-G) file provided; gene info will be empty.") - - if opts.txinfo: - tir = TxInfoReader(gzip.open(opts.txinfo, "rt")) - tx2ti = {ti.ac: ti for ti in tir} - logger.info( - "read {} CDS data from {}".format(len(tx2ti), opts.txinfo)) - # add any gene-accession mappings from txinfo file if they are not in geneacs file; log warning if they disagree - if tx2gene: - for ti_ac in tx2ti: - if not tx2gene.get(ti_ac): - tx2gene[ti_ac] = tx2ti[ti_ac].hgnc - if tx2gene[ti_ac] != tx2ti[ti_ac].hgnc: - logger.warning('HGNC symbol disagrees in txinfo ({tx2ti_hgnc}) and geneacs ({tx2gene_hgnc}) files for accession {ti_ac}'.format( - tx2ti_hgnc=tx2ti[ti_ac].hgnc, - tx2gene_hgnc=tx2gene[ti_ac].hgnc, - ti_ac=ti_ac - )) - else: - tx2ti = None - logger.info("No gbff txinfo provided (-T); CDS start,end will be undefined for all transcripts and transcript-genome exon structures will not be verified") - - es_fn = opts.prefix + "exonset.gz" - ti_fn = opts.prefix + "txinfo.gz" - - esw = 
ExonSetWriter(gzip.open(es_fn + ".tmp", "wt")) - tiw = TxInfoWriter(gzip.open(ti_fn + ".tmp", "wt")) - - ties = {} - ti_written = collections.defaultdict(lambda: False) - ac_not_in_gbff = set() - ac_exons_differ = set() +def write_exonsets_from_gff_file(gff_fn, logger, opts, esw): + """write exonsets from a single gff file""" ac_in_source = set() ac_failed = set() - bins = "nogbff esdiffer unique multiple minimum none".split() + bins = "unique multiple minimum none skipped".split() sets = collections.defaultdict(lambda: {k: list() for k in bins}) - transcript_alignments = read_transcript_alignments(opts.in_fn) + transcript_alignments = read_transcript_alignments(gff_fn) logger.info( - "read {} transcript alignments from {}".format(len(transcript_alignments), opts.in_fn)) + "read {} transcript alignments from {}".format(len(transcript_alignments), gff_fn)) for _, txalns in group_transcript_alignments(transcript_alignments): assert len(txalns) > 0 @@ -300,40 +247,13 @@ if __name__ == "__main__": ta0 = txalns[0] tx_ac, ref_ac = ta0.tx_ac, ta0.ref_ac skey = "{:.2s} {:.2s}".format(tx_ac, ref_ac) + if not tx_ac[:2] in ("NM", "NR") or not ref_ac[:2] == "NC": + sets[skey]["skipped"] += [txalns] + continue + bin = None - - # ############################################################ - # Optionally compare exon structure from gbff with input gff - # And get cds s,e from gbff (sigh) - if tx2ti is None: - cds_se_i = None - txalns_esm = txalns - else: - if tx_ac not in tx2ti: - logger.warning("{ta.tx_ac}~{ta.ref_ac}: no transcript info in {opts.txinfo}; skipping transcript".format( - ta=ta0, opts=opts)) - ac_not_in_gbff.add(tx_ac) - bin = "nogbff" - sets[skey][bin] += [txalns] - continue - - gbff_ti = tx2ti[tx_ac] - txalns_esm = [ta for ta in txalns if ta.tx_exons_se_i == gbff_ti.exons_se_i] - n_rm = len(txalns) - len(txalns_esm) - if n_rm > 0: - logger.warning("{ta.tx_ac}~{ta.ref_ac}: Removed {n_rm}/{n_tot} exon structures that differ from gbff definition".format( - n_rm=n_rm, n_tot=len(txalns), ta=ta, opts=opts)) - if len(txalns_esm) == 0: - logger.warning("{ta.tx_ac}~{ta.ref_ac}: All {n} exon structures differ from gbff definition; skipping alignment".format( - ta=ta, opts=opts, n=len(txalns))) - ac_exons_differ.add(tx_ac) - bin = "esdiffer" - sets[skey][bin] += [txalns] - continue - - cds_se_i = gbff_ti.cds_se_i # possibly None - - + txalns_load = [] + # ############################################################ # Filter alignments by coverage and pct_identity_gap # From Terence Murphy, NCBI: @@ -341,7 +261,7 @@ if __name__ == "__main__": # and RefSeqGene alignments that meet the filter: # 'pct_identity_gap >= 99.5 and pct_coverage >= 95'" txalns_strict = [txaln - for txaln in txalns_esm + for txaln in txalns if (txaln.pct_coverage > opts.strict_coverage and txaln.pct_identity_gap > opts.strict_pct_identity_gap)] @@ -354,90 +274,76 @@ if __name__ == "__main__": logger.warning("{ta.tx_ac}~{ta.ref_ac}: Multiple ({n}) strict alignments; cov/pig: {stats}".format( ta=txalns_strict[0], n=len(txalns_strict), opts=opts, - stats = "; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_strict), - )) + stats="; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_strict), + )) txalns_load = txalns_strict bin = "multiple" if len(txalns_strict) == 0: txalns_min = [txaln - for txaln in txalns_esm + for txaln in txalns if (txaln.pct_coverage > opts.min_coverage and txaln.pct_identity_gap > opts.min_pct_identity_gap)] if len(txalns_min) == 0: 
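+                    # no alignment for this tx_ac/ref_ac pair meets even the relaxed
+                    # min_coverage/min_pct_identity_gap thresholds; record it and emit nothing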
logger.warning("{ta.tx_ac}~{ta.ref_ac}: No usable alignments; cov/pig: {stats}".format( ta=txalns[0], - stats = "; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_esm), - )) + stats="; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns), + )) bin = "none" + ac_failed.add(skey) else: - logger.warning("{ta.tx_ac}~{ta.ref_ac}: Resorting to minimum criteria; loading {n} alignments; cov/pig: {stats}".format( - ta=txalns_min[0], n=len(txalns_min), - stats = "; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_min), + logger.warning( + "{ta.tx_ac}~{ta.ref_ac}: Resorting to minimum criteria; loading {n} alignments; cov/pig: {stats}".format( + ta=txalns_min[0], n=len(txalns_min), + stats="; ".join("{ta.pct_coverage}/{ta.pct_identity_gap}".format(ta=ta) for ta in txalns_min), )) bin = "minimum" txalns_load = txalns_min - sets[skey][bin] += [txalns_esm] + sets[skey][bin] += [txalns] for ta in txalns_load: - ti, es = convert_exon_data(opts, ta) - ti.cds_se_i = cds_se_i + es = convert_exon_data(ta) ac_in_source.add(tx_ac) - ti.hgnc = tx2gene.get(ti.ac, None) - - if not ti_written[ti.ac]: - # write a single txinfo line once; multiple may occur for multiple alignments of e.g., one NM to NC, NW, NT - tiw.write(ti) - ti_written[ti.ac] = True esw.write(es) # END HEINOUS LOOP - for fn in [ti_fn, es_fn]: - os.rename(fn + ".tmp", fn) - seen_but_failed = ac_failed - ac_in_source if seen_but_failed: logger.warning("{n_acv} acvs seen but failed criteria: {acs}".format( n_acv=len(seen_but_failed), acs=",".join(sorted(seen_but_failed)))) - if ac_not_in_gbff: - s_not_g_b = set(k.partition(".")[ - 0] for k in ac_in_source) - set(k.partition(".")[0] for k in tx2gene.keys()) - logger.warning("{n_acv} acvs ({n_ac} base acs) in source not in geneacs file: {acs}".format( - n_acv=len(ac_not_in_gbff), n_ac=len(s_not_g_b), opts=opts, acs=",".join(sorted(ac_not_in_gbff)))) - - if ac_exons_differ: - logger.warning("{n} accessions in gbff-derived txinfo have different exon coordinates: {acs}".format( - n=len(ac_exons_differ), opts=opts, acs=",".join(sorted(ac_exons_differ)))) - - pprint.pprint(opts) pt = prettytable.PrettyTable(field_names=["ac_pair"] - + bins - + "max_coverage max_pct_identity_gap nobgffs nogbff_noup esdiffers nones".split() + + bins + + "max_coverage max_pct_identity_gap nones".split() ) for ack in sorted(sets.keys()): n = 5 - nogbff_acs = sorted(set(ta.tx_ac for ta in itertools.chain.from_iterable(sets[ack]["nogbff"])))[:n] - esdiffer_acs = sorted(set(ta.tx_ac for ta in itertools.chain.from_iterable(sets[ack]["esdiffer"])))[:n] nones = list(itertools.chain.from_iterable(sets[ack]["none"])) nones_acs = sorted(set(ta.tx_ac for ta in nones))[:n] max_pct_identity_gap = "{:.2f}".format(max(ta.pct_identity_gap for ta in nones)) if nones else "n/a" max_pct_coverage = "{:.2f}".format(max(ta.pct_coverage for ta in nones)) if nones else "n/a" - nogbff_noup = sorted( - set(ta.tx_ac.split('.')[0] for ta in itertools.chain.from_iterable(sets[ack]["nogbff"])) - - set(ta.tx_ac.split('.')[0] for ta in itertools.chain.from_iterable(sets[ack]["unique"])) - - set(ta.tx_ac.split('.')[0] for ta in itertools.chain.from_iterable(sets[ack]["multiple"])) - ) - pt.add_row([ack] + [len(sets[ack][bk]) for bk in bins] + [max_pct_coverage, max_pct_identity_gap, - " ".join(nogbff_acs), - str(len(nogbff_noup)) + ": " + " ".join(nogbff_noup[:n]), - " ".join(esdiffer_acs), - " ".join(nones_acs) ]) - print(pt) + " ".join(nones_acs)]) + 
logger.info("summary in table below...\n" + str(pt)) + + + +if __name__ == "__main__": + logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") + logging.config.fileConfig(logging_conf_fn) + logging.getLogger().setLevel(logging.INFO) + logger = logging.getLogger(__name__) + + opts = parse_args() + + esw = ExonSetWriter(sys.stdout) + + for gff_fn in opts.GFF_files: + logger.info("processing {}".format(gff_fn)) + write_exonsets_from_gff_file(gff_fn, logger, opts, esw) diff --git a/sbin/ncbi_parse_genomic_gff.py b/sbin/ncbi_parse_genomic_gff.py new file mode 100755 index 0000000..0035d31 --- /dev/null +++ b/sbin/ncbi_parse_genomic_gff.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python + +"""Write exonsets from NCBI GFF alignments, as obtained from +ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions +This service appeared in April 2015 and is due to update weekly. + +See uta.formats for a description of those file formats. + +In a nutshell, this means that you'll get data like this: + +ncbi-gff.exonsets.gz: +tx_ac alt_ac method strand exons_se_i +NM_130786.3 NC_000019.9 splign -1 58864769,58864865;588646... +NM_130786.3 NC_018930.2 splign -1 58858699,58858795;588585... +NM_130786.3 AC_000151.1 splign -1 55173924,55174020;551738... +NM_138933.2 NC_000010.10 splign -1 52645340,52645435;52... + +UTA requires that the exon structure of a transcript accession as +defined on its own sequence is unique. Although this is mostly true, +there are instances where NCBI reports different exon structures for a +single transcript. For example, NM_001300954.1 aligns with 11 exons on +NC_000011.9 and 5 exons on NW_003871081.1, and the differences are NOT +due merely to concatenation of adjacent spans. +""" + +import argparse +import importlib_resources +import logging.config +import sys +from collections import defaultdict +from dataclasses import dataclass +from typing import List, Optional + +from uta.formats.exonset import ExonSet, ExonSetWriter +from uta.tools.file_utils import open_file + + +@dataclass +class GFFRecord: + seqid: str + start: int + end: int + strand: str + exon_number: int + parent_id: str + transcript_id: str + + @property + def key(self) -> str: + return f"{self.transcript_id}:{self.seqid}" + + +def _sort_exons(exons: List[GFFRecord]) -> List[GFFRecord]: + return sorted(exons, key=lambda e: e.exon_number) + + +def parse_gff_record(line: str) -> Optional[GFFRecord]: + """Parses a single line from a GFF file and returns a GFFRecord if record is an exon aligned to an NC_ chromosome and has a transcript id starting with NM_ or NR_.""" + # NC_000001.10 BestRefSeq exon 11874 12227 . + . ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD/H-box helicase 11 like 1 (pseudogene);pseudo=true;transcript_id=NR_046018.2 + # NC_000001.10 BestRefSeq exon 12613 12721 . + . ID=exon-NR_046018.2-2;Parent=rna-NR_046018.2;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD/H-box helicase 11 like 1 (pseudogene);pseudo=true;transcript_id=NR_046018.2 + # NC_000001.10 BestRefSeq exon 13221 14409 . + . 
ID=exon-NR_046018.2-3;Parent=rna-NR_046018.2;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD/H-box helicase 11 like 1 (pseudogene);pseudo=true;transcript_id=NR_046018.2 + + fields = line.strip().split("\t") + if len(fields) != 9: + raise ValueError(f"Expected 9 tab-separated fields, got {len(fields)}") + + seqid, source, feature, start, end, score, strand, phase, attributes_str = fields + + if feature != "exon": + return + + attributes = {} + for attr_str in attributes_str.split(";"): + if "=" in attr_str: + key, value = attr_str.split("=") + attributes[key.lower()] = value + + parent_id = attributes.get("parent") + transcript_id = attributes.get("transcript_id") + if ( + not transcript_id + or (not transcript_id.startswith("NM_") and not transcript_id.startswith("NR_")) + or not parent_id + ): + return + try: + exon_number = _get_exon_number_from_id(alignment_id=attributes.get("id")) + except (ValueError, IndexError): + raise ValueError(f'Failed to parse exon number from {attributes.get("id")}') + + return GFFRecord( + seqid=seqid, + start=int(start), + end=int(end), + strand=strand, + exon_number=exon_number, + parent_id=parent_id, + transcript_id=transcript_id, + ) + + +def _get_exon_number_from_id(alignment_id: str) -> int: + """ + Pulls the exon number from the alignment id. Expects the id to be in the format + exon-- + """ + return int(alignment_id.split("-")[-1]) + + +def parse_gff_files(file_paths: List[str]) -> dict[str, List[GFFRecord]]: + tx_data = defaultdict(list) + for file_path in file_paths: + with open_file(file_path) as f: + for line in f: + if line.startswith("#"): + continue + try: + record = parse_gff_record(line) + except ValueError as e: + raise Exception(f"Failed at line :{line} with error: {e}") + if record: + tx_data[record.key].append(record) + return {k: _sort_exons(v) for k, v in tx_data.items()} + + +def get_zero_based_exon_ranges(transcript_exons: List[GFFRecord]) -> str: + """Convert exon ranges to 0-based half-open format""" + formatted_exons = [] + for ex in transcript_exons: + formatted_exons.append(",".join(map(str, (ex.start - 1, ex.end)))) + return ";".join(formatted_exons) + + +if __name__ == "__main__": + logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") + logging.config.fileConfig(logging_conf_fn) + logging.getLogger().setLevel(logging.INFO) + logger = logging.getLogger(__name__) + + parser = argparse.ArgumentParser(description="Parse GFF file.") + parser.add_argument("gff_files", nargs="+", type=str, help="Path to GFF file(s)") + args = parser.parse_args() + + gff_files = args.gff_files + esw = ExonSetWriter(sys.stdout) + + transcript_alignments = parse_gff_files(gff_files) + logger.info( + f"read {len(transcript_alignments)} transcript alignments from file(s): {', '.join(gff_files)}" + ) + + for transcript_exons in transcript_alignments.values(): + exons_se = get_zero_based_exon_ranges(transcript_exons) + e = transcript_exons[0] + es = ExonSet( + tx_ac=e.transcript_id, + alt_ac=e.seqid, + method="splign", + strand=-1 if e.strand == "-" else 1, + exons_se_i=exons_se, + ) + esw.write(es) diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py new file mode 100755 index 0000000..b414f2e --- /dev/null +++ b/sbin/ncbi_process_mito.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python + +""" +Download mito fasta and gbff file. 
Use BioPython to parse the features in the Mitochondrial genbank file to get +the attributes of a region of the genome that correspond to genes along with their attributes. Output gene/tx/alignment +details to intermediate file needed to update UTA database and SeqRepo. + + FEATURES Location/Qualifiers + source 1..16569 + /organism="Homo sapiens" + /organelle="mitochondrion" + /mol_type="genomic DNA" + /isolation_source="caucasian" + /db_xref="taxon:9606" + /tissue_type="placenta" + /country="United Kingdom: Great Britain" + /note="this is the rCRS" + D-loop complement(join(16024..16569,1..576)) + gene 577..647 + /gene="TRNF" + /nomenclature="Official Symbol: MT-TF | Name: + mitochondrially encoded tRNA phenylalanine | Provided by: + HGNC:HGNC:7481" + /db_xref="GeneID:4558" + /db_xref="HGNC:HGNC:7481" + /db_xref="MIM:590070" + tRNA 577..647 + /gene="TRNF" + /product="tRNA-Phe" + /note="NAR: 1455" + /anticodon=(pos:611..613,aa:Phe,seq:gaa) + /codon_recognized="UUC" + /db_xref="GeneID:4558" + /db_xref="HGNC:HGNC:7481" + /db_xref="MIM:590070" + gene 648..1601 + /gene="RNR1" + /gene_synonym="MTRNR1" + /nomenclature="Official Symbol: MT-RNR1 | Name: + mitochondrially encoded 12S RNA | Provided by: + HGNC:HGNC:7470" + /db_xref="GeneID:4549" + /db_xref="HGNC:HGNC:7470" + /db_xref="MIM:561000" + rRNA 648..1601 + /gene="RNR1" + /gene_synonym="MTRNR1" + /product="s-rRNA" + /note="12S rRNA; 12S ribosomal RNA" + /db_xref="GeneID:4549" + /db_xref="HGNC:HGNC:7470" + /db_xref="MIM:561000" + ... +""" +import argparse +import dataclasses +import gzip +import importlib_resources +import logging +import logging.config +from typing import Dict, Iterable, List, Optional + +from Bio.Seq import Seq +import Bio.SeqIO +from Bio.SeqFeature import SeqFeature +from Bio.SeqRecord import SeqRecord +from bioutils.digests import seq_md5 +from more_itertools import one + +from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter +from uta.formats.geneinfo import GeneInfo, GeneInfoWriter +from uta.formats.seqinfo import SeqInfo, SeqInfoWriter +from uta.formats.txinfo import TxInfo, TxInfoWriter +from uta.formats.exonset import ExonSet, ExonSetWriter +from uta.tools.eutils import download_from_eutils, NcbiFileFormatEnum + + +@dataclasses.dataclass +class MitoGeneData: + gene_id: int + gene_symbol: str + name: str + synonym: str + xrefs: List[str] + type: str + tx_ac: str + tx_seq: str + tx_start: int + tx_end: int + alt_ac: str + alt_start: int + alt_end: int + strand: int + origin: str = "NCBI" + alignment_method: str = "splign" + transl_table: Optional[str] = None + transl_except: Optional[List[str]] = None + pro_ac: Optional[str] = None + pro_seq: Optional[str] = None + + def exons_se_i(self) -> str: + return f"{self.tx_start},{self.tx_end}" + + def cds_se_i(self) -> str: + return self.exons_se_i() if self.pro_ac else "" + + def alt_exons_se_i(self) -> str: + return f"{self.alt_start},{self.alt_end}" + + +logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf") +logging.config.fileConfig(logging_conf_fn) +logging.getLogger().setLevel(logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("accession", type=str) + parser.add_argument("--output-dir", "-o", default=".", type=str) + return parser.parse_args() + + +def download_mito_files(output_dir: str, accession: str) -> Dict[str, str]: + logger.info(f"downloading files for {accession}") + mt_gb_filepath = 
f"{output_dir}/{accession}.gbff" + mt_fa_filepath = f"{output_dir}/{accession}.fna" + + logger.info(f"downloading {NcbiFileFormatEnum.GENBANK} file to {mt_gb_filepath}") + download_from_eutils(accession, NcbiFileFormatEnum.GENBANK, mt_gb_filepath) + + logger.info(f"downloading {NcbiFileFormatEnum.FASTA} file to {mt_fa_filepath}") + download_from_eutils(accession, NcbiFileFormatEnum.FASTA, mt_fa_filepath) + + return {"gbff": mt_gb_filepath, "fna": mt_fa_filepath} + + +def parse_db_xrefs(gb_feature: SeqFeature) -> Dict[str, str]: + """ + Example: + Key: db_xref + Value: ['GeneID:4558', 'HGNC:HGNC:7481', 'MIM:590070'] + """ + return { + x.partition(":")[0].strip(): x.partition(":")[2].strip() + for x in gb_feature.qualifiers.get("db_xref", []) + } + + +def parse_nomenclature_value(gb_feature: SeqFeature) -> Dict[str, str]: + """ + Example: + Key: nomenclature + Value: ['Official Symbol: MT-TF | Name: mitochondrially encoded tRNA phenylalanine | Provided by: HGNC:HGNC:7481'] + """ + nomenclature_key = "nomenclature" + nomenclature_results: Dict[str, str] = {} + if nomenclature_key in gb_feature.qualifiers: + nomenclature_list = list( + map( + lambda x: x.strip(), + one(gb_feature.qualifiers[nomenclature_key]).split("|"), + ) + ) + nomenclature_results = { + x.partition(":")[0].strip(): x.partition(":")[2].strip() + for x in nomenclature_list + } + + return nomenclature_results + + +def get_mito_genes(gbff_filepath: str) -> Iterable[MitoGeneData]: + logger.info(f"processing NCBI GBFF file from {gbff_filepath}") + with open(gbff_filepath) as fh: + # Bio.SeqIO.parse(fh, "gb") returns an empty iterator for .fna files and does not fail + for record in Bio.SeqIO.parse(fh, "gb"): + for feature in record.features: + xrefs = parse_db_xrefs(feature) + + feature_start, feature_end = ( + feature.location.start, + feature.location.end, + ) + + # dependent on feature type, process data and output if appropriate + if feature.type == "gene": + # assert subsequent features represent the same location + assert feature_start == feature.location.start + assert feature_end == feature.location.end + # for gene feature do not yield anything, just set gene level attributes + gene_id = int(xrefs["GeneID"]) + nomenclature = parse_nomenclature_value(feature) + hgnc = nomenclature["Official Symbol"] + name = nomenclature["Name"] + + elif feature.type in ("tRNA", "rRNA", "CDS"): + # assert subsequent features represent the same location and gene + assert int(xrefs["GeneID"]) == gene_id + assert feature_start == feature.location.start + assert feature_end == feature.location.end + + # retrieve sequence, and reverse compliment if on reverse strand + ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}" + feature_seq = record.seq[feature_start:feature_end] + gene_synonym = feature.qualifiers.get("gene_synonym", "") + type = feature.type + if feature.location.strand == -1: + feature_seq = feature_seq.reverse_complement() + + if feature.type == "CDS": + # override defaults for CDS features + type = "protein-coding" + pro_ac = one(feature.qualifiers["protein_id"]) + pro_seq = str(one(feature.qualifiers["translation"])) + transl_table = one(feature.qualifiers["transl_table"]) + transl_except = feature.qualifiers.get("transl_except") + else: + pro_ac = None + pro_seq = None + transl_table = None + transl_except = None + + # yield gene data + yield MitoGeneData( + gene_id=gene_id, + gene_symbol=hgnc, + name=name, + synonym=gene_synonym, + xrefs=[f"{k}:{v}" for k, v in xrefs.items()], + type=type, + tx_ac=ac, + 
tx_seq=str(feature_seq), + tx_start=0, + tx_end=feature.location.end - feature.location.start, + alt_ac=record.id, + alt_start=feature_start, + alt_end=feature_end, + strand=feature.location.strand, + transl_table=transl_table, + transl_except=transl_except, + pro_ac=pro_ac, + pro_seq=pro_seq, + ) + + +def main(ncbi_accession: str, output_dir: str) -> None: + # get input files + input_files = download_mito_files(output_dir=output_dir, accession=ncbi_accession) + + # extract Mitochondrial gene information + mito_genes = [mg for mf in input_files.values() for mg in get_mito_genes(mf)] + logger.info(f"found {len(mito_genes)} genes from parsing {input_files['gbff']}") + + # write gene information + with gzip.open(f"{output_dir}/geneinfo.gz", "wt") as o_file: + giw = GeneInfoWriter(o_file) + for mg in mito_genes: + giw.write( + GeneInfo( + mg.gene_id, + mg.gene_symbol, + 9606, + mg.gene_symbol, + "", + mg.synonym, + mg.type, + mg.name, + mg.name, + mg.xrefs, + ) + ) + + # write gene accession associations + with gzip.open(f"{output_dir}/assocacs.gz", "wt") as o_file: + gaw = GeneAccessionsWriter(o_file) + for mg in mito_genes: + if mg.pro_ac is not None: + gaw.write( + GeneAccessions( + mg.gene_symbol, mg.tx_ac, mg.gene_id, mg.pro_ac, mg.origin + ) + ) + + # write sequence information + with gzip.open(f"{output_dir}/seqinfo.gz", "wt") as o_file: + siw = SeqInfoWriter(o_file) + for mg in mito_genes: + siw.write( + SeqInfo( + seq_md5(mg.tx_seq), + mg.origin, + mg.tx_ac, + mg.name, + len(mg.tx_seq), + None, + ) + ) + if mg.pro_ac is not None: + siw.write( + SeqInfo( + seq_md5(mg.pro_seq), + mg.origin, + mg.pro_ac, + mg.name, + len(mg.pro_seq), + None, + ) + ) + + # write out transcript sequence fasta files. + with gzip.open(f"{output_dir}/{ncbi_accession}.rna.fna.gz", "wt") as o_file: + for mg in mito_genes: + record = SeqRecord( + Seq(mg.tx_seq), + id=mg.tx_ac, + description=mg.name, + ) + o_file.write(record.format("fasta")) + + # write out protein sequence fasta files. 
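+    # Note: only CDS features set pro_ac/pro_seq in get_mito_genes(), so the protein FASTA
+    # below contains records for protein-coding genes only; tRNA/rRNA entries are skipped.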
+ with gzip.open(f"{output_dir}/{ncbi_accession}.protein.faa.gz", "wt") as o_file: + for mg in mito_genes: + if mg.pro_ac is not None: + record = SeqRecord( + Seq(mg.pro_seq), + id=mg.pro_ac, + description=mg.name, + ) + o_file.write(record.format("fasta")) + + # write transcript information + with gzip.open(f"{output_dir}/txinfo.gz", "wt") as o_file: + tiw = TxInfoWriter(o_file) + for mg in mito_genes: + tiw.write( + TxInfo( + mg.origin, + mg.tx_ac, + mg.gene_id, + mg.gene_symbol, + mg.cds_se_i(), + mg.exons_se_i(), + mg.transl_table, + TxInfo.serialize_transl_except(mg.transl_except), + ) + ) + + # write exonset + with gzip.open(f"{output_dir}/exonsets.gz", "wt") as o_file: + esw = ExonSetWriter(o_file) + for mg in mito_genes: + esw.write( + ExonSet( + mg.tx_ac, + mg.alt_ac, + mg.alignment_method, + mg.strand, + mg.alt_exons_se_i(), + ) + ) + + +if __name__ == "__main__": + args = parse_args() + main(args.accession, args.output_dir) diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load new file mode 100755 index 0000000..95b662f --- /dev/null +++ b/sbin/seqrepo-load @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +sequence_dir=$1 +log_dir=$2 + +if [ -z "$sequence_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/seqrepo-load ' + exit 1 +fi + +# find all fasta files in the working directory +mapfile -t FASTA_FILES < <(find "$sequence_dir" -type f -name "*.f[an]a*") + +# Load SeqRepo with new sequences +seqrepo --root-directory "/biocommons/dl.biocommons.org/seqrepo" \ + load -n NCBI --instance-name "master" \ + "${FASTA_FILES[@]}" 2>&1 | \ + tee "$log_dir/seqrepo-load.log" diff --git a/sbin/seqrepo-pull b/sbin/seqrepo-pull new file mode 100755 index 0000000..894dffa --- /dev/null +++ b/sbin/seqrepo-pull @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +SEQREPO_DIR="/biocommons/dl.biocommons.org/seqrepo" + +# pull the latest seqrepo version from biocommons +latest_version=$(seqrepo list-remote-instances | tail -n 1 | xargs) +cd "$SEQREPO_DIR" +rsync -rtHP --no-motd dl.biocommons.org::seqrepo/"$latest_version" . 
+ +# setup seqrepo build directory +mkdir -p master/sequences +cd "$latest_version" +cp -av aliases.sqlite3 "$SEQREPO_DIR"/master/ +chmod u+w "$SEQREPO_DIR"/master/aliases.sqlite3 +cd sequences +cp -av db.sqlite3 "$SEQREPO_DIR"/master/sequences/ +chmod u+w "$SEQREPO_DIR"/master/sequences/db.sqlite3 +for d in 2???; do + cp -alv $d "$SEQREPO_DIR"/master/sequences/ +done diff --git a/sbin/update-ncbi b/sbin/update-ncbi index 33211c7..41271d3 100755 --- a/sbin/update-ncbi +++ b/sbin/update-ncbi @@ -88,7 +88,7 @@ if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger() - url = "postgresql://uta_admin@localhost/uta_dev" + url = "postgresql://uta_admin@localhost/uta" uta, ncbi = sys.argv[1:3] conn = psycopg2.connect(url) diff --git a/sbin/uta-diff b/sbin/uta-diff index 489562d..d89798d 100755 --- a/sbin/uta-diff +++ b/sbin/uta-diff @@ -14,7 +14,8 @@ cmp_cols = collections.defaultdict(lambda: ['*']) cmp_cols.update({ "associated_accessions": "tx_ac pro_ac origin".split(), "exon_aln": "exon_aln_id tx_exon_id alt_exon_id cigar added".split(), - "gene": "hgnc".split(), + "gene": "gene_id".split(), + "seq_anno": "seq_anno_id seq_id origin_id ac added".split(), "transcript": "ac".split(), }) @@ -41,10 +42,10 @@ def cmp1(con, tbl, s1, s2): if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) + logging.basicConfig(level=logging.INFO) logger = logging.getLogger() - url = "postgresql://uta_admin@localhost/uta_dev" + url = "postgresql://uta_admin@localhost/uta" tables = ["associated_accessions", "exon", "exon_aln", "exon_set", "gene", "meta", "origin", "seq", "seq_anno", "transcript",] @@ -66,7 +67,7 @@ if __name__ == "__main__": print("""UTA comparison: url={url}, s1={s1}, s2={s2} t: time taken (seconds) n1, n2: total number of rows in schemas s1 and s2 -nu1, nu2, c: number of rows unique to s1, unique to s2, and common to both +nu1, nu2, nc: number of rows unique to s1, unique to s2, and common to both cols: cols used for comparison """.format(url=url, s1=s1, s2=s2)) print(pt) diff --git a/sbin/uta-extract b/sbin/uta-extract new file mode 100755 index 0000000..9dbf716 --- /dev/null +++ b/sbin/uta-extract @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# Extract data from NCBI files into intermediate files. + +set -euxo pipefail + +ncbi_dir=$1 +working_dir=$2 +log_dir=$3 + +if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ] +then + echo 'Usage: sbin/uta-extract ' + exit 1 +fi + +# genes +sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz | \ + gzip -c > "$working_dir/geneinfo.gz" 2>&1 | tee "$log_dir/ncbi-parse-geneinfo.log" + +# transcript protein associations +sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2refseq.gz | gzip -c > "$working_dir/assocacs.gz" 2>&1 | \ + tee "$log_dir/ncbi-fetch-assoc-acs.log" + +# parse transcript info from GBFF input files +mapfile -t GBFF_FILES < <(find "$ncbi_dir/refseq" -type f -name "human.*.rna.gbff.gz") +sbin/ncbi-parse-gbff "${GBFF_FILES[@]}" | gzip -c > "$working_dir/txinfo.gz" 2>&1 | \ + tee "$log_dir/ncbi-parse-gbff.log" + +# parse alignments from GFF input files +# Due to NCBI's handling of transcripts with "frameshifting insertions and deletions with micro-introns" we +# need to parse out the cDNA_match alignment and use them preferentially over exons from genome GFF files. +# The cDNA_match records include the indels and do not have micro-introns. 
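+# Because coalesce_exonsets.py (run below) keeps the first exonset it sees for each tx_ac/alt_ac
+# pair, the cDNA_match exonsets are passed first and the exon-block exonsets from
+# ncbi_parse_genomic_gff.py only fill in transcript/genome pairs with no cDNA_match alignment.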
+mapfile -t GFF_FILES < <(find "$ncbi_dir/genomes" -type f -name "GCF_*_genomic.gff.gz")
+sbin/ncbi-parse-gff "${GFF_FILES[@]}" | gzip -c > "$working_dir/cdna_match.exonsets.gz" 2>&1 | \
+    tee "$log_dir/ncbi_parse_gff.log"
+
+# extract exon blocks from GFF files
+sbin/ncbi_parse_genomic_gff.py "${GFF_FILES[@]}" | gzip -c > "$working_dir/exon_block.exonsets.gz" 2>&1 | \
+    tee "$log_dir/ncbi-parse-genomic-gff.log"
+
+# coalesce exonsets
+sbin/coalesce_exonsets.py "$working_dir/cdna_match.exonsets.gz" "$working_dir/exon_block.exonsets.gz" | \
+    gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | tee "$log_dir/coalesce_exonsets.log"
+
+# filter transcripts
+sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
+    --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
+    tee "$log_dir/filter_exonset_transcripts.log"
+
+# move fasta files into same dir
+find "$ncbi_dir" -type f -name "*.f[an]a.gz" -print0 | xargs -i --null cp {} "$working_dir/"
diff --git a/sbin/uta-load b/sbin/uta-load
new file mode 100755
index 0000000..d419007
--- /dev/null
+++ b/sbin/uta-load
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+# This script updates UTA and SeqRepo using NCBI files.
+# source_uta_v is the UTA version before the update.
+# ncbi_dir is where the script looks for NCBI data files.
+# working_dir stores intermediate data files and the final database dump.
+# log_dir stores log files.
+
+# Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo.
+
+set -euxo pipefail
+
+source_uta_v=$1
+dest_uta_v=$2
+ncbi_dir=$3
+working_dir=$4
+log_dir=$5
+
+if [ -z "$source_uta_v" ] || [ -z "$dest_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
+then
+    echo 'Usage: uta-load <source_uta_v> <dest_uta_v> <ncbi_dir> <working_dir> <log_dir>'
+    exit 1
+fi
+
+# set local variables and create working directories
+loading_uta_v="uta"
+mkdir -p "$log_dir"
+
+## Drop loading schema, and recreate
+etc/scripts/delete-schema.sh "$loading_uta_v"
+etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v"
+
+## apply any outstanding alembic migrations and update schema version if necessary
+alembic -c etc/alembic.ini upgrade head
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf update-meta-data
+
+# generate seqinfo files from exonsets (this step requires seqrepo)
+sbin/exonset-to-seqinfo -o NCBI "$working_dir/exonsets.gz" | gzip -c > "$working_dir/seqinfo.gz" 2>&1 | \
+    tee "$log_dir/exonset-to-seqinfo.log"
+
+# Filter out columns from assocacs file.
+sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-ac.gz" 2>&1 | \
+    tee "$log_dir/assoc-acs-merge.log"
+
+# Load genes into gene table.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
+    tee "$log_dir/load-geneinfo.log"
+
+# Load accessions into associated_accessions table.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \
+    tee "$log_dir/load-assoc-ac.log"
+
+# Load transcript info into transcript and exon_set tables.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$working_dir/txinfo.gz" 2>&1 | \
+    tee "$log_dir/load-txinfo.log"
+
+# Load exon sets into the exon_set and exon tables.
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$working_dir/exonsets.gz" 2>&1 | \ + tee "$log_dir/load-exonsets.log" + +# Load seqinfo into the seq and seqanno tables. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-seqinfo "$working_dir/seqinfo.gz" 2>&1 | \ + tee "$log_dir/load-seqinfo.log" + +# Create cigar strings for all rows in tx_alt_exon_pairs_v view and update exon_aln table. +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 | \ + tee "$log_dir/align-exons.log" + +### run diff +sbin/uta-diff "$source_uta_v" "$loading_uta_v" + +## Rename schema to destination schema name and export to dump file +psql -h localhost -U uta_admin -d uta -c "DROP SCHEMA IF EXISTS $dest_uta_v CASCADE;" +psql -h localhost -U uta_admin -d uta -c "ALTER SCHEMA uta RENAME TO $dest_uta_v"; +pg_dump -h localhost -U uta_admin -d uta -n "$dest_uta_v" | \ + gzip -c > "$working_dir/$dest_uta_v.pgd.gz" diff --git a/src/alembic/README b/src/alembic/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/src/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/src/alembic/env.py b/src/alembic/env.py new file mode 100644 index 0000000..1e8e830 --- /dev/null +++ b/src/alembic/env.py @@ -0,0 +1,92 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +from uta.models import Base +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def include_name(name, type_, parent_names) -> bool: + if type_ == "schema": + return name in ["uta"] + else: + return True + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. 
+ + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, + target_metadata=target_metadata, + version_table_schema=target_metadata.schema, + include_schemas=True, + include_name=include_name, + ) + + with context.begin_transaction(): + context.execute(f'create schema if not exists {target_metadata.schema};') + context.execute(f'set search_path to {target_metadata.schema}') + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/src/alembic/script.py.mako b/src/alembic/script.py.mako new file mode 100644 index 0000000..fbc4b07 --- /dev/null +++ b/src/alembic/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/src/alembic/versions/14eed54ff90d_create_translation_exception_table.py b/src/alembic/versions/14eed54ff90d_create_translation_exception_table.py new file mode 100644 index 0000000..b55a336 --- /dev/null +++ b/src/alembic/versions/14eed54ff90d_create_translation_exception_table.py @@ -0,0 +1,37 @@ +"""create translation_exception table + +Revision ID: 14eed54ff90d +Revises: f85dd97bd9f5 +Create Date: 2024-04-25 23:57:12.455316 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '14eed54ff90d' +down_revision: Union[str, None] = 'f85dd97bd9f5' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + 'translation_exception', + sa.Column('translation_exception_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('tx_ac', sa.Text(), nullable=False), + sa.Column('start_position', sa.Integer(), nullable=False), + sa.Column('end_position', sa.Integer(), nullable=False), + sa.Column('amino_acid', sa.Text(), nullable=False), + sa.CheckConstraint('start_position <= end_position', name='start_less_than_or_equal_to_end'), + sa.ForeignKeyConstraint(['tx_ac'], ['uta.transcript.ac'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('translation_exception_id'), + schema='uta', + ) + + +def downgrade() -> None: + op.drop_table('translation_exception', schema='uta') diff --git a/src/alembic/versions/19561fe444c8_create_materialized_view_for_tx_exon_.py b/src/alembic/versions/19561fe444c8_create_materialized_view_for_tx_exon_.py new file mode 100644 index 0000000..ae21036 --- /dev/null +++ b/src/alembic/versions/19561fe444c8_create_materialized_view_for_tx_exon_.py @@ -0,0 +1,32 @@ +"""create materialized view for tx_exon_aln_v + +Revision ID: 19561fe444c8 +Revises: f885cb84efce +Create Date: 2024-05-07 21:59:09.078549 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '19561fe444c8' +down_revision: Union[str, None] = 'f885cb84efce' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_aln_mv CASCADE;") + op.execute(""" + CREATE MATERIALIZED VIEW tx_exon_aln_mv AS SELECT * FROM tx_exon_aln_v WITH NO DATA; + CREATE INDEX tx_exon_aln_mv_tx_alt_ac_ix ON tx_exon_aln_mv(tx_ac, alt_ac, alt_aln_method); + GRANT SELECT ON tx_exon_set_summary_mv TO public; + REFRESH MATERIALIZED VIEW tx_exon_aln_mv; + """) + + +def downgrade() -> None: + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_aln_mv CASCADE;") diff --git a/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py b/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py new file mode 100644 index 0000000..06156bb --- /dev/null +++ b/src/alembic/versions/595a586e6de7_add_gene_id_to_gene_and_transcript.py @@ -0,0 +1,44 @@ +"""add gene_id to gene and transcript + +Revision ID: 595a586e6de7 +Revises: a697b584f699 +Create Date: 2024-04-10 19:47:43.685672 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '595a586e6de7' +down_revision: Union[str, None] = 'a697b584f699' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column('gene', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta') + op.add_column('gene', sa.Column('type', sa.Text(), nullable=True), schema='uta') + op.add_column('gene', sa.Column('xrefs', sa.Text(), nullable=True), schema='uta') + op.add_column('transcript', sa.Column('gene_id', sa.Text(), nullable=True), schema='uta') + # ### end Alembic commands ### + + # ### commands to drop existing primary key on gene table ### + op.drop_constraint('gene_pkey', 'gene', schema='uta') + # ### end of commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('transcript', 'gene_id', schema='uta') + op.drop_column('gene', 'xrefs', schema='uta') + op.drop_column('gene', 'type', schema='uta') + op.drop_column('gene', 'gene_id', schema='uta') + # ### end Alembic commands ### + + # ### commands to add primary key on gene table ### + op.create_primary_key('gene_pkey', 'gene', ['hgnc'], schema='uta') + # ### end of commands ### diff --git a/src/alembic/versions/77076df4224c_add_tx_hgnc_index.py b/src/alembic/versions/77076df4224c_add_tx_hgnc_index.py new file mode 100644 index 0000000..0a684a2 --- /dev/null +++ b/src/alembic/versions/77076df4224c_add_tx_hgnc_index.py @@ -0,0 +1,30 @@ +"""hgnc test + +Revision ID: 77076df4224c +Revises: 19561fe444c8 +Create Date: 2024-08-26 17:08:13.160259 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '77076df4224c' +down_revision: Union[str, None] = '19561fe444c8' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_index(op.f('ix_uta_transcript_hgnc'), 'transcript', ['hgnc'], unique=False, schema='uta') + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_uta_transcript_hgnc'), table_name='transcript', schema='uta') + # ### end Alembic commands ### diff --git a/src/alembic/versions/a697b584f699_add_codon_table_to_transcript.py b/src/alembic/versions/a697b584f699_add_codon_table_to_transcript.py new file mode 100644 index 0000000..d440495 --- /dev/null +++ b/src/alembic/versions/a697b584f699_add_codon_table_to_transcript.py @@ -0,0 +1,33 @@ +"""add codon_table to Transcript + +Revision ID: a697b584f699 +Revises: cc51f50ae896 +Create Date: 2024-04-08 17:27:41.570024 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'a697b584f699' +down_revision: Union[str, None] = 'cc51f50ae896' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('transcript', sa.Column('codon_table', sa.Text(), nullable=True), schema='uta') + # ### end Alembic commands ### + # ### population of codon_table column with data ### + op.execute("UPDATE transcript SET codon_table = '1' WHERE cds_start_i NOTNULL;") + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('transcript', 'codon_table', schema='uta') + # ### end Alembic commands ### diff --git a/src/alembic/versions/cc51f50ae896_add_sqlalchemy_model_for_assocacs.py b/src/alembic/versions/cc51f50ae896_add_sqlalchemy_model_for_assocacs.py new file mode 100644 index 0000000..c8ee756 --- /dev/null +++ b/src/alembic/versions/cc51f50ae896_add_sqlalchemy_model_for_assocacs.py @@ -0,0 +1,44 @@ +"""add sqlalchemy model for assocacs + +Revision ID: cc51f50ae896 +Revises: edadb97f6502 +Create Date: 2024-04-05 00:33:40.105587 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'cc51f50ae896' +down_revision: Union[str, None] = 'edadb97f6502' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('associated_accessions', 'tx_ac', + existing_type=sa.TEXT(), + nullable=False, + schema='uta') + op.alter_column('associated_accessions', 'pro_ac', + existing_type=sa.TEXT(), + nullable=False, + schema='uta') + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('associated_accessions', 'pro_ac', + existing_type=sa.TEXT(), + nullable=True, + schema='uta') + op.alter_column('associated_accessions', 'tx_ac', + existing_type=sa.TEXT(), + nullable=True, + schema='uta') + # ### end Alembic commands ### diff --git a/src/alembic/versions/edadb97f6502_initial_state.py b/src/alembic/versions/edadb97f6502_initial_state.py new file mode 100644 index 0000000..ba46093 --- /dev/null +++ b/src/alembic/versions/edadb97f6502_initial_state.py @@ -0,0 +1,390 @@ +"""initial state + +Revision ID: edadb97f6502 +Revises: +Create Date: 2024-04-03 21:41:05.875580 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + + +# revision identifiers, used by Alembic. +revision: str = 'edadb97f6502' +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('gene', + sa.Column('hgnc', sa.Text(), nullable=False), + sa.Column('maploc', sa.Text(), nullable=True), + sa.Column('descr', sa.Text(), nullable=True), + sa.Column('summary', sa.Text(), nullable=True), + sa.Column('aliases', sa.Text(), nullable=True), + sa.Column('added', sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint('hgnc'), + schema='uta' + ) + op.create_table('meta', + sa.Column('key', sa.Text(), nullable=False), + sa.Column('value', sa.Text(), nullable=False), + sa.PrimaryKeyConstraint('key'), + schema='uta' + ) + op.create_table('origin', + sa.Column('origin_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('name', sa.Text(), nullable=False), + sa.Column('descr', sa.Text(), nullable=True), + sa.Column('updated', sa.DateTime(), nullable=True), + sa.Column('url', sa.Text(), nullable=True), + sa.Column('url_ac_fmt', sa.Text(), nullable=True), + sa.PrimaryKeyConstraint('origin_id'), + sa.UniqueConstraint('name'), + schema='uta' + ) + op.create_table('seq', + sa.Column('seq_id', sa.Text(), nullable=False), + sa.Column('len', sa.Integer(), nullable=False), + sa.Column('seq', sa.Text(), nullable=True), + sa.PrimaryKeyConstraint('seq_id'), + schema='uta' + ) + op.create_table('seq_anno', + sa.Column('seq_anno_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('seq_id', sa.Text(), nullable=True), + sa.Column('origin_id', sa.Integer(), nullable=False), + sa.Column('ac', sa.Text(), nullable=False), + sa.Column('descr', sa.Text(), nullable=True), + sa.Column('added', sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint(['origin_id'], ['uta.origin.origin_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.ForeignKeyConstraint(['seq_id'], ['uta.seq.seq_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('seq_anno_id'), + schema='uta' + ) + op.create_index(op.f('ix_uta_seq_anno_ac'), 'seq_anno', ['ac'], unique=False, schema='uta') + op.create_index(op.f('ix_uta_seq_anno_seq_id'), 'seq_anno', ['seq_id'], unique=False, schema='uta') + op.create_index('seq_anno_ac_unique_in_origin', 'seq_anno', ['origin_id', 'ac'], unique=True, schema='uta') + op.create_table('transcript', + sa.Column('ac', sa.Text(), nullable=False), + sa.Column('origin_id', sa.Integer(), nullable=False), + sa.Column('hgnc', sa.Text(), nullable=True), + sa.Column('cds_start_i', sa.Integer(), nullable=True), + sa.Column('cds_end_i', sa.Integer(), nullable=True), + sa.Column('cds_md5', sa.Text(), nullable=True), + sa.Column('added', sa.DateTime(), nullable=False), + sa.CheckConstraint('cds_start_i <= cds_end_i', name='cds_start_i_must_be_le_cds_end_i'), + sa.ForeignKeyConstraint(['origin_id'], ['uta.origin.origin_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('ac'), + schema='uta' + ) + op.create_index(op.f('ix_uta_transcript_cds_md5'), 'transcript', ['cds_md5'], unique=False, schema='uta') + op.create_index(op.f('ix_uta_transcript_origin_id'), 'transcript', ['origin_id'], unique=False, schema='uta') + op.create_table('exon_set', + sa.Column('exon_set_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('tx_ac', sa.Text(), nullable=False), + sa.Column('alt_ac', sa.Text(), nullable=False), + sa.Column('alt_strand', sa.SmallInteger(), nullable=False), + sa.Column('alt_aln_method', sa.Text(), nullable=False), + sa.Column('added', sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint(['tx_ac'], ['uta.transcript.ac'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('exon_set_id'), + 
sa.UniqueConstraint('tx_ac', 'alt_ac', 'alt_aln_method', name='<tx_ac,alt_ac,alt_aln_method> must be unique'),
+    schema='uta'
+    )
+    op.create_table('exon',
+    sa.Column('exon_id', sa.Integer(), autoincrement=True, nullable=False),
+    sa.Column('exon_set_id', sa.Integer(), nullable=False),
+    sa.Column('start_i', sa.Integer(), nullable=False),
+    sa.Column('end_i', sa.Integer(), nullable=False),
+    sa.Column('ord', sa.Integer(), nullable=False),
+    sa.Column('name', sa.Text(), nullable=True),
+    sa.CheckConstraint('start_i < end_i', name='exon_start_i_must_be_lt_end_i'),
+    sa.ForeignKeyConstraint(['exon_set_id'], ['uta.exon_set.exon_set_id'], onupdate='CASCADE', ondelete='CASCADE'),
+    sa.PrimaryKeyConstraint('exon_id'),
+    sa.UniqueConstraint('exon_set_id', 'end_i', name='end_i_must_be_unique_in_exon_set'),
+    sa.UniqueConstraint('exon_set_id', 'start_i', name='start_i_must_be_unique_in_exon_set'),
+    schema='uta'
+    )
+    op.create_index(op.f('ix_uta_exon_exon_set_id'), 'exon', ['exon_set_id'], unique=False, schema='uta')
+    op.create_table('exon_aln',
+    sa.Column('exon_aln_id', sa.Integer(), autoincrement=True, nullable=False),
+    sa.Column('tx_exon_id', sa.Integer(), nullable=False),
+    sa.Column('alt_exon_id', sa.Integer(), nullable=False),
+    sa.Column('cigar', sa.Text(), nullable=False),
+    sa.Column('added', sa.DateTime(), nullable=False),
+    sa.Column('tx_aseq', sa.Text(), nullable=True),
+    sa.Column('alt_aseq', sa.Text(), nullable=True),
+    sa.ForeignKeyConstraint(['alt_exon_id'], ['uta.exon.exon_id'], onupdate='CASCADE', ondelete='CASCADE'),
+    sa.ForeignKeyConstraint(['tx_exon_id'], ['uta.exon.exon_id'], onupdate='CASCADE', ondelete='CASCADE'),
+    sa.PrimaryKeyConstraint('exon_aln_id'),
+    schema='uta'
+    )
+    op.create_index(op.f('ix_uta_exon_aln_alt_exon_id'), 'exon_aln', ['alt_exon_id'], unique=False, schema='uta')
+    op.create_index(op.f('ix_uta_exon_aln_tx_exon_id'), 'exon_aln', ['tx_exon_id'], unique=False, schema='uta')
+    # ### end Alembic commands ###
+
+    # ### custom commands to match the initial UTA database schema 1.1 ###
+    op.create_table('associated_accessions',
+    sa.Column('associated_accession_id', sa.Integer(), autoincrement=True, nullable=False),
+    sa.Column('tx_ac', sa.Text(), nullable=True),
+    sa.Column('pro_ac', sa.Text(), nullable=True),
+    sa.Column('origin', sa.Text(), nullable=False),
+    sa.Column('added', postgresql.TIMESTAMP(timezone=True), server_default=sa.text('now()'), nullable=False),
+    sa.PrimaryKeyConstraint('associated_accession_id'),
+    schema='uta'
+    )
+    op.create_index('associated_accessions_tx_ac', 'associated_accessions', ['tx_ac'], unique=False, schema='uta')
+    op.create_index('associated_accessions_pro_ac', 'associated_accessions', ['pro_ac'], unique=False, schema='uta')
+    op.create_index('unique_pair_in_origin', 'associated_accessions', ['origin', 'tx_ac', 'pro_ac'], unique=True, schema='uta')
+    op.create_table_comment('associated_accessions', 'transcript-protein accession pairs associated in source databases', schema='uta')
+    # ### end custom commands ###
+
+    # ### custom SQL to add views to match the initial UTA database schema 1.1 ###
+    op.execute("""
+        CREATE VIEW _cds_exons_v AS
+        WITH cds_exons as (
+            SELECT ES.exon_set_id, T.ac AS tx_ac, E.ord,
+                E.start_i, E.end_i,
+                CASE WHEN E.end_i >= T.cds_start_i AND E.start_i <= T.cds_end_i THEN greatest(E.start_i,T.cds_start_i) ELSE NULL end AS cds_ex_start_i,
+                CASE WHEN E.end_i >= T.cds_start_i AND E.start_i <= T.cds_end_i THEN least(E.end_i,T.cds_end_i) ELSE NULL end AS cds_ex_end_i
+            FROM transcript T
+            JOIN exon_set ES ON T.ac = ES.tx_ac AND 
ES.alt_aln_METHOD = 'transcript' + JOIN exon E ON ES.exon_set_id=E.exon_set_id + WHERE T.cds_start_i IS NOT NULL AND T.cds_end_i IS NOT NULL + ) + select *, end_i - start_i as ex_len, cds_ex_end_i - cds_ex_start_i as cds_ex_len from cds_exons; + """) + op.execute(""" + CREATE VIEW _cds_exons_flat_v AS + SELECT exon_set_id,tx_ac,MIN(ord) AS cds_start_exon,MAX(ord) AS cds_end_exon, + ARRAY_TO_STRING(ARRAY_AGG(format('%s,%s',cds_ex_start_i,cds_ex_end_i) ORDER BY ord),';') AS cds_se_i, + ARRAY_TO_STRING(ARRAY_AGG(cds_ex_len ORDER BY ord),';') AS cds_exon_lengths + FROM _cds_exons_v + WHERE cds_ex_start_i IS NOT NULL + GROUP BY exon_set_id, tx_ac; + """) + op.execute(""" + CREATE VIEW _seq_anno_most_recent AS + SELECT DISTINCT ON (ac) * + FROM seq_anno + ORDER BY ac,added DESC; + """) + op.execute(""" + CREATE VIEW _cds_exons_fp_v AS + SELECT SA.seq_id, md5(format('%s;%s',LOWER(SA.seq_id),CTEF.cds_se_i)) AS cds_es_fp, + md5(cds_exon_lengths) AS cds_exon_lengths_fp, CTEF.* + FROM _cds_exons_flat_v CTEF + JOIN _seq_anno_most_recent SA ON CTEF.tx_ac=SA.ac; + """) + op.execute(""" + CREATE VIEW _discontiguous_tx AS + SELECT t.hgnc, + es.exon_set_id, + es.tx_ac, + format('[%s-%s]'::text, e1.end_i, e2.start_i) AS gap, + e1.exon_id AS e1_exon_id, + e1.ord AS e1_ord, + e1.start_i AS e1_start_i, + e1.end_i AS e1_end_i, + e2.exon_id AS e2_exon_id, + e2.ord AS e2_ord, + e2.start_i AS e2_start_i, + e2.end_i AS e2_end_i + FROM exon_set es + LEFT JOIN transcript t ON es.tx_ac = t.ac + JOIN exon e1 ON es.exon_set_id = e1.exon_set_id + JOIN exon e2 ON es.exon_set_id = e2.exon_set_id AND e2.ord = (e1.ord + 1) AND e1.end_i <> e2.start_i + WHERE es.alt_aln_method = 'transcript'::text; + """) + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT T.hgnc,TES.exon_SET_id AS tes_exon_SET_id,AES.exon_SET_id AS aes_exon_SET_id, + TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,AES.alt_strand,AES.alt_aln_method, + TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,AEX.exon_id AS alt_exon_id, + TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, AEX.start_i AS alt_start_i,AEX.END_i AS alt_END_i, + EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_aln_v AS + SELECT T.hgnc,T.ac as tx_ac,AES.alt_ac,AES.alt_aln_method,AES.alt_strand, + TE.ord, TE.start_i as tx_start_i,TE.end_i as tx_end_i, + AE.start_i as alt_start_i, AE.end_i as alt_end_i, + EA.cigar, EA.tx_aseq, EA.alt_aseq, + TES.exon_set_id AS tx_exon_set_id,AES.exon_set_id as alt_exon_set_id, + TE.exon_id as tx_exon_id, AE.exon_id as alt_exon_id, + EA.exon_aln_id + FROM transcript T + JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' + JOIN exon_set AES on T.ac=AES.tx_ac and AES.alt_aln_method!='transcript' + JOIN exon TE ON TES.exon_set_id=TE.exon_set_id + JOIN exon AE ON AES.exon_set_id=AE.exon_set_id AND TE.ord=AE.ord + LEFT JOIN exon_aln EA ON TE.exon_id=EA.tx_exon_id AND AE.exon_id=EA.alt_exon_id; + """) + op.execute(""" + CREATE VIEW exon_set_exons_v AS + SELECT ES.*,EL.n_exons,EL.se_i,EL.starts_i,EL.ends_i,EL.lengths + FROM exon_set ES + JOIN (SELECT + iES.exon_set_id, + count(*) AS n_exons, + array_to_string(array_agg(format('%s,%s',iE.start_i,iE.end_i) ORDER 
BY iE.ord),';') AS se_i,
+                array_agg(iE.start_i ORDER BY iE.ord) AS starts_i,
+                array_agg(iE.end_i ORDER BY iE.ord) AS ends_i,
+                array_agg((iE.end_i-iE.start_i) ORDER BY iE.ord) AS lengths
+            FROM exon_set iES
+            JOIN exon iE ON iES.exon_set_id=iE.exon_set_id
+            GROUP BY iES.exon_set_id) EL
+        ON ES.exon_set_id = EL.exon_set_id;
+    """)
+    op.execute("""
+        COMMENT ON VIEW exon_set_exons_v IS 'defining view of "flat" (aggregated) exons on a sequence; use _mv; for faster materialized version';
+    """)
+    op.execute("""
+        CREATE VIEW exon_set_exons_fp_v AS
+        SELECT ESE.*,md5(format('%s;%s',lower(ASA.seq_id),ESE.se_i)) AS es_fingerprint
+        FROM exon_set_exons_v ESE
+        JOIN _seq_anno_most_recent ASA ON ESE.alt_ac=ASA.ac;
+    """)
+    op.execute("""
+        COMMENT ON VIEW exon_set_exons_fp_v IS 'flattened (aggregated) exons with exon set fingerprint';
+    """)
+    op.execute("""
+        CREATE MATERIALIZED VIEW exon_set_exons_fp_mv AS SELECT * FROM exon_set_exons_fp_v WITH NO DATA;
+        CREATE INDEX exon_set_exons_fp_mv_tx_ac_ix ON exon_set_exons_fp_mv(tx_ac);
+        CREATE INDEX exon_set_exons_fp_mv_alt_ac_ix ON exon_set_exons_fp_mv(alt_ac);
+        CREATE INDEX exon_set_exons_fp_mv_alt_aln_method_ix ON exon_set_exons_fp_mv(alt_aln_method);
+        GRANT SELECT ON exon_set_exons_fp_mv TO public;
+    """)
+    op.execute("""
+        CREATE OR replace VIEW tx_exon_set_summary_dv AS
+        SELECT hgnc,cds_md5,es_fingerprint,tx_ac,alt_ac,alt_aln_method,alt_strand,exon_set_id,n_exons,se_i,starts_i,ends_i,lengths
+        FROM transcript T
+        JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac;
+    """)
+    op.execute("""
+        CREATE MATERIALIZED VIEW tx_exon_set_summary_mv AS SELECT * FROM tx_exon_set_summary_dv WITH NO DATA;
+        CREATE INDEX tx_exon_set_summary_mv_cds_md5_ix ON tx_exon_set_summary_mv(cds_md5);
+        CREATE INDEX tx_exon_set_summary_mv_es_fingerprint_ix ON tx_exon_set_summary_mv(es_fingerprint);
+        CREATE INDEX tx_exon_set_summary_mv_tx_ac_ix ON tx_exon_set_summary_mv(tx_ac);
+        CREATE INDEX tx_exon_set_summary_mv_alt_ac_ix ON tx_exon_set_summary_mv(alt_ac);
+        CREATE INDEX tx_exon_set_summary_mv_alt_aln_method_ix ON tx_exon_set_summary_mv(alt_aln_method);
+        GRANT SELECT ON tx_exon_set_summary_mv TO public;
+    """)
+    op.execute("""
+        CREATE VIEW tx_def_summary_dv AS
+        SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand,
+            TESS.hgnc, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, CEF.cds_exon_lengths_fp,
+            TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, TESS.ends_i, TESS.lengths,
+            T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon
+        FROM tx_exon_set_summary_mv TESS
+        JOIN transcript T ON TESS.tx_ac=T.ac
+        LEFT JOIN _cds_exons_fp_v CEF ON TESS.exon_set_id=CEF.exon_set_id
+        WHERE TESS.alt_aln_method = 'transcript';
+    """)
+    op.execute("""
+        COMMENT ON VIEW tx_def_summary_dv IS 'transcript definitions, with exon structures';
+    """)
+    op.execute("""
+        CREATE MATERIALIZED VIEW tx_def_summary_mv AS SELECT * FROM tx_def_summary_dv WITH NO DATA;
+    """)
+    op.execute("""
+        COMMENT ON MATERIALIZED VIEW tx_def_summary_mv IS 'transcript definitions, with exon structures and fingerprints';
+    """)
+    op.execute("""
+        create index tx_def_summary_mv_tx_ac on tx_def_summary_mv (tx_ac);
+        create index tx_def_summary_mv_alt_ac on tx_def_summary_mv (alt_ac);
+        create index tx_def_summary_mv_alt_aln_method on tx_def_summary_mv (alt_aln_method);
+        create index tx_def_summary_mv_hgnc on tx_def_summary_mv (hgnc);
+    """)
+    op.execute("""
+        CREATE VIEW 
tx_def_summary_v AS
+        SELECT * FROM tx_def_summary_mv;
+    """)
+    op.execute("""
+        CREATE OR REPLACE VIEW tx_similarity_v AS
+        SELECT DISTINCT
+            D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2,
+            D1.hgnc = D2.hgnc as hgnc_eq,
+            D1.cds_md5=D2.cds_md5 as cds_eq,
+            D1.es_fingerprint=D2.es_fingerprint as es_fp_eq,
+            D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq,
+            D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq
+        FROM tx_def_summary_mv D1
+        JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac
+            and (D1.hgnc=D2.hgnc
+                or D1.cds_md5=D2.cds_md5
+                or D1.es_fingerprint=D2.es_fingerprint
+                or D1.cds_es_fp=D2.cds_es_fp
+                or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp
+            ));
+    """)
+    # ### end custom SQL commands ###
+
+
+def downgrade() -> None:
+    # ### custom SQL to remove views ###
+    op.execute("DROP VIEW tx_similarity_v CASCADE;")
+    op.execute("DROP VIEW tx_def_summary_v CASCADE;")
+    op.execute("DROP INDEX tx_def_summary_mv_hgnc CASCADE")
+    op.execute("DROP INDEX tx_def_summary_mv_alt_aln_method CASCADE")
+    op.execute("DROP INDEX tx_def_summary_mv_alt_ac CASCADE")
+    op.execute("DROP INDEX tx_def_summary_mv_tx_ac CASCADE")
+    op.execute("DROP MATERIALIZED VIEW tx_def_summary_mv CASCADE;")
+    op.execute("DROP VIEW tx_def_summary_dv CASCADE;")
+    op.execute("DROP MATERIALIZED VIEW tx_exon_set_summary_mv CASCADE;")
+    op.execute("DROP VIEW tx_exon_set_summary_dv CASCADE;")
+    op.execute("DROP MATERIALIZED VIEW exon_set_exons_fp_mv CASCADE;")
+    op.execute("DROP VIEW exon_set_exons_fp_v CASCADE;")
+    op.execute("DROP VIEW exon_set_exons_v CASCADE;")
+    op.execute("DROP VIEW tx_exon_aln_v CASCADE;")
+    op.execute("DROP VIEW tx_alt_exon_pairs_v CASCADE;")
+    op.execute("DROP VIEW _discontiguous_tx CASCADE;")
+    op.execute("DROP VIEW _cds_exons_fp_v CASCADE;")
+    op.execute("DROP VIEW _seq_anno_most_recent CASCADE;")
+    op.execute("DROP VIEW _cds_exons_flat_v CASCADE;")
+    op.execute("DROP VIEW _cds_exons_v CASCADE;")
+    # ### end custom SQL commands ###
+
+    # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f('ix_uta_exon_aln_tx_exon_id'), table_name='exon_aln', schema='uta') + op.drop_index(op.f('ix_uta_exon_aln_alt_exon_id'), table_name='exon_aln', schema='uta') + op.drop_table('exon_aln', schema='uta') + op.drop_index(op.f('ix_uta_exon_exon_set_id'), table_name='exon', schema='uta') + op.drop_table('exon', schema='uta') + op.drop_table('exon_set', schema='uta') + op.drop_index(op.f('ix_uta_transcript_origin_id'), table_name='transcript', schema='uta') + op.drop_index(op.f('ix_uta_transcript_cds_md5'), table_name='transcript', schema='uta') + op.drop_table('transcript', schema='uta') + op.drop_index('seq_anno_ac_unique_in_origin', table_name='seq_anno', schema='uta') + op.drop_index(op.f('ix_uta_seq_anno_seq_id'), table_name='seq_anno', schema='uta') + op.drop_index(op.f('ix_uta_seq_anno_ac'), table_name='seq_anno', schema='uta') + op.drop_table('seq_anno', schema='uta') + op.drop_table('seq', schema='uta') + op.drop_table('origin', schema='uta') + op.drop_table('meta', schema='uta') + op.drop_table('gene', schema='uta') + # ### end Alembic commands ### + + # ### custom commands to remove items not autogenerated by Alembic ### + op.drop_index('unique_pair_in_origin', table_name='associated_accessions') + op.drop_index('associated_accessions_pro_ac', table_name='associated_accessions') + op.drop_index('associated_accessions_tx_ac', table_name='associated_accessions') + op.drop_table_comment('associated_accessions', existing_comment='transcript-protein accession pairs associated in source databases', schema='uta') + op.drop_table('associated_accessions') + # ### end custom commands ### \ No newline at end of file diff --git a/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py b/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py new file mode 100644 index 0000000..1548b95 --- /dev/null +++ b/src/alembic/versions/f85dd97bd9f5_set_gene_id_and_primary_and_foreign_keys.py @@ -0,0 +1,305 @@ +"""set gene_id and primary and foreign keys + +Revision ID: f85dd97bd9f5 +Revises: 595a586e6de7 +Create Date: 2024-04-10 22:14:14.055461 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'f85dd97bd9f5' +down_revision: Union[str, None] = '595a586e6de7' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column( + "gene", "gene_id", existing_type=sa.TEXT(), nullable=False, schema="uta" + ) + op.create_primary_key("gene_pkey", "gene", ["gene_id"], schema="uta") + op.create_index(op.f("ix_uta_gene_hgnc"), "gene", ["hgnc"], unique=False, schema="uta") + op.alter_column( + "transcript", "gene_id", existing_type=sa.TEXT(), nullable=False, schema="uta" + ) + op.create_index( + op.f("ix_uta_transcript_gene_id"), + "transcript", + ["gene_id"], + unique=False, + schema="uta", + ) + op.create_foreign_key(None, 'transcript', 'gene', ['gene_id'], ['gene_id'], source_schema='uta', referent_schema='uta') + # ### end Alembic commands ### + + # ### handle first part of hgnc -> gene_symbol column rename ### + op.add_column("gene", sa.Column("symbol", sa.Text(), nullable=True), schema="uta") + op.create_index(op.f("ix_uta_gene_symbol"), "gene", ["symbol"], unique=False, schema="uta") + op.execute("UPDATE gene SET symbol = hgnc;") + op.alter_column('gene', 'symbol', + existing_type=sa.TEXT(), + nullable=False, + schema='uta') + # ### end of hgnc -> gene_symbol column rename ### + + # ### updates required to existing views needed to drop hgnc from transcript. ### + op.execute("DROP VIEW IF EXISTS tx_similarity_v CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute("DROP VIEW IF EXISTS _discontiguous_tx CASCADE;") + op.execute(""" + CREATE VIEW _discontiguous_tx AS + SELECT g.symbol, + g.symbol as hgnc, + g.gene_id, + es.exon_set_id, + es.tx_ac, + format('[%s-%s]'::text, e1.end_i, e2.start_i) AS gap, + e1.exon_id AS e1_exon_id, + e1.ord AS e1_ord, + e1.start_i AS e1_start_i, + e1.end_i AS e1_end_i, + e2.exon_id AS e2_exon_id, + e2.ord AS e2_ord, + e2.start_i AS e2_start_i, + e2.end_i AS e2_end_i + FROM exon_set es + JOIN transcript t ON es.tx_ac = t.ac + JOIN gene as g ON t.gene_id = g.gene_id + JOIN exon e1 ON es.exon_set_id = e1.exon_set_id + JOIN exon e2 ON es.exon_set_id = e2.exon_set_id AND e2.ord = (e1.ord + 1) AND e1.end_i <> e2.start_i + WHERE es.alt_aln_method = 'transcript'::text; + """) + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT g.symbol, g.symbol as hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id, + AES.exon_SET_id AS aes_exon_SET_id, TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac, + AES.alt_strand,AES.alt_aln_method, TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id, + AEX.exon_id AS alt_exon_id, TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, + AEX.start_i AS alt_start_i, AEX.END_i AS alt_END_i, EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN gene g ON t.gene_id=g.gene_id + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_aln_v AS + SELECT G.symbol, G.symbol AS hgnc, G.gene_id, T.ac as tx_ac, AES.alt_ac, + AES.alt_aln_method,AES.alt_strand, TE.ord, TE.start_i as tx_start_i, + TE.end_i as tx_end_i, AE.start_i as alt_start_i, AE.end_i as alt_end_i, 
+ EA.cigar, EA.tx_aseq, EA.alt_aseq, TES.exon_set_id AS tx_exon_set_id, + AES.exon_set_id as alt_exon_set_id, TE.exon_id as tx_exon_id, + AE.exon_id as alt_exon_id, EA.exon_aln_id + FROM transcript T + JOIN gene G ON T.gene_id=G.gene_id + JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' + JOIN exon_set AES on T.ac=AES.tx_ac and AES.alt_aln_method!='transcript' + JOIN exon TE ON TES.exon_set_id=TE.exon_set_id + JOIN exon AE ON AES.exon_set_id=AE.exon_set_id AND TE.ord=AE.ord + LEFT JOIN exon_aln EA ON TE.exon_id=EA.tx_exon_id AND AE.exon_id=EA.alt_exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_set_summary_dv AS + SELECT G.symbol, G.symbol as hgnc, G.gene_id, cds_md5, es_fingerprint, tx_ac, alt_ac, + alt_aln_method, alt_strand, exon_set_id, n_exons, se_i, starts_i, ends_i, lengths + FROM transcript T + JOIN gene G ON T.gene_id=G.gene_id + JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_exon_set_summary_mv AS SELECT * FROM tx_exon_set_summary_dv WITH NO DATA; + CREATE INDEX tx_exon_set_summary_mv_cds_md5_ix ON tx_exon_set_summary_mv(cds_md5); + CREATE INDEX tx_exon_set_summary_mv_es_fingerprint_ix ON tx_exon_set_summary_mv(es_fingerprint); + CREATE INDEX tx_exon_set_summary_mv_tx_ac_ix ON tx_exon_set_summary_mv(tx_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_ac_ix ON tx_exon_set_summary_mv(alt_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_aln_method_ix ON tx_exon_set_summary_mv(alt_aln_method); + GRANT SELECT ON tx_exon_set_summary_mv TO public; + REFRESH MATERIALIZED VIEW tx_exon_set_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_def_summary_dv AS + SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand, + TESS.symbol, TESS.hgnc, TESS.gene_id, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, + CEF.cds_exon_lengths_fp, TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, + TESS.ends_i, TESS.lengths, T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon + FROM tx_exon_set_summary_mv TESS + JOIN transcript T ON TESS.tx_ac=T.ac + LEFT JOIN _cds_exons_fp_v CEF ON TESS.exon_set_id=CEF.exon_set_id + WHERE TESS.alt_aln_method = 'transcript'; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_def_summary_mv AS SELECT * FROM tx_def_summary_dv WITH NO DATA; + CREATE INDEX tx_def_summary_mv_tx_ac ON tx_def_summary_mv (tx_ac); + CREATE INDEX tx_def_summary_mv_alt_ac ON tx_def_summary_mv (alt_ac); + CREATE INDEX tx_def_summary_mv_alt_aln_method ON tx_def_summary_mv (alt_aln_method); + CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc); + CREATE INDEX tx_def_summary_mv_symbol ON tx_def_summary_mv (symbol); + CREATE INDEX tx_def_summary_mv_gene_id ON tx_def_summary_mv (gene_id); + REFRESH MATERIALIZED VIEW tx_def_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_similarity_v AS + SELECT DISTINCT + D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2, + D1.hgnc = D2.hgnc as hgnc_eq, + D1.symbol = D2.symbol as symbol_eq, + D1.cds_md5=D2.cds_md5 as cds_eq, + D1.es_fingerprint=D2.es_fingerprint as es_fp_eq, + D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq, + D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq + FROM tx_def_summary_mv D1 + JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac + and (D1.symbol=D2.symbol + or D1.cds_md5=D2.cds_md5 + or D1.es_fingerprint=D2.es_fingerprint + or D1.cds_es_fp=D2.cds_es_fp + or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp + )); + """) + # ### end of updates to existing views ### + + +def downgrade() -> None: + # 
### commands to downgrade views before adding hgnc to transcript ### + op.execute("DROP VIEW IF EXISTS tx_similarity_v CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_def_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_def_summary_dv CASCADE;") + op.execute("DROP MATERIALIZED VIEW IF EXISTS tx_exon_set_summary_mv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_set_summary_dv CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_exon_aln_v CASCADE;") + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute("DROP VIEW IF EXISTS _discontiguous_tx CASCADE;") + op.execute(""" + CREATE VIEW _discontiguous_tx AS + SELECT t.hgnc, + es.exon_set_id, + es.tx_ac, + format('[%s-%s]'::text, e1.end_i, e2.start_i) AS gap, + e1.exon_id AS e1_exon_id, + e1.ord AS e1_ord, + e1.start_i AS e1_start_i, + e1.end_i AS e1_end_i, + e2.exon_id AS e2_exon_id, + e2.ord AS e2_ord, + e2.start_i AS e2_start_i, + e2.end_i AS e2_end_i + FROM exon_set es + JOIN transcript t ON es.tx_ac = t.ac + JOIN exon e1 ON es.exon_set_id = e1.exon_set_id + JOIN exon e2 ON es.exon_set_id = e2.exon_set_id AND e2.ord = (e1.ord + 1) AND e1.end_i <> e2.start_i + WHERE es.alt_aln_method = 'transcript'::text; + """) + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT t.hgnc,TES.exon_SET_id AS tes_exon_SET_id,AES.exon_SET_id AS aes_exon_SET_id, + TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,AES.alt_strand,AES.alt_aln_method, + TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,AEX.exon_id AS alt_exon_id, + TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, AEX.start_i AS alt_start_i,AEX.END_i AS alt_END_i, + EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_aln_v AS + SELECT T.hgnc,T.ac as tx_ac,AES.alt_ac,AES.alt_aln_method,AES.alt_strand, + TE.ord, TE.start_i as tx_start_i,TE.end_i as tx_end_i, + AE.start_i as alt_start_i, AE.end_i as alt_end_i, + EA.cigar, EA.tx_aseq, EA.alt_aseq, + TES.exon_set_id AS tx_exon_set_id,AES.exon_set_id as alt_exon_set_id, + TE.exon_id as tx_exon_id, AE.exon_id as alt_exon_id, + EA.exon_aln_id + FROM transcript T + JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' + JOIN exon_set AES on T.ac=AES.tx_ac and AES.alt_aln_method!='transcript' + JOIN exon TE ON TES.exon_set_id=TE.exon_set_id + JOIN exon AE ON AES.exon_set_id=AE.exon_set_id AND TE.ord=AE.ord + LEFT JOIN exon_aln EA ON TE.exon_id=EA.tx_exon_id AND AE.exon_id=EA.alt_exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_set_summary_dv AS + SELECT T.hgnc,cds_md5,es_fingerprint,tx_ac,alt_ac,alt_aln_method,alt_strand,exon_set_id,n_exons,se_i,starts_i,ends_i,lengths + FROM transcript T + JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_exon_set_summary_mv AS SELECT * FROM tx_exon_set_summary_dv WITH NO DATA; + CREATE INDEX tx_exon_set_summary_mv_cds_md5_ix ON tx_exon_set_summary_mv(cds_md5); + CREATE INDEX tx_exon_set_summary_mv_es_fingerprint_ix ON tx_exon_set_summary_mv(es_fingerprint); + CREATE INDEX tx_exon_set_summary_mv_tx_ac_ix ON tx_exon_set_summary_mv(tx_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_ac_ix ON tx_exon_set_summary_mv(alt_ac); + 
CREATE INDEX tx_exon_set_summary_mv_alt_aln_method_ix ON tx_exon_set_summary_mv(alt_aln_method); + GRANT SELECT ON tx_exon_set_summary_mv TO public; + REFRESH MATERIALIZED VIEW tx_exon_set_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_def_summary_dv AS + SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand, + TESS.hgnc, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, + CEF.cds_exon_lengths_fp, TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, + TESS.ends_i, TESS.lengths, T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon + FROM tx_exon_set_summary_mv TESS + JOIN transcript T ON TESS.tx_ac=T.ac + LEFT JOIN _cds_exons_fp_v CEF ON TESS.exon_set_id=CEF.exon_set_id + WHERE TESS.alt_aln_method = 'transcript'; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_def_summary_mv AS SELECT * FROM tx_def_summary_dv WITH NO DATA; + CREATE INDEX tx_def_summary_mv_tx_ac ON tx_def_summary_mv (tx_ac); + CREATE INDEX tx_def_summary_mv_alt_ac ON tx_def_summary_mv (alt_ac); + CREATE INDEX tx_def_summary_mv_alt_aln_method ON tx_def_summary_mv (alt_aln_method); + CREATE INDEX tx_def_summary_mv_hgnc ON tx_def_summary_mv (hgnc); + REFRESH MATERIALIZED VIEW tx_def_summary_mv; + """) + op.execute(""" + CREATE VIEW tx_similarity_v AS + SELECT DISTINCT + D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2, + D1.hgnc = D2.hgnc as hgnc_eq, + D1.cds_md5=D2.cds_md5 as cds_eq, + D1.es_fingerprint=D2.es_fingerprint as es_fp_eq, + D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq, + D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq + FROM tx_def_summary_mv D1 + JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac + and (D1.hgnc=D2.hgnc + or D1.cds_md5=D2.cds_md5 + or D1.es_fingerprint=D2.es_fingerprint + or D1.cds_es_fp=D2.cds_es_fp + or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp + )); + """) + # ### end of updates to views ### + + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint(None, 'transcript', schema='uta', type_='foreignkey') + op.drop_index(op.f("ix_uta_transcript_gene_id"), table_name="transcript", schema="uta") + op.alter_column("transcript", "gene_id", + existing_type=sa.TEXT(), + nullable=True, + schema="uta") + op.drop_index(op.f("ix_uta_gene_hgnc"), table_name="gene", schema="uta") + op.drop_constraint("gene_pkey", "gene", schema="uta") + op.alter_column("gene", "gene_id", + existing_type=sa.TEXT(), + nullable=True, + schema="uta") + op.drop_index(op.f("ix_uta_gene_symbol"), table_name="gene", schema="uta") + op.drop_column("gene", "symbol", schema="uta") + # ### end Alembic commands ### diff --git a/src/alembic/versions/f885cb84efce_update_tx_alt_exon_pairs_v.py b/src/alembic/versions/f885cb84efce_update_tx_alt_exon_pairs_v.py new file mode 100644 index 0000000..5ecaa9a --- /dev/null +++ b/src/alembic/versions/f885cb84efce_update_tx_alt_exon_pairs_v.py @@ -0,0 +1,57 @@ +"""update tx_alt_exon_pairs_v + +Revision ID: f885cb84efce +Revises: 14eed54ff90d +Create Date: 2024-05-07 21:01:03.693969 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'f885cb84efce' +down_revision: Union[str, None] = '14eed54ff90d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT g.symbol, g.symbol as hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id, + AES.exon_SET_id AS aes_exon_SET_id, TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac, + AES.alt_strand,AES.alt_aln_method, TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id, + AEX.exon_id AS alt_exon_id, TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, + AEX.start_i AS alt_start_i, AEX.END_i AS alt_END_i, EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN gene g ON t.gene_id=g.gene_id + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method !~ 'transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + + +def downgrade() -> None: + op.execute("DROP VIEW IF EXISTS tx_alt_exon_pairs_v CASCADE;") + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT g.symbol, g.symbol as hgnc, g.gene_id,TES.exon_SET_id AS tes_exon_SET_id, + AES.exon_SET_id AS aes_exon_SET_id, TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac, + AES.alt_strand,AES.alt_aln_method, TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id, + AEX.exon_id AS alt_exon_id, TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, + AEX.start_i AS alt_start_i, AEX.END_i AS alt_END_i, EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN gene g ON t.gene_id=g.gene_id + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + diff --git a/src/uta/__init__.py b/src/uta/__init__.py index ff05e64..eb783bc 100644 --- a/src/uta/__init__.py +++ b/src/uta/__init__.py @@ -24,7 +24,7 @@ def connect(db_url=default_db_url): """ - Connect to a UTA database instance and return a UTA0 interface instance. + Connect to a UTA database instance and return a sqlalchemy Session. When called with an explicit db_url argument, that db_url is used for connecting. diff --git a/src/uta/cli.py b/src/uta/cli.py index 4b85440..bda63b0 100644 --- a/src/uta/cli.py +++ b/src/uta/cli.py @@ -6,6 +6,7 @@ uta (-C CONF ...) [options] shell uta (-C CONF ...) [options] drop-schema uta (-C CONF ...) [options] create-schema + uta (-C CONF ...) [options] update-meta-data uta (-C CONF ...) [options] load-sql FILES ... uta (-C CONF ...) [options] rebuild uta (-C CONF ...) [options] load-origin FILE @@ -13,13 +14,14 @@ uta (-C CONF ...) [options] load-geneinfo FILE uta (-C CONF ...) [options] load-txinfo FILE uta (-C CONF ...) [options] load-exonset FILE + uta (-C CONF ...) [options] load-assoc-ac FILE uta (-C CONF ...) [options] load-sequences uta (-C CONF ...) [options] align-exons [--sql SQL] uta (-C CONF ...) [options] load-ncbi-seqgene FILE uta (-C CONF ...) [options] grant-permissions uta (-C CONF ...) [options] refresh-matviews uta (-C CONF ...) 
[options] analyze - + Options: -C CONF, --conf CONF Configuration to read (required) @@ -67,8 +69,10 @@ def main(): ("align-exons", ul.align_exons), ("analyze", ul.analyze), ("create-schema", ul.create_schema), + ("update-meta-data", ul.update_meta_data), ("drop-schema", ul.drop_schema), ("grant-permissions", ul.grant_permissions), + ("load-assoc-ac", ul.load_assoc_ac), ("load-exonset", ul.load_exonset), ("load-geneinfo", ul.load_geneinfo), ("load-origin", ul.load_origin), @@ -118,13 +122,10 @@ def main(): cmd=cmd, elapsed=time.time() - t0)) - if __name__ == "__main__": main() - - # # Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta) ## diff --git a/src/uta/exceptions.py b/src/uta/exceptions.py index 36454a2..4691ed8 100644 --- a/src/uta/exceptions.py +++ b/src/uta/exceptions.py @@ -17,6 +17,15 @@ class InvalidIntervalError(UTAError): class InvalidHGVSVariantError(UTAError): pass + +class EutilsDownloadError(Exception): + pass + + +class ExonStructureMismatchError(UTAError): + pass + + # # Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta) ## diff --git a/src/uta/formats/geneaccessions.py b/src/uta/formats/geneaccessions.py index 9837d82..3b5985b 100644 --- a/src/uta/formats/geneaccessions.py +++ b/src/uta/formats/geneaccessions.py @@ -3,7 +3,7 @@ class GeneAccessions(recordtype.recordtype('GeneAccessions', - ['hgnc', 'tx_ac', 'gene_id', 'pro_ac', 'origin'])): + ['gene_symbol', 'tx_ac', 'gene_id', 'pro_ac', 'origin'])): pass diff --git a/src/uta/formats/geneinfo.py b/src/uta/formats/geneinfo.py index f4a6015..e094adf 100644 --- a/src/uta/formats/geneinfo.py +++ b/src/uta/formats/geneinfo.py @@ -5,7 +5,7 @@ class GeneInfo(recordtype.recordtype('GeneInfo', - ['gene_id', 'tax_id', 'hgnc', 'maploc', 'aliases', 'type', 'summary', 'descr', 'xrefs'])): + ['gene_id', 'gene_symbol', 'tax_id', 'hgnc', 'maploc', 'aliases', 'type', 'summary', 'descr', 'xrefs'])): pass @@ -38,7 +38,6 @@ def __next__(self): return GeneInfo(**d) - if __name__ == '__main__': tmpfn = '/tmp/exonset' diff --git a/src/uta/formats/txinfo.py b/src/uta/formats/txinfo.py index f8d6cf5..42660a4 100644 --- a/src/uta/formats/txinfo.py +++ b/src/uta/formats/txinfo.py @@ -1,10 +1,36 @@ import csv import recordtype +from typing import List, Optional -class TxInfo(recordtype.recordtype('TxInfo', - ['origin', 'ac', 'hgnc', 'cds_se_i', 'exons_se_i'])): - pass +# transl_except should be a semicolon-separated list: +# (pos:333..335,aa:Sec);(pos:1017,aa:TERM) +class TxInfo( + recordtype.recordtype( + 'TxInfo', + ['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i', 'codon_table', 'transl_except'], +)): + + @staticmethod + def serialize_transl_except(transl_except_list: Optional[List[str]]) -> Optional[str]: + """Helper for formatting transl_except list as a string.""" + if transl_except_list is None: + return None + else: + return ";".join(transl_except_list) + + @staticmethod + def serialize_cds_se_i(cds_se_i: Optional[tuple]) -> Optional[str]: + """Helper for formatting cds_se_i tuple as a string.""" + if cds_se_i is None: + return None + else: + return "{},{}".format(*cds_se_i) + + @staticmethod + def serialize_exons_se_i(exons_se_i: List[tuple]) -> str: + """Helper for formatting exons_se_i list as a string.""" + return ";".join(["{},{}".format(*ese) for ese in exons_se_i]) class TxInfoWriter(csv.DictWriter): diff --git a/src/uta/loading.py b/src/uta/loading.py index 2bc9b5e..3301c40 100644 --- a/src/uta/loading.py +++ b/src/uta/loading.py @@ -7,16 +7,19 @@ import itertools import 
logging import time +from typing import Any, Dict, List from biocommons.seqrepo import SeqRepo from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND from bioutils.digests import seq_md5 from bioutils.sequences import reverse_complement from sqlalchemy.exc import IntegrityError +from sqlalchemy.orm import Session from sqlalchemy.orm.exc import NoResultFound +from sqlalchemy import text import psycopg2.extras import six -import uta_align.align.algorithms as utaaa +from uta_align.align.algorithms import cigar_alignment, needleman_wunsch_gotoh_align from uta.lru_cache import lru_cache @@ -27,6 +30,7 @@ import uta.formats.txinfo as ufti import uta.parsers.geneinfo import uta.parsers.seqgene +from uta.exceptions import ExonStructureMismatchError usam = uta.models @@ -46,10 +50,10 @@ def _get_cursor(con): return cur def align(s1, s2): - score, cigar = utaaa.needleman_wunsch_gotoh_align(s1.encode("ascii"), - s2.encode("ascii"), - extended_cigar=True) - tx_aseq, alt_aseq = utaaa.cigar_alignment( + score, cigar = needleman_wunsch_gotoh_align(s1.encode("ascii"), + s2.encode("ascii"), + extended_cigar=True) + tx_aseq, alt_aseq = cigar_alignment( tx_seq, alt_seq, cigar, hide_match=False) return tx_aseq.decode("ascii"), alt_aseq.decode("ascii"), cigar.to_string().decode("ascii") @@ -150,27 +154,27 @@ def _fetch_seq(ac, s, e): def analyze(session, opts, cf): - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) cmds = [ "analyze verbose" ] for cmd in cmds: logger.info(cmd) - session.execute(cmd) + session.execute(text(cmd)) session.commit() def create_schema(session, opts, cf): """Create and populate initial schema""" - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) if session.bind.name == "postgresql" and usam.use_schema: - session.execute("create schema " + usam.schema_name) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("create schema " + usam.schema_name)) + session.execute(text("set search_path = " + usam.schema_name)) session.commit() usam.Base.metadata.create_all(session.bind) @@ -184,11 +188,38 @@ def create_schema(session, opts, cf): logger.info("created schema") +def update_meta_data(session, opts, cf): + """Update Meta table with schema version""" + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) + + # check if schema version is up-to-date + md_schema_version = session.query(usam.Meta).filter_by(key="schema_version").one() + if md_schema_version.value != usam.schema_version: + logger.info(f"updating schema version from {md_schema_version.value} to {usam.schema_version}") + md_schema_version.value = usam.schema_version + session.commit() + else: + logger.info(f"schema version {md_schema_version.value} is already up-to-date") + + # set updated on + md_updated_on = session.query(usam.Meta).filter_by(key="updated on").one_or_none() + if md_updated_on is None: + session.add(usam.Meta(key="updated on", 
value=datetime.datetime.now().isoformat())) + session.commit() + logger.info("added updated on") + else: + md_updated_on.value = datetime.datetime.now().isoformat() + session.commit() + logger.info("updated updated on") + + def drop_schema(session, opts, cf): if session.bind.name == "postgresql" and usam.use_schema: session.execute( - "set role {admin_role};".format(admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + text("set role {admin_role};".format(admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) ddl = "drop schema if exists " + usam.schema_name + " cascade" session.execute(ddl) @@ -199,9 +230,9 @@ def drop_schema(session, opts, cf): def grant_permissions(session, opts, cf): schema = usam.schema_name - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) cmds = [ # alter db doesn't belong here, and probably better to avoid the implicit behevior this encourages @@ -211,60 +242,134 @@ def grant_permissions(session, opts, cf): sql = "select concat(schemaname,'.',tablename) as fqrn from pg_tables where schemaname='{schema}'".format( schema=schema) - rows = list(session.execute(sql)) + rows = list(session.execute(text(sql))) cmds += ["grant select on {fqrn} to PUBLIC".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] cmds += ["alter table {fqrn} owner to uta_admin".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] sql = "select concat(schemaname,'.',viewname) as fqrn from pg_views where schemaname='{schema}'".format( schema=schema) - rows = list(session.execute(sql)) + rows = list(session.execute(text(sql))) cmds += ["grant select on {fqrn} to PUBLIC".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] cmds += ["alter view {fqrn} owner to uta_admin".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] sql = "select concat(schemaname,'.',matviewname) as fqrn from pg_matviews where schemaname='{schema}'".format( schema=schema) - rows = list(session.execute(sql)) + rows = list(session.execute(text(sql))) cmds += ["grant select on {fqrn} to PUBLIC".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] cmds += ["alter materialized view {fqrn} owner to uta_admin".format( - fqrn=row["fqrn"]) for row in rows] + fqrn=row.fqrn) for row in rows] for cmd in sorted(cmds): logger.info(cmd) - session.execute(cmd) + session.execute(text(cmd)) session.commit() +def load_assoc_ac(session, opts, cf): + """ + Insert rows into `associated_accessions` table in the UTA database, + using data from a file written by sbin/assoc-acs-merge. 
+ """ + logger.info("load_assoc_ac") + + admin_role = cf.get("uta", "admin_role") + session.execute(text(f"set role {admin_role};")) + session.execute(text(f"set search_path = {usam.schema_name};")) + fname = opts["FILE"] + + with gzip.open(fname, "rt") as fhandle: + for file_row in csv.DictReader(fhandle, delimiter="\t"): + row = { + "origin": file_row["origin"], + "pro_ac": file_row["pro_ac"], + "tx_ac": file_row["tx_ac"], + } + aa, created = _get_or_insert( + session=session, + table=usam.AssociatedAccessions, + row=row, + row_identifier=('origin', 'tx_ac', 'pro_ac'), + ) + if created: + # If committing on every insert is too slow, we can + # look into committing in batches like load_txinfo does. + session.commit() + logger.info(f"Added: {aa.tx_ac}, {aa.pro_ac}, {aa.origin}") + else: + logger.info(f"Already exists: {file_row}") + # All fields should should match when unique identifiers match. + # Discrepancies should be investigated. + existing_row = { + "origin": aa.origin, + "pro_ac": aa.pro_ac, + "tx_ac": aa.tx_ac, + } + + def load_exonset(session, opts, cf): # exonsets and associated exons are loaded together update_period = 25 - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute( + text("set role {admin_role};".format(admin_role=cf.get("uta", "admin_role"))) + ) + session.execute(text("set search_path = " + usam.schema_name)) - n_rows = len(gzip.open(opts["FILE"], 'rt').readlines()) - 1 - esr = ufes.ExonSetReader(gzip.open(opts["FILE"], 'rt')) + n_rows = len(gzip.open(opts["FILE"], "rt").readlines()) - 1 + esr = ufes.ExonSetReader(gzip.open(opts["FILE"], "rt")) logger.info("opened " + opts["FILE"]) n_new = 0 n_unchanged = 0 n_deprecated = 0 + n_skipped = 0 n_errors = 0 for i_es, es in enumerate(esr): + skipped = False try: - n, o = _upsert_exon_set_record(session, es.tx_ac, es.alt_ac, es.strand, es.method, es.exons_se_i) - session.commit() + # determine if alignment and transcript have the same exon structure + tx_es = ( + session.query(usam.ExonSet) + .filter( + usam.ExonSet.tx_ac == es.tx_ac, + usam.ExonSet.alt_ac == es.tx_ac, + usam.ExonSet.alt_aln_method == "transcript", + ) + .one() + ) + tx_exon_count = len(tx_es.exons_se_i()) + aln_exon_count = len(es.exons_se_i.split(";")) + if tx_exon_count == aln_exon_count: + n, o = _upsert_exon_set_record( + session, es.tx_ac, es.alt_ac, es.strand, es.method, es.exons_se_i + ) + session.commit() + else: + raise ExonStructureMismatchError( + "Exon structure mismatch: {tx_exon_count} exons in transcript {es.tx_ac}; {aln_exon_count} in alignment {es.alt_ac}".format( + tx_exon_count=tx_exon_count, + aln_exon_count=aln_exon_count, + es=es, + ) + ) except IntegrityError as e: logger.exception(e) session.rollback() n_errors += 1 - finally: + except NoResultFound as e: + logger.exception(e) + logger.warning("NoResultFound for transcript ExonSet: {es.tx_ac}".format(es=es)) + skipped = True + except ExonStructureMismatchError as e: + logger.exception(e) + skipped = True + else: (no) = (n is not None, o is not None) if no == (True, False): n_new += 1 @@ -272,19 +377,30 @@ def load_exonset(session, opts, cf): n_deprecated += 1 elif no == (False, True): n_unchanged += 1 - + finally: + if skipped: + n_skipped += 1 if i_es % update_period == 0 or i_es + 1 == n_rows: - logger.info("{i_es}/{n_rows} {p:.1f}%; {n_new} new, {n_unchanged} unchanged, {n_deprecated} deprecated, {n_errors} n_errors".format( - i_es=i_es, n_rows=n_rows, - 
n_new=n_new, n_unchanged=n_unchanged, n_deprecated=n_deprecated, n_errors=n_errors, - p=(i_es + 1) / n_rows * 100)) + logger.info( + "{i_es}/{n_rows} {p:.1f}%; {n_new} new, {n_unchanged} unchanged, {n_deprecated} deprecated, {n_skipped} skipped, {n_errors} n_errors".format( + i_es=i_es, + n_rows=n_rows, + n_new=n_new, + n_unchanged=n_unchanged, + n_deprecated=n_deprecated, + n_skipped=n_skipped, + n_errors=n_errors, + p=(i_es + 1) / n_rows * 100, + ) + ) + session.commit() def load_geneinfo(session, opts, cf): - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) gir = ufgi.GeneInfoReader(gzip.open(opts["FILE"], 'rt')) logger.info("opened " + opts["FILE"]) @@ -292,40 +408,17 @@ def load_geneinfo(session, opts, cf): for i_gi, gi in enumerate(gir): session.merge( usam.Gene( + gene_id=gi.gene_id, hgnc=gi.hgnc, + symbol=gi.gene_symbol, maploc=gi.maploc, descr=gi.descr, summary=gi.summary, aliases=gi.aliases, + type=gi.type, + xrefs=gi.xrefs, )) - logger.info("Added {gi.hgnc} ({gi.summary})".format(gi=gi)) - session.commit() - - -def load_ncbi_geneinfo(session, opts, cf): - """ - import data as downloaded (by you) from - ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz - """ - - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) - - gip = uta.parsers.geneinfo.GeneInfoParser(gzip.open(opts["FILE"], 'rt')) - for gi in gip: - if gi["tax_id"] != "9606" or gi["Symbol_from_nomenclature_authority"] == "-": - continue - g = usam.Gene( - gene_id=gi["GeneID"], - hgnc=gi["Symbol_from_nomenclature_authority"], - maploc=gi["map_location"], - descr=gi["Full_name_from_nomenclature_authority"], - aliases=gi["Synonyms"], - strand=gi[""], - ) - session.add(g) - logger.info("loaded gene {g.hgnc} ({g.descr})".format(g=g)) + logger.debug("Added {gi.gene_symbol}: {gi.gene_id} ({gi.summary})".format(gi=gi)) session.commit() @@ -362,9 +455,9 @@ def _seqgene_recs_to_tx_info(ac, assy, recs): return ti - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) o_refseq = session.query(usam.Origin).filter( usam.Origin.name == "NCBI RefSeq").one() @@ -404,9 +497,9 @@ def load_origin(session, opts, cf): def _none_if_empty(s): return None if s == "" else s - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) orir = csv.DictReader(open(opts["FILE"]), delimiter='\t') for rec in orir: @@ -441,9 +534,9 @@ def load_seqinfo(session, opts, cf): max_len = int(2e6) - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + 
usam.schema_name)) n_rows = len(gzip.open(opts["FILE"]).readlines()) - 1 @@ -474,7 +567,7 @@ def _upsert_seq(si): for md5, si_iter in itertools.groupby(sorted(sir, key=lambda si: si.md5), key=lambda si: si.md5): sis = list(si_iter) - + # if sequence doesn't exist in sequence table, make it # this is to satisfy a FK dependency, which should be reconsidered si = sis[0] @@ -502,6 +595,7 @@ def _upsert_seq(si): session.merge(u_seqanno) else: # create the new annotation + logger.debug("creating seq_anno({si.origin},{si.ac},{si.md5})".format(si=si)) u_seqanno = usam.SeqAnno(origin_id=u_ori.origin_id, seq_id=si.md5, ac=si.ac, descr=si.descr) session.add(u_seqanno) @@ -512,6 +606,7 @@ def _upsert_seq(si): logger.info("{n_created} annotations created/{i_md5} sequences seen ({p:.1f}%)/{n_rows} sequences total".format( n_created=n_created, i_md5=i_md5, n_rows=n_rows, md5=md5, p=i_md5 / n_rows * 100)) session.commit() + session.commit() def load_sequences(session, opts, cf): @@ -521,9 +616,9 @@ def load_sequences(session, opts, cf): # 2e6 was chosen empirically based on sizes of NMs, NGs, NWs, NTs, NCs max_len = int(2e6) - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) sf = _get_seqfetcher(cf) @@ -567,9 +662,9 @@ def _fetch_first(acs): def load_sql(session, opts, cf): """Create views""" - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) for fn in opts["FILES"]: logger.info("loading " + fn) @@ -596,9 +691,9 @@ def _fetch_origin_by_name(name): tir = ufti.TxInfoReader(gzip.open(opts["FILE"], 'rt')) logger.info("opened " + opts["FILE"]) - session.execute("set role {admin_role};".format( - admin_role=cf.get("uta", "admin_role"))) - session.execute("set search_path = " + usam.schema_name) + session.execute(text("set role {admin_role};".format( + admin_role=cf.get("uta", "admin_role")))) + session.execute(text("set search_path = " + usam.schema_name)) n_new = 0 n_unchanged = 0 @@ -612,8 +707,10 @@ def _fetch_origin_by_name(name): if ti.cds_se_i: cds_start_i, cds_end_i = map(int, ti.cds_se_i.split(",")) + codon_table = ti.codon_table else: cds_start_i = cds_end_i = None + codon_table = None cds_md5 = None # 1. Fetch or make the Transcript record @@ -633,6 +730,29 @@ def _fetch_origin_by_name(name): u_tx = None n_cds_changed += 1 + if ti.transl_except: + # if the transl_except exists, make sure it exists in the database. 
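+            # transl_except is a semicolon-joined string, e.g. (pos:333..335,aa:Sec);(pos:1017,aa:TERM)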
+ te_list = _create_translation_exceptions( + transcript=ti.ac, transl_except_list=ti.transl_except.split(";") + ) + for te_data in te_list: + te, created = _get_or_insert( + session=session, + table=usam.TranslationException, + row=te_data, + row_identifier=("tx_ac", "start_position", "end_position", "amino_acid"), + ) + if created: + logger.info( + f"TranslationException added: {te.tx_ac}, {te.start_position}, {te.end_position}, {te.amino_acid}" + ) + else: + logger.info( + f"TranslationException already exists: {te.tx_ac}, {te.start_position}, {te.end_position}, {te.amino_acid}" + ) + + + # state: u_tx is set if a transcript was found and was # unchanged, or None if 1) no such was found or 2) was found # and had updated CDS coords. @@ -654,17 +774,24 @@ def _fetch_origin_by_name(name): u_tx = usam.Transcript( ac=ti.ac, origin=ori, - hgnc=ti.hgnc, + gene_id=ti.gene_id, cds_start_i=cds_start_i, cds_end_i=cds_end_i, cds_md5=cds_md5, + codon_table=codon_table, ) session.add(u_tx) - if u_tx.hgnc != ti.hgnc: - logger.warn("{ti.ac}: HGNC symbol changed from {u_tx.hgnc} to {ti.hgnc}".format( - u_tx=u_tx, ti=ti)) - u_tx.hgnc = ti.hgnc + if ti.transl_except: + # if transl_except exists, it looks like this: + # (pos:333..335,aa:Sec);(pos:1017,aa:TERM) + transl_except_list = ti.transl_except.split(';') + te_list = _create_translation_exceptions(transcript=ti.ac, transl_except_list=transl_except_list) + for te in te_list: + session.add(usam.TranslationException(**te)) + + if u_tx.gene_id != ti.gene_id: + logger.warning("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format(u_tx=u_tx, ti=ti)) # state: transcript now exists, either existing or freshly-created @@ -675,7 +802,7 @@ def _fetch_origin_by_name(name): if no == (True, False): n_new += 1 elif no == (True, True): - logger.warn("Transcript {ti.ac} exon structure changed".format(ti=ti)) + logger.warning("Transcript {ti.ac} exon structure changed".format(ti=ti)) n_exons_changed += 1 elif no == (False, True): logger.debug("Transcript {ti.ac} exon structure unchanged".format(ti=ti)) @@ -688,14 +815,46 @@ def _fetch_origin_by_name(name): i_ti=i_ti, n_rows=n_rows, n_new=n_new, n_unchanged=n_unchanged, n_cds_changed=n_cds_changed, n_exons_changed=n_exons_changed, p=(i_ti + 1) / n_rows * 100)) - +def _create_translation_exceptions(transcript: str, transl_except_list: List[str]) -> List[Dict]: + """ + Create TranslationException object data where start and end positions are 0-based, from transl_except data that is 1-based. + For example, [(pos:333..335,aa:Sec), (pos:1017,aa:TERM)] should result in start and end positions [(332, 335), (1016, 1017)] + """ + result = [] + + for te in transl_except_list: + # remove parens + te = te.replace('(','').replace(')','') + + # extract positions + pos_str, aa_str = te.split(',') + pos_str = pos_str.removeprefix('pos:') + if '..' 
in pos_str:
+                start_position, _, end_position = pos_str.partition('..')
+            else:
+                start_position = end_position = pos_str
+
+            # extract amino acid
+            amino_acid = aa_str.removeprefix('aa:')
+
+            result.append(
+                {
+                    'tx_ac': transcript,
+                    'start_position': int(start_position) - 1,
+                    'end_position': int(end_position),
+                    'amino_acid': amino_acid,
+                }
+            )
+
+    return result
+
+
 def refresh_matviews(session, opts, cf):
-    session.execute("set role {admin_role};".format(
-        admin_role=cf.get("uta", "admin_role")))
-    session.execute("set search_path = " + usam.schema_name)
+    session.execute(text("set role {admin_role};".format(
+        admin_role=cf.get("uta", "admin_role"))))
+    session.execute(text("set search_path = " + usam.schema_name))
 
     # matviews must be updated in dependency order.  Unfortunately,
     # it's difficult to determine this programmatically.  The "right"
@@ -714,13 +873,12 @@ def refresh_matviews(session, opts, cf):
         "refresh materialized view exon_set_exons_fp_mv",
         "refresh materialized view tx_exon_set_summary_mv",
         "refresh materialized view tx_def_summary_mv",
-        # "refresh materialized view tx_aln_cigar_mv",
-        # "refresh materialized view tx_aln_summary_mv",
+        "refresh materialized view tx_exon_aln_mv",
     ]
 
     for cmd in cmds:
         logger.info(cmd)
-        session.execute(cmd)
+        session.execute(text(cmd))
     session.commit()
@@ -740,6 +898,34 @@ def _get_seqrepo(cf):
 _get_seqfetcher = _get_seqrepo
 
 
+def _get_or_insert(
+    session: Session,
+    table: type[usam.Base],
+    row: dict[str, Any],
+    row_identifier: str | tuple[str, ...],
+) -> tuple[usam.Base, bool]:
+    """
+    Returns a sqlalchemy model of the inserted or fetched row.
+
+    `session` is a sqlalchemy session.
+    `table` is the database table in which to insert `row`.
+    `row` is a dict of column name/value pairs to insert into the table.
+    `row_identifier` is a column name or tuple of column names whose values in `row` identify a matching existing row in the table.
+
+    sqlalchemy.orm.exc.MultipleResultsFound may be raised if `row_identifier` does not uniquely identify a row.
+    KeyError may be raised if `row_identifier` refers to columns not present as keys in `row`.
+ sqlalchemy.exc.IntegrityError (raised from psycopg2.errors.ForeignKeyViolation) may be raised if a foreign key reference does not exist + """ + row_filter = {ri: row[ri] for ri in row_identifier} + try: + row_instance = session.query(table).filter_by(**row_filter).one() + created = False + except NoResultFound: + row_instance = table(**row) + session.add(row_instance) + created = True + return row_instance, created + def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess): @@ -747,9 +933,9 @@ def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess): returns tuple of (new_record, old_record) as follows: (new, None) -- no prior record; new was inserted - (None, old) -- prior record and unchaged; nothing was inserted + (None, old) -- prior record and unchanged; nothing was inserted (new, old) -- prior record existed and was changed - + """ key = (tx_ac, alt_ac, method) @@ -781,9 +967,29 @@ def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess): usam.ExonSet.alt_aln_method == alt_aln_method_with_hash, ) if existing.count() == 1: + logger.warning( + "Exon set {tx_ac}/{alt_ac} with method {method} already exists with hash {esh}".format( + tx_ac=tx_ac, + alt_ac=alt_ac, + method=method, + esh=alt_aln_method_with_hash, + ) + ) return (None, existing[0]) # update aln_method to add a unique exon set hash based on the *existing* exon set string + logger.warning( + "Exon set {tx_ac}/{alt_ac} with method {method} already exists, but with different exons; " + "existing exon set: {es_ess}; new exon set: {ess}; updated alt_aln_method of exonset to " + "{alt_aln_method_with_hash}".format( + tx_ac=tx_ac, + alt_ac=alt_ac, + method=method, + es_ess=es_ess, + ess=ess, + alt_aln_method_with_hash=alt_aln_method_with_hash, + ) + ) es.alt_aln_method = alt_aln_method_with_hash session.flush() old_es = es diff --git a/src/uta/models.py b/src/uta/models.py index 2305666..a8ec1dd 100644 --- a/src/uta/models.py +++ b/src/uta/models.py @@ -6,21 +6,22 @@ import sqlalchemy as sa import sqlalchemy.orm as sao +import sqlalchemy.types +import sqlalchemy.sql.functions from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.dialects import postgresql ############################################################################ # schema name support # also see etc/uta.conf -schema_version = "1.1" +schema_version = "1.2" use_schema = True if use_schema: - schema_name = "uta_" + schema_version.replace(".","_") - schema_name_dot = schema_name + "." 
+ schema_name = "uta" else: schema_name = None - schema_name_dot = "" ############################################################################ @@ -97,13 +98,17 @@ class Gene(Base): __tablename__ = "gene" # columns: - hgnc = sa.Column(sa.Text, primary_key=True) + gene_id = sa.Column(sa.Text, primary_key=True) + hgnc = sa.Column(sa.Text, nullable=False, index=True) + symbol = sa.Column(sa.Text, nullable=False, index=True) maploc = sa.Column(sa.Text) descr = sa.Column(sa.Text) summary = sa.Column(sa.Text) aliases = sa.Column(sa.Text) added = sa.Column( sa.DateTime, nullable=False, default=datetime.datetime.now()) + type = sa.Column(sa.Text) + xrefs = sa.Column(sa.Text) # methods: @@ -123,17 +128,43 @@ class Transcript(Base): ac = sa.Column(sa.Text, primary_key=True) origin_id = sa.Column( sa.Integer, sa.ForeignKey("origin.origin_id", onupdate="CASCADE", ondelete="CASCADE"), nullable=False, index=True) - hgnc = sa.Column(sa.Text) # , sa.ForeignKey("gene.hgnc")) - cds_start_i = sa.Column(sa.Integer) #, nullable=False) - cds_end_i = sa.Column(sa.Integer) #, nullable=False) + gene_id = sa.Column(sa.Text, sa.ForeignKey("gene.gene_id"), nullable=False, index=True) + hgnc = sa.Column(sa.Text, nullable=True, index=True) + cds_start_i = sa.Column(sa.Integer) + cds_end_i = sa.Column(sa.Integer) cds_md5 = sa.Column(sa.Text, index=True) added = sa.Column( sa.DateTime, default=datetime.datetime.now(), nullable=False) + codon_table = sa.Column(sa.Text, nullable=True, server_default='1') # 1 = standard, 2 = mitochondrial # relationships: origin = sao.relationship("Origin", backref="transcripts") +class TranslationException(Base): + """ + Represents `transl_except` annotations on CDS features in transcript records from NCBI. + + Examples: + /transl_except=(pos:333..335,aa:Sec) + /transl_except=(pos:1017,aa:TERM) + """ + + __tablename__ = "translation_exception" + __table_args__ = ( + sa.CheckConstraint("start_position <= end_position", "start_less_than_or_equal_to_end"), + ) + + translation_exception_id = sa.Column(sa.Integer, autoincrement=True, primary_key=True) + tx_ac = sa.Column(sa.Text, sa.ForeignKey("transcript.ac", onupdate="CASCADE", ondelete="CASCADE"), nullable=False) + start_position = sa.Column(sa.Integer, nullable=False) + end_position = sa.Column(sa.Integer, nullable=False) + amino_acid = sa.Column(sa.Text, nullable=False) + + # relationships: + transcript = sao.relationship("Transcript", backref="translation_exceptions") + + class ExonSet(Base): __tablename__ = "exon_set" __table_args__ = ( @@ -208,8 +239,8 @@ class ExonAln(Base): cigar = sa.Column(sa.Text, nullable=False) added = sa.Column( sa.DateTime, default=datetime.datetime.now(), nullable=False) - tx_aseq = sa.Column(sa.Text, nullable=False) - alt_aseq = sa.Column(sa.Text, nullable=False) + tx_aseq = sa.Column(sa.Text, nullable=True) + alt_aseq = sa.Column(sa.Text, nullable=True) # relationships: tx_exon = sao.relationship( @@ -220,6 +251,27 @@ class ExonAln(Base): # methods: +class AssociatedAccessions(Base): + __tablename__ = "associated_accessions" + __table_args__ = ( + sa.Index("unique_pair_in_origin", "origin", "tx_ac", "pro_ac", unique=True), + sa.Index("associated_accessions_pro_ac", "pro_ac"), + sa.Index("associated_accessions_tx_ac", "tx_ac"), + {"comment": "transcript-protein accession pairs associated in source databases"}, + ) + + # columns: + associated_accession_id = sa.Column(sa.Integer, primary_key=True, autoincrement=True) + tx_ac = sa.Column(sa.Text, nullable=False) + pro_ac = sa.Column(sa.Text, nullable=False) + 
origin = sa.Column(sa.Text, nullable=False) + added = sa.Column( + postgresql.TIMESTAMP(timezone=True), + server_default=sqlalchemy.sql.functions.now(), + nullable=False, + ) + + # # Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta) ## diff --git a/src/uta/parsers/seqrecord.py b/src/uta/parsers/seqrecord.py new file mode 100644 index 0000000..9168b42 --- /dev/null +++ b/src/uta/parsers/seqrecord.py @@ -0,0 +1,189 @@ +from collections import defaultdict +from functools import cached_property +from typing import List, Optional + +import Bio.SeqRecord +from Bio.SeqFeature import SeqFeature + + +class SeqRecordFeatureError(Exception): + """Raised when SeqRecord does not have the expected features.""" + + +class SeqRecordFacade: + def __init__(self, seqrecord: Bio.SeqRecord.SeqRecord): + self._sr = seqrecord + + @cached_property + def features_by_type(self) -> dict[str, list]: + result = defaultdict(list) + for feat in self._sr.features: + result[feat.type].append(feat) + return result + + @cached_property + def cds_feature(self) -> Optional[SeqFeature]: + """ + Returns the CDS feature for any coding transcript, None for any non-coding transcript. + Some NCBI records will contain multiple CDS features. In these one CDS describes a protein + with accession and protein sequence, the other CDS features describes a pseudogene. This method + will preferentially choose the CDS feature with a protein sequence. + Example: + CDS 422..778 + /gene="C6orf119" + /gene_synonym="dJ427A4.2" + /codon_start=1 + /product="chromosome 6 open reading frame 119" + /protein_id="NP_001012240.1" + /db_xref="GI:59276067" + /db_xref="GeneID:353267" + /translation="MTDTAEAVPNFEEMFASRFTENDKEYQEYLKRPPESPPIVEEWN + SRAGGNQRNRGNRLQDNRQFRGRDNRWGWPSDNRSNQWHGRSWGNNYPQHRQEPYYPQ + QYGHYGYNQRPPYGYY" + CDS 422..775 + /locus_tag="RP3-427A4.2-001" + /note="match: proteins: Q9BTL3 Q9CQY2 Q9CWI1" + /pseudo + /codon_start=1 + /product="Novel pseudogene" + """ + cds_features = self.features_by_type.get("CDS") + if cds_features is None: + return None + else: + # Prefer CDS with protein accession and translated sequence. 
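+            # keep only CDS features that carry both protein_id and translation qualifiers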
+ translated_cds_features = [ + f + for f in cds_features + if all([key in f.qualifiers for key in ("protein_id", "translation")]) + ] + if len(translated_cds_features) != 1: + raise SeqRecordFeatureError("Expected one `CDS` feature at most") + return translated_cds_features[0] + + @cached_property + def gene_feature(self) -> SeqFeature: + """Returns the gene feature, which should exist for all transcripts.""" + gene_features = self.features_by_type.get("gene") + if gene_features is None or len(gene_features) != 1: + raise SeqRecordFeatureError(f"Expected exactly one `gene` feature, for {self.id} " + f"found {len(gene_features) if gene_features is not None else None}") + + return gene_features[0] + + @property + def id(self): + return self._sr.id + + @property + def gene_symbol(self): + return self.gene_feature.qualifiers["gene"][0] + + @property + def gene_synonyms(self): + if "gene_synonym" in self.gene_feature.qualifiers: + return [gs.strip() for gs in self.gene_feature.qualifiers["gene_synonym"][0].split(";")] + else: + return [] + + @property + def gene_type(self): + if self.cds_feature: + return "protein-coding" + elif "ncRNA" in self.features_by_type: + return "ncRNA" + elif "pseudo" in self.features_by_type: + return "pseudo" + elif "rRNA" in self.features_by_type: + return "rRNA" + elif "snoRNA" in self.features_by_type: + return "snoRNA" + elif "tRNA" in self.features_by_type: + return "tRNA" + elif "scRNA" in self.features_by_type: + return "scRNA" + elif "snRNA" in self.features_by_type: + return "snRNA" + elif "misc_RNA" in self.features_by_type: + return "misc_RNA" + elif "other" in self.features_by_type: + return "other" + else: + return "unknown" + + @property + def gene_id(self): + # db_xref="GeneID:1234" + db_xrefs = self.gene_feature.qualifiers["db_xref"] + gene_ids = [x.partition(":")[2] for x in db_xrefs if x.startswith("GeneID:")] + assert len(gene_ids) == 1 + return gene_ids[0] + + @property + def db_xrefs(self): + """ + gene 1..4577 + /gene="A2M" + /gene_synonym="DKFZp779B086; FWP007; S863-7" + /db_xref="GeneID:2" + /db_xref="HPRD:00072" + /db_xref="MIM:103950" + """ + db_xrefs = self.gene_feature.qualifiers["db_xref"] + return [xref for xref in db_xrefs] + + @property + def cds_se_i(self): + if self.cds_feature is not None: + return self.cds_feature.location.start.real, self.cds_feature.location.end.real + else: + return None + + @property + def cds_product(self): + if self.cds_feature is not None: + return self.cds_feature.qualifiers["product"][0] + else: + return None + + @property + def cds_protein_id(self): + if self.cds_feature is not None: + return self.cds_feature.qualifiers["protein_id"][0] + else: + return None + + @property + def cds_translation(self): + if self.cds_feature is not None: + return str(self.cds_feature.qualifiers["translation"][0]) + else: + return None + + @property + def exons_se_i(self): + se_i = [] + if "exon" in self.features_by_type: + exons = self.features_by_type["exon"] + se_i = [(f.location.start.real, f.location.end.real) for f in exons] + return se_i + + @property + def codon_table(self) -> Optional[str]: + if self.cds_feature is None: + return None + else: + # default codon table is the standard table, aka "1" + # https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi + return "1" + + @property + def transl_except(self) -> Optional[List[str]]: + if self.cds_feature is None: + return None + else: + return self.cds_feature.qualifiers.get("transl_except") + + @property + def feature_seq(self): + return str(self._sr.seq) diff 
--git a/src/uta/tools/eutils.py b/src/uta/tools/eutils.py new file mode 100644 index 0000000..7abfaa4 --- /dev/null +++ b/src/uta/tools/eutils.py @@ -0,0 +1,35 @@ +from enum import Enum + +import requests + +from uta import EutilsDownloadError + + +class NcbiFileFormatEnum(str, Enum): + FASTA = "fasta" + GENBANK = "gb" + + +def download_from_eutils(accession: str, file_format: NcbiFileFormatEnum, output_file: str) -> None: + """ + Download a file from NCBI using the eutils endpoint. + Args: + - accession: NCBI accession ID + - file_format: File format to download ("fasta" or "gb") + - output_file: Path to the file where the downloaded content will be saved + """ + + base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" + params = { + "db": "nuccore", + "id": accession, + "retmode": "text", + "rettype": file_format + } + response = requests.get(base_url, params=params) + + if response.status_code == 200: + with open(output_file, 'w') as file: + file.write(response.text) + else: + raise EutilsDownloadError(f"Failed to download {file_format} file for {accession}. HTTP status code: {response.status_code}") \ No newline at end of file diff --git a/src/uta/tools/file_utils.py b/src/uta/tools/file_utils.py new file mode 100644 index 0000000..78d8d3d --- /dev/null +++ b/src/uta/tools/file_utils.py @@ -0,0 +1,12 @@ +import gzip +from contextlib import contextmanager + + +@contextmanager +def open_file(filename): + if filename.endswith(".gz"): + with gzip.open(filename, "rt") as f: + yield f + else: + with open(filename) as f: + yield f diff --git a/tests/data/NC_012920.1.gbff b/tests/data/NC_012920.1.gbff new file mode 100644 index 0000000..afea7f3 --- /dev/null +++ b/tests/data/NC_012920.1.gbff @@ -0,0 +1,1165 @@ +LOCUS NC_012920 16569 bp DNA circular PRI 03-APR-2023 +DEFINITION Homo sapiens mitochondrion, complete genome. +ACCESSION NC_012920 AC_000021 +VERSION NC_012920.1 +DBLINK BioProject: PRJNA927338 +KEYWORDS RefSeq. +SOURCE mitochondrion Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +REFERENCE 1 (bases 324 to 743) + AUTHORS Andrews,R.M., Kubacka,I., Chinnery,P.F., Lightowlers,R.N., + Turnbull,D.M. and Howell,N. + TITLE Reanalysis and revision of the Cambridge reference sequence for + human mitochondrial DNA + JOURNAL Nat. Genet. 23 (2), 147 (1999) + PUBMED 10508508 +REFERENCE 2 (bases 15888 to 15954) + AUTHORS Anderson,S., Bankier,A.T., Barrell,B.G., de Bruijn,M.H., + Coulson,A.R., Drouin,J., Eperon,I.C., Nierlich,D.P., Roe,B.A., + Sanger,F., Schreier,P.H., Smith,A.J., Staden,R. and Young,I.G. + TITLE Sequence and organization of the human mitochondrial genome + JOURNAL Nature 290 (5806), 457-465 (1981) + PUBMED 7219534 +REFERENCE 3 (bases 1 to 16569) + CONSRTM NCBI Genome Project + TITLE Direct Submission + JOURNAL Submitted (08-JUL-2009) National Center for Biotechnology + Information, NIH, Bethesda, MD 20894, USA +REFERENCE 4 (bases 1 to 16569) + AUTHORS Kogelnik,A.M. and Lott,M.T. + TITLE Direct Submission + JOURNAL Submitted (24-AUG-2006) Mitomap.org, Center for Molecular and + Mitochondrial Medicine and Genetics (MAMMAG) University of + California, University of California, Irvine, Irvine, CA + 92697-3940, USA + REMARK Sequence update by submitter +REFERENCE 5 (bases 1 to 16569) + AUTHORS Kogelnik,A.M. and Lott,M.T. 
+ TITLE Direct Submission + JOURNAL Submitted (18-APR-1997) Center for Molecular Medicine, Emory + University School of Medicine, 1462 Clifton Road, Suite 420, + Atlanta, GA 30322, USA + REMARK sequence updated +COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final + NCBI review. The reference sequence was derived from J01415. + + On Jul 8, 2009 this sequence version replaced AC_000021.2. + This sequence is a corrected version of the HUMMTCG reference + sequence. The original Cambridge reference sequence (CRS) is + preserved as GenBank J01415 gi:337188 [PMID:7219534]. Corrections + have been made and annotated per the re-sequencing of the original + material by Andrews et al [PMID:10508508]. + + This Revised Cambridge Reference Sequence (rCRS) has eighteen + specific corrections or confirmations of the original 1981 sequence + of Anderson et al [PMID:7219534]. Seven nucleotides are confirmed + as rare polymorphisms, maintained as: 263A, 311C-315C, 750A, 1438A, + 4769A, 8860A, and 15326A. Eleven nucleotides are error + corrections: 3107del, 3423T, 4985A, 9559C, 11335C, 13702C, 14199T, + 14272C, 14365C, 14368C, and 14766C. These 11 errors in the + original Cambridge sequence were determined to be either outright + sequencing errors (8 instances) or due to the presence of bovine + DNA (2 instances) or HeLa DNA (1 instance) mixed in with the + original human placental DNA [PMID:10508508]. HISTORICAL + NUCLEOTIDE NUMBERS ARE MAINTAINED by indicating 3107del as 'N'. + A summary table of the reanalysis data is available online at + http://www.mitomap.org/MITOMAP/CambridgeReanalysis + + L-strand is shown. + COMPLETENESS: full length. +FEATURES Location/Qualifiers + source 1..16569 + /organism="Homo sapiens" + /organelle="mitochondrion" + /mol_type="genomic DNA" + /isolation_source="caucasian" + /db_xref="taxon:9606" + /tissue_type="placenta" + /country="United Kingdom: Great Britain" + /note="this is the rCRS" + D-loop complement(join(16024..16569,1..576)) + gene 577..647 + /gene="TRNF" + /nomenclature="Official Symbol: MT-TF | Name: + mitochondrially encoded tRNA phenylalanine | Provided by: + HGNC:HGNC:7481" + /db_xref="GeneID:4558" + /db_xref="HGNC:HGNC:7481" + /db_xref="MIM:590070" + tRNA 577..647 + /gene="TRNF" + /product="tRNA-Phe" + /note="NAR: 1455" + /anticodon=(pos:611..613,aa:Phe,seq:gaa) + /codon_recognized="UUC" + /db_xref="GeneID:4558" + /db_xref="HGNC:HGNC:7481" + /db_xref="MIM:590070" + gene 648..1601 + /gene="RNR1" + /gene_synonym="MTRNR1" + /nomenclature="Official Symbol: MT-RNR1 | Name: + mitochondrially encoded 12S RNA | Provided by: + HGNC:HGNC:7470" + /db_xref="GeneID:4549" + /db_xref="HGNC:HGNC:7470" + /db_xref="MIM:561000" + rRNA 648..1601 + /gene="RNR1" + /gene_synonym="MTRNR1" + /product="s-rRNA" + /note="12S rRNA; 12S ribosomal RNA" + /db_xref="GeneID:4549" + /db_xref="HGNC:HGNC:7470" + /db_xref="MIM:561000" + gene 1602..1670 + /gene="TRNV" + /gene_synonym="MTTV" + /nomenclature="Official Symbol: MT-TV | Name: + mitochondrially encoded tRNA valine | Provided by: + HGNC:HGNC:7500" + /db_xref="GeneID:4577" + /db_xref="HGNC:HGNC:7500" + /db_xref="MIM:590105" + tRNA 1602..1670 + /gene="TRNV" + /gene_synonym="MTTV" + /product="tRNA-Val" + /note="NAR: 2053" + /anticodon=(pos:1633..1635,aa:Val,seq:tac) + /codon_recognized="GUA" + /db_xref="GeneID:4577" + /db_xref="HGNC:HGNC:7500" + /db_xref="MIM:590105" + gene 1671..3229 + /gene="RNR2" + /gene_synonym="MTRNR2" + /nomenclature="Official Symbol: MT-RNR2 | Name: + mitochondrially encoded 16S RNA | Provided by: 
+ HGNC:HGNC:7471" + /db_xref="GeneID:4550" + /db_xref="HGNC:HGNC:7471" + /db_xref="MIM:561010" + rRNA 1671..3229 + /gene="RNR2" + /gene_synonym="MTRNR2" + /product="l-rRNA" + /note="16S ribosomal RNA; 16S rRNA" + /db_xref="GeneID:4550" + /db_xref="HGNC:HGNC:7471" + /db_xref="MIM:561010" + misc_feature 3107 + /note="preserves historical genome annotation numbering" + gene 3230..3304 + /gene="TRNL1" + /gene_synonym="MTTL1" + /nomenclature="Official Symbol: MT-TL1 | Name: + mitochondrially encoded tRNA leucine 1 (UUA/G) | Provided + by: HGNC:HGNC:7490" + /db_xref="GeneID:4567" + /db_xref="HGNC:HGNC:7490" + /db_xref="MIM:590050" + tRNA 3230..3304 + /gene="TRNL1" + /gene_synonym="MTTL1" + /product="tRNA-Leu" + /note="NAR: 1054" + /anticodon=(pos:3265..3267,aa:Leu,seq:taa) + /codon_recognized="UUR" + /db_xref="GeneID:4567" + /db_xref="HGNC:HGNC:7490" + /db_xref="MIM:590050" + gene 3307..4262 + /gene="ND1" + /gene_synonym="MTND1" + /nomenclature="Official Symbol: MT-ND1 | Name: + mitochondrially encoded NADH dehydrogenase 1 | Provided + by: HGNC:HGNC:7455" + /db_xref="GeneID:4535" + /db_xref="HGNC:HGNC:7455" + /db_xref="MIM:516000" + CDS 3307..4262 + /gene="ND1" + /gene_synonym="MTND1" + /note="NADH dehydrogenase, subunit 1 (complex I); TAA stop + codon is completed by the addition of 3' A residues to the + mRNA" + /codon_start=1 + /transl_except=(pos:4261..4262,aa:TERM) + /transl_table=2 + /product="NADH dehydrogenase subunit 1" + /protein_id="YP_003024026.1" + /db_xref="GeneID:4535" + /db_xref="HGNC:HGNC:7455" + /db_xref="MIM:516000" + /translation="MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYG + LLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPNPLVNLNLGL + LFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYEVTLAIILLSTLLMSGSF + NLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTPFDLAEGESELVSGFNIEYAAG + PFALFFMAEYTNIIMMNTLTTTIFLGTTYDALSPELYTTYFVTKTLLLTSLFLWIRTA + YPRFRYDQLMHLLWKNFLPLTLALLMWYVSMPITISSIPPQT" + gene 4263..4331 + /gene="TRNI" + /gene_synonym="MTTI" + /nomenclature="Official Symbol: MT-TI | Name: + mitochondrially encoded tRNA isoleucine | Provided by: + HGNC:HGNC:7488" + /db_xref="GeneID:4565" + /db_xref="HGNC:HGNC:7488" + /db_xref="MIM:590045" + tRNA 4263..4331 + /gene="TRNI" + /gene_synonym="MTTI" + /product="tRNA-Ile" + /note="NAR: 0997" + /anticodon=(pos:4292..4294,aa:Ile,seq:gat) + /codon_recognized="AUC" + /db_xref="GeneID:4565" + /db_xref="HGNC:HGNC:7488" + /db_xref="MIM:590045" + gene complement(4329..4400) + /gene="TRNQ" + /gene_synonym="MTTQ" + /nomenclature="Official Symbol: MT-TQ | Name: + mitochondrially encoded tRNA glutamine | Provided by: + HGNC:HGNC:7495" + /db_xref="GeneID:4572" + /db_xref="HGNC:HGNC:7495" + /db_xref="MIM:590030" + tRNA complement(4329..4400) + /gene="TRNQ" + /gene_synonym="MTTQ" + /product="tRNA-Gln" + /note="NAR: 0597" + /anticodon=(pos:complement(4365..4367),aa:Gln,seq:ttg) + /codon_recognized="CAA" + /db_xref="GeneID:4572" + /db_xref="HGNC:HGNC:7495" + /db_xref="MIM:590030" + gene 4402..4469 + /gene="TRNM" + /gene_synonym="MTTM" + /nomenclature="Official Symbol: MT-TM | Name: + mitochondrially encoded tRNA methionine | Provided by: + HGNC:HGNC:7492" + /db_xref="GeneID:4569" + /db_xref="HGNC:HGNC:7492" + /db_xref="MIM:590065" + tRNA 4402..4469 + /gene="TRNM" + /gene_synonym="MTTM" + /product="tRNA-Met" + /note="NAR: 1297" + /anticodon=(pos:4432..4434,aa:Met,seq:cat) + /codon_recognized="AUG" + /db_xref="GeneID:4569" + /db_xref="HGNC:HGNC:7492" + /db_xref="MIM:590065" + gene 4470..5511 + /gene="ND2" + /gene_synonym="MTND2" + /nomenclature="Official Symbol: MT-ND2 | Name: + 
mitochondrially encoded NADH dehydrogenase 2 | Provided + by: HGNC:HGNC:7456" + /db_xref="GeneID:4536" + /db_xref="HGNC:HGNC:7456" + /db_xref="MIM:516001" + CDS 4470..5511 + /gene="ND2" + /gene_synonym="MTND2" + /note="TAA stop codon is completed by the addition of 3' A + residues to the mRNA" + /codon_start=1 + /transl_except=(pos:5511,aa:TERM) + /transl_table=2 + /product="NADH dehydrogenase subunit 2" + /protein_id="YP_003024027.1" + /db_xref="GeneID:4536" + /db_xref="HGNC:HGNC:7456" + /db_xref="MIM:516001" + /translation="MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVL + TKKMNPRSTEAAIKYFLTQATASMILLMAILFNNMLSGQWTMTNTTNQYSSLMIMMAM + AMKLGMAPFHFWVPEVTQGTPLTSGLLLLTWQKLAPISIMYQISPSLNVSLLLTLSIL + SIMAGSWGGLNQTQLRKILAYSSITHMGWMMAVLPYNPNMTILNLTIYIILTTTAFLL + LNLNSSTTTLLLSRTWNKLTWLTPLIPSTLLSLGGLPPLTGFLPKWAIIEEFTKNNSL + IIPTIMATITLLNLYFYLRLIYSTSITLLPMSNNVKMKWQFEHTKPTPFLPTLIALTT + LLLPISPFMLMIL" + gene 5512..5579 + /gene="TRNW" + /gene_synonym="MTTW" + /nomenclature="Official Symbol: MT-TW | Name: + mitochondrially encoded tRNA tryptophan | Provided by: + HGNC:HGNC:7501" + /db_xref="GeneID:4578" + /db_xref="HGNC:HGNC:7501" + /db_xref="MIM:590095" + tRNA 5512..5579 + /gene="TRNW" + /gene_synonym="MTTW" + /product="tRNA-Trp" + /note="NAR: 1897" + /anticodon=(pos:5544..5546,aa:Trp,seq:tca) + /codon_recognized="UGA" + /db_xref="GeneID:4578" + /db_xref="HGNC:HGNC:7501" + /db_xref="MIM:590095" + gene complement(5587..5655) + /gene="TRNA" + /gene_synonym="MTTA" + /nomenclature="Official Symbol: MT-TA | Name: + mitochondrially encoded tRNA alanine | Provided by: + HGNC:HGNC:7475" + /db_xref="GeneID:4553" + /db_xref="HGNC:HGNC:7475" + /db_xref="MIM:590000" + tRNA complement(5587..5655) + /gene="TRNA" + /gene_synonym="MTTA" + /product="tRNA-Ala" + /note="NAR: 0097" + /anticodon=(pos:complement(5623..5625),aa:Ala,seq:tgc) + /codon_recognized="GCA" + /db_xref="GeneID:4553" + /db_xref="HGNC:HGNC:7475" + /db_xref="MIM:590000" + gene complement(5657..5729) + /gene="TRNN" + /gene_synonym="MTTN" + /nomenclature="Official Symbol: MT-TN | Name: + mitochondrially encoded tRNA asparagine | Provided by: + HGNC:HGNC:7493" + /db_xref="GeneID:4570" + /db_xref="HGNC:HGNC:7493" + /db_xref="MIM:590010" + tRNA complement(5657..5729) + /gene="TRNN" + /gene_synonym="MTTN" + /product="tRNA-Asn" + /note="NAR: 0297" + /anticodon=(pos:complement(5694..5696),aa:Asn,seq:gtt) + /codon_recognized="AAC" + /db_xref="GeneID:4570" + /db_xref="HGNC:HGNC:7493" + /db_xref="MIM:590010" + gene complement(5761..5826) + /gene="TRNC" + /gene_synonym="MTTC" + /nomenclature="Official Symbol: MT-TC | Name: + mitochondrially encoded tRNA cysteine | Provided by: + HGNC:HGNC:7477" + /db_xref="GeneID:4511" + /db_xref="HGNC:HGNC:7477" + /db_xref="MIM:590020" + tRNA complement(5761..5826) + /gene="TRNC" + /gene_synonym="MTTC" + /product="tRNA-Cys" + /note="NAR: 0497" + /anticodon=(pos:complement(5796..5798),aa:Cys,seq:gca) + /codon_recognized="UGC" + /db_xref="GeneID:4511" + /db_xref="HGNC:HGNC:7477" + /db_xref="MIM:590020" + gene complement(5826..5891) + /gene="TRNY" + /gene_synonym="MTTY" + /nomenclature="Official Symbol: MT-TY | Name: + mitochondrially encoded tRNA tyrosine | Provided by: + HGNC:HGNC:7502" + /db_xref="GeneID:4579" + /db_xref="HGNC:HGNC:7502" + /db_xref="MIM:590100" + tRNA complement(5826..5891) + /gene="TRNY" + /gene_synonym="MTTY" + /product="tRNA-Tyr" + /note="NAR: 1997" + /anticodon=(pos:complement(5860..5862),aa:Tyr,seq:gta) + /codon_recognized="UAC" + /db_xref="GeneID:4579" + /db_xref="HGNC:HGNC:7502" + /db_xref="MIM:590100" + gene 
5904..7445 + /gene="COX1" + /gene_synonym="COI; MTCO1" + /nomenclature="Official Symbol: MT-CO1 | Name: + mitochondrially encoded cytochrome c oxidase I | Provided + by: HGNC:HGNC:7419" + /db_xref="GeneID:4512" + /db_xref="HGNC:HGNC:7419" + /db_xref="MIM:516030" + CDS 5904..7445 + /gene="COX1" + /gene_synonym="COI; MTCO1" + /note="cytochrome c oxidase I" + /codon_start=1 + /transl_table=2 + /product="cytochrome c oxidase subunit I" + /protein_id="YP_003024028.1" + /db_xref="GeneID:4512" + /db_xref="HGNC:HGNC:7419" + /db_xref="MIM:516030" + /translation="MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQP + GNLLGNDHIYNVIVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSF + WLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPGASVDLTIFSLHLAGVSSILG + AINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTT + FFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWA + MMSIGFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKW + SAAVLWALGFIFLFTVGGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGG + FIHWFPLFSGYTLDQTYAKIHFTIMFIGVNLTFFPQHFLGLSGMPRRYSDYPDAYTTW + NILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSMNLEWLYGCPPPYHTFEEP + VYMKS" + gene complement(7446..7514) + /gene="TRNS1" + /gene_synonym="MTTS1" + /nomenclature="Official Symbol: MT-TS1 | Name: + mitochondrially encoded tRNA serine 1 (UCN)" + /db_xref="GeneID:4574" + /db_xref="HGNC:HGNC:7497" + /db_xref="MIM:590080" + tRNA complement(7446..7514) + /gene="TRNS1" + /gene_synonym="MTTS1" + /product="tRNA-Ser" + /note="NAR: 1697" + /anticodon=(pos:complement(7482..7484),aa:Ser,seq:tga) + /codon_recognized="UCN" + /db_xref="GeneID:4574" + /db_xref="HGNC:HGNC:7497" + /db_xref="MIM:590080" + gene 7518..7585 + /gene="TRND" + /gene_synonym="MTTD" + /nomenclature="Official Symbol: MT-TD | Name: + mitochondrially encoded tRNA aspartic acid | Provided by: + HGNC:HGNC:7478" + /db_xref="GeneID:4555" + /db_xref="HGNC:HGNC:7478" + /db_xref="MIM:590015" + tRNA 7518..7585 + /gene="TRND" + /gene_synonym="MTTD" + /product="tRNA-Asp" + /note="NAR: 0397" + /anticodon=(pos:7548..7550,aa:Asp,seq:gtc) + /codon_recognized="GAC" + /db_xref="GeneID:4555" + /db_xref="HGNC:HGNC:7478" + /db_xref="MIM:590015" + gene 7586..8269 + /gene="COX2" + /gene_synonym="COII; MTCO2" + /nomenclature="Official Symbol: MT-CO2 | Name: + mitochondrially encoded cytochrome c oxidase II | Provided + by: HGNC:HGNC:7421" + /db_xref="GeneID:4513" + /db_xref="HGNC:HGNC:7421" + /db_xref="MIM:516040" + CDS 7586..8269 + /gene="COX2" + /gene_synonym="COII; MTCO2" + /note="cytochrome c oxidase II" + /codon_start=1 + /transl_table=2 + /product="cytochrome c oxidase subunit II" + /protein_id="YP_003024029.1" + /db_xref="GeneID:4513" + /db_xref="HGNC:HGNC:7421" + /db_xref="MIM:516040" + /translation="MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFL + TLTTKLTNTNISDAQEMETVWTILPAIILVLIALPSLRILYMTDEVNDPSLTIKSIGH + QWYWTYEYTDYGGLIFNSYMLPPLFLEPGDLRLLDVDNRVVLPIEAPIRMMITSQDVL + HSWAVPTLGLKTDAIPGRLNQTTFTATRPGVYYGQCSEICGANHSFMPIVLELIPLKI + FEMGPVFTL" + gene 8295..8364 + /gene="TRNK" + /gene_synonym="MTTK" + /nomenclature="Official Symbol: MT-TK | Name: + mitochondrially encoded tRNA lysine | Provided by: + HGNC:HGNC:7489" + /db_xref="GeneID:4566" + /db_xref="HGNC:HGNC:7489" + /db_xref="MIM:590060" + tRNA 8295..8364 + /gene="TRNK" + /gene_synonym="MTTK" + /product="tRNA-Lys" + /note="NAR: 1197" + /anticodon=(pos:8323..8325,aa:Lys,seq:ttt) + /codon_recognized="AAA" + /db_xref="GeneID:4566" + /db_xref="HGNC:HGNC:7489" + /db_xref="MIM:590060" + gene 8366..8572 + /gene="ATP8" + /gene_synonym="ATPase8; MTATP8" + /nomenclature="Official Symbol: MT-ATP8 | Name: + 
mitochondrially encoded ATP synthase 8 | Provided by: + HGNC:HGNC:7415" + /db_xref="GeneID:4509" + /db_xref="HGNC:HGNC:7415" + /db_xref="MIM:516070" + CDS 8366..8572 + /gene="ATP8" + /gene_synonym="ATPase8; MTATP8" + /note="ATP synthase 8; ATPase subunit 8" + /codon_start=1 + /transl_table=2 + /product="ATP synthase F0 subunit 8" + /protein_id="YP_003024030.1" + /db_xref="GeneID:4509" + /db_xref="HGNC:HGNC:7415" + /db_xref="MIM:516070" + /translation="MPQLNTTVWPTMITPMLLTLFLITQLKMLNTNYHLPPSPKPMKM + KNYNKPWEPKWTKICSLHSLPPQS" + gene 8527..9207 + /gene="ATP6" + /gene_synonym="ATPase6; MTATP6" + /nomenclature="Official Symbol: MT-ATP6 | Name: + mitochondrially encoded ATP synthase 6 | Provided by: + HGNC:HGNC:7414" + /db_xref="GeneID:4508" + /db_xref="HGNC:HGNC:7414" + /db_xref="MIM:516060" + CDS 8527..9207 + /gene="ATP6" + /gene_synonym="ATPase6; MTATP6" + /note="ATP synthase 6; ATPase subunit 6" + /codon_start=1 + /transl_table=2 + /product="ATP synthase F0 subunit 6" + /protein_id="YP_003024031.1" + /db_xref="GeneID:4508" + /db_xref="HGNC:HGNC:7414" + /db_xref="MIM:516060" + /translation="MNENLFASFIAPTILGLPAAVLIILFPPLLIPTSKYLINNRLIT + TQQWLIKLTSKQMMTMHNTKGRTWSLMLVSLIIFIATTNLLGLLPHSFTPTTQLSMNL + AMAIPLWAGTVIMGFRSKIKNALAHFLPQGTPTPLIPMLVIIETISLLIQPMALAVRL + TANITAGHLLMHLIGSATLAMSTINLPSTLIIFTILILLTILEIAVALIQAYVFTLLV + SLYLHDNT" + gene 9207..9990 + /gene="COX3" + /gene_synonym="COIII; MTCO3" + /nomenclature="Official Symbol: MT-CO3 | Name: + mitochondrially encoded cytochrome c oxidase III | + Provided by: HGNC:HGNC:7422" + /db_xref="GeneID:4514" + /db_xref="HGNC:HGNC:7422" + /db_xref="MIM:516050" + CDS 9207..9990 + /gene="COX3" + /gene_synonym="COIII; MTCO3" + /note="cytochrome c oxidase III; TAA stop codon is + completed by the addition of 3' A residues to the mRNA" + /codon_start=1 + /transl_except=(pos:9990,aa:TERM) + /transl_table=2 + /product="cytochrome c oxidase subunit III" + /protein_id="YP_003024032.1" + /db_xref="GeneID:4514" + /db_xref="HGNC:HGNC:7422" + /db_xref="MIM:516050" + /translation="MTHQSHAYHMVKPSPWPLTGALSALLMTSGLAMWFHFHSMTLLM + LGLLTNTLTMYQWWRDVTRESTYQGHHTPPVQKGLRYGMILFITSEVFFFAGFFWAFY + HSSLAPTPQLGGHWPPTGITPLNPLEVPLLNTSVLLASGVSITWAHHSLMENNRNQMI + QALLITILLGLYFTLLQASEYFESPFTISDGIYGSTFFVATGFHGLHVIIGSTFLTIC + FIRQLMFHFTSKHHFGFEAAAWYWHFVDVVWLFLYVSIYWWGS" + gene 9991..10058 + /gene="TRNG" + /gene_synonym="MTTG" + /nomenclature="Official Symbol: MT-TG | Name: + mitochondrially encoded tRNA glycine | Provided by: + HGNC:HGNC:7486" + /db_xref="GeneID:4563" + /db_xref="HGNC:HGNC:7486" + /db_xref="MIM:590035" + tRNA 9991..10058 + /gene="TRNG" + /gene_synonym="MTTG" + /product="tRNA-Gly" + /note="NAR: 0797" + /anticodon=(pos:10021..10023,aa:Gly,seq:tcc) + /codon_recognized="GGA" + /db_xref="GeneID:4563" + /db_xref="HGNC:HGNC:7486" + /db_xref="MIM:590035" + gene 10059..10404 + /gene="ND3" + /gene_synonym="MTND3" + /nomenclature="Official Symbol: MT-ND3 | Name: + mitochondrially encoded NADH dehydrogenase 3 | Provided + by: HGNC:HGNC:7458" + /db_xref="GeneID:4537" + /db_xref="HGNC:HGNC:7458" + /db_xref="MIM:516002" + CDS 10059..10404 + /gene="ND3" + /gene_synonym="MTND3" + /note="NADH dehydrogenase, subunit 3 (complex I); TAA stop + codon is completed by the addition of 3' A residues to the + mRNA" + /codon_start=1 + /transl_except=(pos:10404,aa:TERM) + /transl_table=2 + /product="NADH dehydrogenase subunit 3" + /protein_id="YP_003024033.1" + /db_xref="GeneID:4537" + /db_xref="HGNC:HGNC:7458" + /db_xref="MIM:516002" + /translation="MNFALILMINTLLALLLMIITFWLPQLNGYMEKSTPYECGFDPM + 
SPARVPFSMKFFLVAITFLLFDLEIALLLPLPWALQTTNLPLMVMSSLLLIIILALSL + AYEWLQKGLDWTE" + gene 10405..10469 + /gene="TRNR" + /gene_synonym="MTTR" + /nomenclature="Official Symbol: MT-TR | Name: + mitochondrially encoded tRNA arginine | Provided by: + HGNC:HGNC:7496" + /db_xref="GeneID:4573" + /db_xref="HGNC:HGNC:7496" + /db_xref="MIM:590005" + tRNA 10405..10469 + /gene="TRNR" + /gene_synonym="MTTR" + /product="tRNA-Arg" + /note="NAR: 0197" + /anticodon=(pos:10435..10437,aa:Arg,seq:tcg) + /codon_recognized="CGA" + /db_xref="GeneID:4573" + /db_xref="HGNC:HGNC:7496" + /db_xref="MIM:590005" + gene 10470..10766 + /gene="ND4L" + /gene_synonym="MTND4L" + /nomenclature="Official Symbol: MT-ND4L | Name: + mitochondrially encoded NADH 4L dehydrogenase | Provided + by: HGNC:HGNC:7460" + /db_xref="GeneID:4539" + /db_xref="HGNC:HGNC:7460" + /db_xref="MIM:516004" + CDS 10470..10766 + /gene="ND4L" + /gene_synonym="MTND4L" + /note="NADH dehydrogenase, subunit 4L (complex I)" + /codon_start=1 + /transl_table=2 + /product="NADH dehydrogenase subunit 4L" + /protein_id="YP_003024034.1" + /db_xref="GeneID:4539" + /db_xref="HGNC:HGNC:7460" + /db_xref="MIM:516004" + /translation="MPLIYMNIMLAFTISLLGMLVYRSHLMSSLLCLEGMMLSLFIMA + TLMTLNTHSLLANIVPIAMLVFAACEAAVGLALLVSISNTYGLDYVHNLNLLQC" + gene 10760..12137 + /gene="ND4" + /gene_synonym="MTND4" + /nomenclature="Official Symbol: MT-ND4 | Name: + mitochondrially encoded NADH dehydrogenase 4 | Provided + by: HGNC:HGNC:7459" + /db_xref="GeneID:4538" + /db_xref="HGNC:HGNC:7459" + /db_xref="MIM:516003" + CDS 10760..12137 + /gene="ND4" + /gene_synonym="MTND4" + /note="NADH dehydrogenase, subunit 4 (complex I); TAA stop + codon is completed by the addition of 3' A residues to the + mRNA" + /codon_start=1 + /transl_except=(pos:12137,aa:TERM) + /transl_table=2 + /product="NADH dehydrogenase subunit 4" + /protein_id="YP_003024035.1" + /db_xref="GeneID:4538" + /db_xref="HGNC:HGNC:7459" + /db_xref="MIM:516003" + /translation="MLKLIVPTIMLLPLTWLSKKHMIWINTTTHSLIISIIPLLFFNQ + INNNLFSCSPTFSSDPLTTPLLMLTTWLLPLTIMASQRHLSSEPLSRKKLYLSMLISL + QISLIMTFTATELIMFYIFFETTLIPTLAIITRWGNQPERLNAGTYFLFYTLVGSLPL + LIALIYTHNTLGSLNILLLTLTAQELSNSWANNLMWLAYTMAFMVKMPLYGLHLWLPK + AHVEAPIAGSMVLAAVLLKLGGYGMMRLTLILNPLTKHMAYPFLVLSLWGMIMTSSIC + LRQTDLKSLIAYSSISHMALVVTAILIQTPWSFTGAVILMIAHGLTSSLLFCLANSNY + ERTHSRIMILSQGLQTLLPLMAFWWLLASLANLALPPTINLLGELSVLVTTFSWSNIT + LLLTGLNMLVTALYSLYMFTTTQWGSLTHHINNMKPSFTRENTLMFMHLSPILLLSLN + PDIITGFSS" + gene 12138..12206 + /gene="TRNH" + /gene_synonym="MTTH" + /nomenclature="Official Symbol: MT-TH | Name: + mitochondrially encoded tRNA histidine | Provided by: + HGNC:HGNC:7487" + /db_xref="GeneID:4564" + /db_xref="HGNC:HGNC:7487" + /db_xref="MIM:590040" + tRNA 12138..12206 + /gene="TRNH" + /gene_synonym="MTTH" + /product="tRNA-His" + /note="NAR: 0897" + /anticodon=(pos:12168..12170,aa:His,seq:gtg) + /codon_recognized="CAC" + /db_xref="GeneID:4564" + /db_xref="HGNC:HGNC:7487" + /db_xref="MIM:590040" + gene 12207..12265 + /gene="TRNS2" + /gene_synonym="MTTS2" + /nomenclature="Official Symbol: MT-TS2 | Name: + mitochondrially encoded tRNA serine 2 (AGU/C) | Provided + by: HGNC:HGNC:7498" + /db_xref="GeneID:4575" + /db_xref="HGNC:HGNC:7498" + /db_xref="MIM:590085" + tRNA 12207..12265 + /gene="TRNS2" + /gene_synonym="MTTS2" + /product="tRNA-Ser" + /note="NAR: 1656" + /anticodon=(pos:12226..12228,aa:Ser,seq:gct) + /codon_recognized="AGY" + /db_xref="GeneID:4575" + /db_xref="HGNC:HGNC:7498" + /db_xref="MIM:590085" + gene 12266..12336 + /gene="TRNL2" + /gene_synonym="MTTL2" + /nomenclature="Official 
Symbol: MT-TL2 | Name: + mitochondrially encoded tRNA leucine 2 (CUN) | Provided + by: HGNC:HGNC:7491" + /db_xref="GeneID:4568" + /db_xref="HGNC:HGNC:7491" + /db_xref="MIM:590055" + tRNA 12266..12336 + /gene="TRNL2" + /gene_synonym="MTTL2" + /product="tRNA-Leu" + /note="NAR: 1097" + /anticodon=(pos:12298..12300,aa:Leu,seq:tag) + /codon_recognized="CUN" + /db_xref="GeneID:4568" + /db_xref="HGNC:HGNC:7491" + /db_xref="MIM:590055" + gene 12337..14148 + /gene="ND5" + /gene_synonym="MTND5" + /nomenclature="Official Symbol: MT-ND5 | Name: + mitochondrially encoded NADH dehydrogenase 5 | Provided + by: HGNC:HGNC:7461" + /db_xref="GeneID:4540" + /db_xref="HGNC:HGNC:7461" + /db_xref="MIM:516005" + CDS 12337..14148 + /gene="ND5" + /gene_synonym="MTND5" + /note="NADH dehydrogenase, subunit 5 (complex I)" + /codon_start=1 + /transl_table=2 + /product="NADH dehydrogenase subunit 5" + /protein_id="YP_003024036.1" + /db_xref="GeneID:4540" + /db_xref="HGNC:HGNC:7461" + /db_xref="MIM:516005" + /translation="MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTF + IISLFPTTMFMCLDQEVIISNWHWATTQTTQLSLSFKLDYFSMMFIPVALFVTWSIME + FSLWYMNSDPNINQFFKYLLIFLITMLILVTANNLFQLFIGWEGVGIMSFLLISWWYA + RADANTAAIQAILYNRIGDIGFILALAWFILHSNSWDPQQMALLNANPSLTPLLGLLL + AAAGKSAQLGLHPWLPSAMEGPTPVSALLHSSTMVVAGIFLLIRFHPLAENSPLIQTL + TLCLGAITTLFAAVCALTQNDIKKIVAFSTSSQLGLMMVTIGINQPHLAFLHICTHAF + FKAMLFMCSGSIIHNLNNEQDIRKMGGLLKTMPLTSTSLTIGSLALAGMPFLTGFYSK + DHIIETANMSYTNAWALSITLIATSLTSAYSTRMILLTLTGQPRFPTLTNINENNPTL + LNPIKRLAAGSLFAGFLITNNISPASPFQTTIPLYLKLTALAVTFLGLLTALDLNYLT + NKLKMKSPLCTFYFSNMLGFYPSITHRTIPYLGLLTSQNLPLLLLDLTWLEKLLPKTI + SQHQISTSIITSTQKGMIKLYFLSFFFPLILTLLLIT" + gene complement(14149..14673) + /gene="ND6" + /gene_synonym="MTND6" + /nomenclature="Official Symbol: MT-ND6 | Name: + mitochondrially encoded NADH dehydrogenase 6 | Provided + by: HGNC:HGNC:7462" + /db_xref="GeneID:4541" + /db_xref="HGNC:HGNC:7462" + /db_xref="MIM:516006" + CDS complement(14149..14673) + /gene="ND6" + /gene_synonym="MTND6" + /note="NADH dehydrogenase, subunit 6 (complex I)" + /codon_start=1 + /transl_table=2 + /product="NADH dehydrogenase subunit 6" + /protein_id="YP_003024037.1" + /db_xref="GeneID:4541" + /db_xref="HGNC:HGNC:7462" + /db_xref="MIM:516006" + /translation="MMYALFLLSVGLVMGFVGFSSKPSPIYGGLVLIVSGVVGCVIIL + NFGGGYMGLMVFLIYLGGMMVVFGYTTAMAIEEYPEAWGSGVEVLVSVLVGLAMEVGL + VLWVKEYDGVVVVVNFNSVGSWMIYEGEGSGLIREDPIGAGALYDYGRWLVVVTGWTL + FVGVYIVIEIARGN" + gene complement(14674..14742) + /gene="TRNE" + /gene_synonym="MTTE" + /nomenclature="Official Symbol: MT-TE | Name: + mitochondrially encoded tRNA glutamic acid | Provided by: + HGNC:HGNC:7479" + /db_xref="GeneID:4556" + /db_xref="HGNC:HGNC:7479" + /db_xref="MIM:590025" + tRNA complement(14674..14742) + /gene="TRNE" + /gene_synonym="MTTE" + /product="tRNA-Glu" + /note="NAR: 0697" + /anticodon=(pos:complement(14710..14712),aa:Glu,seq:ttc) + /codon_recognized="GAA" + /db_xref="GeneID:4556" + /db_xref="HGNC:HGNC:7479" + /db_xref="MIM:590025" + gene 14747..15887 + /gene="CYTB" + /gene_synonym="MTCYB" + /nomenclature="Official Symbol: MT-CYB | Name: + mitochondrially encoded cytochrome b | Provided by: + HGNC:HGNC:7427" + /db_xref="GeneID:4519" + /db_xref="HGNC:HGNC:7427" + /db_xref="MIM:516020" + CDS 14747..15887 + /gene="CYTB" + /gene_synonym="MTCYB" + /note="TAA stop codon is completed by the addition of 3' A + residues to the mRNA" + /codon_start=1 + /transl_except=(pos:15887,aa:TERM) + /transl_table=2 + /product="cytochrome b" + /protein_id="YP_003024038.1" + /db_xref="GeneID:4519" + /db_xref="HGNC:HGNC:7427" + 
/db_xref="MIM:516020" + /translation="MTPMRKTNPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQ + ITTGLFLAMHYSPDASTAFSSIAHITRDVNYGWIIRYLHANGASMFFICLFLHIGRGL + YYGSFLYSETWNIGIILLLATMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTDL + VQWIWGGYSVDSPTLTRFFTFHFILPFIIAALATLHLLFLHETGSNNPLGITSHSDKI + TFHPYYTIKDALGLLLFLLSLMTLTLFSPDLLGDPDNYTLANPLNTPPHIKPEWYFLF + AYTILRSVPNKLGGVLALLLSILILAMIPILHMSKQQSMMFRPLSQSLYWLLAADLLI + LTWIGGQPVSYPFTIIGQVASVLYFTTILILMPTISLIENKMLKWA" + gene 15888..15953 + /gene="TRNT" + /gene_synonym="MTTT" + /nomenclature="Official Symbol: MT-TT | Name: + mitochondrially encoded tRNA threonine | Provided by: + HGNC:HGNC:7499" + /db_xref="GeneID:4576" + /db_xref="HGNC:HGNC:7499" + /db_xref="MIM:590090" + tRNA 15888..15953 + /gene="TRNT" + /gene_synonym="MTTT" + /product="tRNA-Thr" + /note="NAR: 1797" + /anticodon=(pos:15919..15921,aa:Thr,seq:tgt) + /codon_recognized="ACA" + /db_xref="GeneID:4576" + /db_xref="HGNC:HGNC:7499" + /db_xref="MIM:590090" + gene complement(15956..16023) + /gene="TRNP" + /gene_synonym="MTTP" + /nomenclature="Official Symbol: MT-TP | Name: + mitochondrially encoded tRNA proline | Provided by: + HGNC:HGNC:7494" + /db_xref="GeneID:4571" + /db_xref="HGNC:HGNC:7494" + /db_xref="MIM:590075" + tRNA complement(15956..16023) + /gene="TRNP" + /gene_synonym="MTTP" + /product="tRNA-Pro" + /note="NAR: 1597" + /anticodon=(pos:complement(15990..15992),aa:Pro,seq:tgg) + /codon_recognized="CCA" + /db_xref="GeneID:4571" + /db_xref="HGNC:HGNC:7494" + /db_xref="MIM:590075" +ORIGIN + 1 gatcacaggt ctatcaccct attaaccact cacgggagct ctccatgcat ttggtatttt + 61 cgtctggggg gtatgcacgc gatagcattg cgagacgctg gagccggagc accctatgtc + 121 gcagtatctg tctttgattc ctgcctcatc ctattattta tcgcacctac gttcaatatt + 181 acaggcgaac atacttacta aagtgtgtta attaattaat gcttgtagga cataataata + 241 acaattgaat gtctgcacag ccactttcca cacagacatc ataacaaaaa atttccacca + 301 aaccccccct cccccgcttc tggccacagc acttaaacac atctctgcca aaccccaaaa + 361 acaaagaacc ctaacaccag cctaaccaga tttcaaattt tatcttttgg cggtatgcac + 421 ttttaacagt caccccccaa ctaacacatt attttcccct cccactccca tactactaat + 481 ctcatcaata caacccccgc ccatcctacc cagcacacac acaccgctgc taaccccata + 541 ccccgaacca accaaacccc aaagacaccc cccacagttt atgtagctta cctcctcaaa + 601 gcaatacact gaaaatgttt agacgggctc acatcacccc ataaacaaat aggtttggtc + 661 ctagcctttc tattagctct tagtaagatt acacatgcaa gcatccccgt tccagtgagt + 721 tcaccctcta aatcaccacg atcaaaagga acaagcatca agcacgcagc aatgcagctc + 781 aaaacgctta gcctagccac acccccacgg gaaacagcag tgattaacct ttagcaataa + 841 acgaaagttt aactaagcta tactaacccc agggttggtc aatttcgtgc cagccaccgc + 901 ggtcacacga ttaacccaag tcaatagaag ccggcgtaaa gagtgtttta gatcaccccc + 961 tccccaataa agctaaaact cacctgagtt gtaaaaaact ccagttgaca caaaatagac + 1021 tacgaaagtg gctttaacat atctgaacac acaatagcta agacccaaac tgggattaga + 1081 taccccacta tgcttagccc taaacctcaa cagttaaatc aacaaaactg ctcgccagaa + 1141 cactacgagc cacagcttaa aactcaaagg acctggcggt gcttcatatc cctctagagg + 1201 agcctgttct gtaatcgata aaccccgatc aacctcacca cctcttgctc agcctatata + 1261 ccgccatctt cagcaaaccc tgatgaaggc tacaaagtaa gcgcaagtac ccacgtaaag + 1321 acgttaggtc aaggtgtagc ccatgaggtg gcaagaaatg ggctacattt tctaccccag + 1381 aaaactacga tagcccttat gaaacttaag ggtcgaaggt ggatttagca gtaaactaag + 1441 agtagagtgc ttagttgaac agggccctga agcgcgtaca caccgcccgt caccctcctc + 1501 aagtatactt caaaggacat ttaactaaaa cccctacgca tttatataga ggagacaagt + 1561 cgtaacatgg taagtgtact ggaaagtgca cttggacgaa ccagagtgta gcttaacaca + 1621 aagcacccaa cttacactta ggagatttca acttaacttg accgctctga gctaaaccta + 1681 gccccaaacc cactccacct 
tactaccaga caaccttagc caaaccattt acccaaataa + 1741 agtataggcg atagaaattg aaacctggcg caatagatat agtaccgcaa gggaaagatg + 1801 aaaaattata accaagcata atatagcaag gactaacccc tataccttct gcataatgaa + 1861 ttaactagaa ataactttgc aaggagagcc aaagctaaga cccccgaaac cagacgagct + 1921 acctaagaac agctaaaaga gcacacccgt ctatgtagca aaatagtggg aagatttata + 1981 ggtagaggcg acaaacctac cgagcctggt gatagctggt tgtccaagat agaatcttag + 2041 ttcaacttta aatttgccca cagaaccctc taaatcccct tgtaaattta actgttagtc + 2101 caaagaggaa cagctctttg gacactagga aaaaaccttg tagagagagt aaaaaattta + 2161 acacccatag taggcctaaa agcagccacc aattaagaaa gcgttcaagc tcaacaccca + 2221 ctacctaaaa aatcccaaac atataactga actcctcaca cccaattgga ccaatctatc + 2281 accctataga agaactaatg ttagtataag taacatgaaa acattctcct ccgcataagc + 2341 ctgcgtcaga ttaaaacact gaactgacaa ttaacagccc aatatctaca atcaaccaac + 2401 aagtcattat taccctcact gtcaacccaa cacaggcatg ctcataagga aaggttaaaa + 2461 aaagtaaaag gaactcggca aatcttaccc cgcctgttta ccaaaaacat cacctctagc + 2521 atcaccagta ttagaggcac cgcctgccca gtgacacatg tttaacggcc gcggtaccct + 2581 aaccgtgcaa aggtagcata atcacttgtt ccttaaatag ggacctgtat gaatggctcc + 2641 acgagggttc agctgtctct tacttttaac cagtgaaatt gacctgcccg tgaagaggcg + 2701 ggcataacac agcaagacga gaagacccta tggagcttta atttattaat gcaaacagta + 2761 cctaacaaac ccacaggtcc taaactacca aacctgcatt aaaaatttcg gttggggcga + 2821 cctcggagca gaacccaacc tccgagcagt acatgctaag acttcaccag tcaaagcgaa + 2881 ctactatact caattgatcc aataacttga ccaacggaac aagttaccct agggataaca + 2941 gcgcaatcct attctagagt ccatatcaac aatagggttt acgacctcga tgttggatca + 3001 ggacatcccg atggtgcagc cgctattaaa ggttcgtttg ttcaacgatt aaagtcctac + 3061 gtgatctgag ttcagaccgg agtaatccag gtcggtttct atctacnttc aaattcctcc + 3121 ctgtacgaaa ggacaagaga aataaggcct acttcacaaa gcgccttccc ccgtaaatga + 3181 tatcatctca acttagtatt atacccacac ccacccaaga acagggtttg ttaagatggc + 3241 agagcccggt aatcgcataa aacttaaaac tttacagtca gaggttcaat tcctcttctt + 3301 aacaacatac ccatggccaa cctcctactc ctcattgtac ccattctaat cgcaatggca + 3361 ttcctaatgc ttaccgaacg aaaaattcta ggctatatac aactacgcaa aggccccaac + 3421 gttgtaggcc cctacgggct actacaaccc ttcgctgacg ccataaaact cttcaccaaa + 3481 gagcccctaa aacccgccac atctaccatc accctctaca tcaccgcccc gaccttagct + 3541 ctcaccatcg ctcttctact atgaaccccc ctccccatac ccaaccccct ggtcaacctc + 3601 aacctaggcc tcctatttat tctagccacc tctagcctag ccgtttactc aatcctctga + 3661 tcagggtgag catcaaactc aaactacgcc ctgatcggcg cactgcgagc agtagcccaa + 3721 acaatctcat atgaagtcac cctagccatc attctactat caacattact aataagtggc + 3781 tcctttaacc tctccaccct tatcacaaca caagaacacc tctgattact cctgccatca + 3841 tgacccttgg ccataatatg atttatctcc acactagcag agaccaaccg aacccccttc + 3901 gaccttgccg aaggggagtc cgaactagtc tcaggcttca acatcgaata cgccgcaggc + 3961 cccttcgccc tattcttcat agccgaatac acaaacatta ttataataaa caccctcacc + 4021 actacaatct tcctaggaac aacatatgac gcactctccc ctgaactcta cacaacatat + 4081 tttgtcacca agaccctact tctaacctcc ctgttcttat gaattcgaac agcatacccc + 4141 cgattccgct acgaccaact catacacctc ctatgaaaaa acttcctacc actcacccta + 4201 gcattactta tatgatatgt ctccataccc attacaatct ccagcattcc ccctcaaacc + 4261 taagaaatat gtctgataaa agagttactt tgatagagta aataatagga gcttaaaccc + 4321 ccttatttct aggactatga gaatcgaacc catccctgag aatccaaaat tctccgtgcc + 4381 acctatcaca ccccatccta aagtaaggtc agctaaataa gctatcgggc ccataccccg + 4441 aaaatgttgg ttataccctt cccgtactaa ttaatcccct ggcccaaccc gtcatctact + 4501 ctaccatctt tgcaggcaca ctcatcacag cgctaagctc gcactgattt tttacctgag + 4561 taggcctaga aataaacatg ctagctttta ttccagttct aaccaaaaaa ataaaccctc + 4621 
gttccacaga agctgccatc aagtatttcc tcacgcaagc aaccgcatcc ataatccttc + 4681 taatagctat cctcttcaac aatatactct ccggacaatg aaccataacc aatactacca + 4741 atcaatactc atcattaata atcataatag ctatagcaat aaaactagga atagccccct + 4801 ttcacttctg agtcccagag gttacccaag gcacccctct gacatccggc ctgcttcttc + 4861 tcacatgaca aaaactagcc cccatctcaa tcatatacca aatctctccc tcactaaacg + 4921 taagccttct cctcactctc tcaatcttat ccatcatagc aggcagttga ggtggattaa + 4981 accaaaccca gctacgcaaa atcttagcat actcctcaat tacccacata ggatgaataa + 5041 tagcagttct accgtacaac cctaacataa ccattcttaa tttaactatt tatattatcc + 5101 taactactac cgcattccta ctactcaact taaactccag caccacgacc ctactactat + 5161 ctcgcacctg aaacaagcta acatgactaa cacccttaat tccatccacc ctcctctccc + 5221 taggaggcct gcccccgcta accggctttt tgcccaaatg ggccattatc gaagaattca + 5281 caaaaaacaa tagcctcatc atccccacca tcatagccac catcaccctc cttaacctct + 5341 acttctacct acgcctaatc tactccacct caatcacact actccccata tctaacaacg + 5401 taaaaataaa atgacagttt gaacatacaa aacccacccc attcctcccc acactcatcg + 5461 cccttaccac gctactccta cctatctccc cttttatact aataatctta tagaaattta + 5521 ggttaaatac agaccaagag ccttcaaagc cctcagtaag ttgcaatact taatttctgt + 5581 aacagctaag gactgcaaaa ccccactctg catcaactga acgcaaatca gccactttaa + 5641 ttaagctaag cccttactag accaatggga cttaaaccca caaacactta gttaacagct + 5701 aagcacccta atcaactggc ttcaatctac ttctcccgcc gccgggaaaa aaggcgggag + 5761 aagccccggc aggtttgaag ctgcttcttc gaatttgcaa ttcaatatga aaatcacctc + 5821 ggagctggta aaaagaggcc taacccctgt ctttagattt acagtccaat gcttcactca + 5881 gccattttac ctcaccccca ctgatgttcg ccgaccgttg actattctct acaaaccaca + 5941 aagacattgg aacactatac ctattattcg gcgcatgagc tggagtccta ggcacagctc + 6001 taagcctcct tattcgagcc gagctgggcc agccaggcaa ccttctaggt aacgaccaca + 6061 tctacaacgt tatcgtcaca gcccatgcat ttgtaataat cttcttcata gtaataccca + 6121 tcataatcgg aggctttggc aactgactag ttcccctaat aatcggtgcc cccgatatgg + 6181 cgtttccccg cataaacaac ataagcttct gactcttacc tccctctctc ctactcctgc + 6241 tcgcatctgc tatagtggag gccggagcag gaacaggttg aacagtctac cctcccttag + 6301 cagggaacta ctcccaccct ggagcctccg tagacctaac catcttctcc ttacacctag + 6361 caggtgtctc ctctatctta ggggccatca atttcatcac aacaattatc aatataaaac + 6421 cccctgccat aacccaatac caaacgcccc tcttcgtctg atccgtccta atcacagcag + 6481 tcctacttct cctatctctc ccagtcctag ctgctggcat cactatacta ctaacagacc + 6541 gcaacctcaa caccaccttc ttcgaccccg ccggaggagg agaccccatt ctataccaac + 6601 acctattctg atttttcggt caccctgaag tttatattct tatcctacca ggcttcggaa + 6661 taatctccca tattgtaact tactactccg gaaaaaaaga accatttgga tacataggta + 6721 tggtctgagc tatgatatca attggcttcc tagggtttat cgtgtgagca caccatatat + 6781 ttacagtagg aatagacgta gacacacgag catatttcac ctccgctacc ataatcatcg + 6841 ctatccccac cggcgtcaaa gtatttagct gactcgccac actccacgga agcaatatga + 6901 aatgatctgc tgcagtgctc tgagccctag gattcatctt tcttttcacc gtaggtggcc + 6961 tgactggcat tgtattagca aactcatcac tagacatcgt actacacgac acgtactacg + 7021 ttgtagccca cttccactat gtcctatcaa taggagctgt atttgccatc ataggaggct + 7081 tcattcactg atttccccta ttctcaggct acaccctaga ccaaacctac gccaaaatcc + 7141 atttcactat catattcatc ggcgtaaatc taactttctt cccacaacac tttctcggcc + 7201 tatccggaat gccccgacgt tactcggact accccgatgc atacaccaca tgaaacatcc + 7261 tatcatctgt aggctcattc atttctctaa cagcagtaat attaataatt ttcatgattt + 7321 gagaagcctt cgcttcgaag cgaaaagtcc taatagtaga agaaccctcc ataaacctgg + 7381 agtgactata tggatgcccc ccaccctacc acacattcga agaacccgta tacataaaat + 7441 ctagacaaaa aaggaaggaa tcgaaccccc caaagctggt ttcaagccaa ccccatggcc + 7501 tccatgactt tttcaaaaag gtattagaaa aaccatttca 
taactttgtc aaagttaaat + 7561 tataggctaa atcctatata tcttaatggc acatgcagcg caagtaggtc tacaagacgc + 7621 tacttcccct atcatagaag agcttatcac ctttcatgat cacgccctca taatcatttt + 7681 ccttatctgc ttcctagtcc tgtatgccct tttcctaaca ctcacaacaa aactaactaa + 7741 tactaacatc tcagacgctc aggaaataga aaccgtctga actatcctgc ccgccatcat + 7801 cctagtcctc atcgccctcc catccctacg catcctttac ataacagacg aggtcaacga + 7861 tccctccctt accatcaaat caattggcca ccaatggtac tgaacctacg agtacaccga + 7921 ctacggcgga ctaatcttca actcctacat acttccccca ttattcctag aaccaggcga + 7981 cctgcgactc cttgacgttg acaatcgagt agtactcccg attgaagccc ccattcgtat + 8041 aataattaca tcacaagacg tcttgcactc atgagctgtc cccacattag gcttaaaaac + 8101 agatgcaatt cccggacgtc taaaccaaac cactttcacc gctacacgac cgggggtata + 8161 ctacggtcaa tgctctgaaa tctgtggagc aaaccacagt ttcatgccca tcgtcctaga + 8221 attaattccc ctaaaaatct ttgaaatagg gcccgtattt accctatagc accccctcta + 8281 ccccctctag agcccactgt aaagctaact tagcattaac cttttaagtt aaagattaag + 8341 agaaccaaca cctctttaca gtgaaatgcc ccaactaaat actaccgtat ggcccaccat + 8401 aattaccccc atactcctta cactattcct catcacccaa ctaaaaatat taaacacaaa + 8461 ctaccaccta cctccctcac caaagcccat aaaaataaaa aattataaca aaccctgaga + 8521 accaaaatga acgaaaatct gttcgcttca ttcattgccc ccacaatcct aggcctaccc + 8581 gccgcagtac tgatcattct atttccccct ctattgatcc ccacctccaa atatctcatc + 8641 aacaaccgac taatcaccac ccaacaatga ctaatcaaac taacctcaaa acaaatgata + 8701 accatacaca acactaaagg acgaacctga tctcttatac tagtatcctt aatcattttt + 8761 attgccacaa ctaacctcct cggactcctg cctcactcat ttacaccaac cacccaacta + 8821 tctataaacc tagccatggc catcccctta tgagcgggca cagtgattat aggctttcgc + 8881 tctaagatta aaaatgccct agcccacttc ttaccacaag gcacacctac accccttatc + 8941 cccatactag ttattatcga aaccatcagc ctactcattc aaccaatagc cctggccgta + 9001 cgcctaaccg ctaacattac tgcaggccac ctactcatgc acctaattgg aagcgccacc + 9061 ctagcaatat caaccattaa ccttccctct acacttatca tcttcacaat tctaattcta + 9121 ctgactatcc tagaaatcgc tgtcgcctta atccaagcct acgttttcac acttctagta + 9181 agcctctacc tgcacgacaa cacataatga cccaccaatc acatgcctat catatagtaa + 9241 aacccagccc atgaccccta acaggggccc tctcagccct cctaatgacc tccggcctag + 9301 ccatgtgatt tcacttccac tccataacgc tcctcatact aggcctacta accaacacac + 9361 taaccatata ccaatgatgg cgcgatgtaa cacgagaaag cacataccaa ggccaccaca + 9421 caccacctgt ccaaaaaggc cttcgatacg ggataatcct atttattacc tcagaagttt + 9481 ttttcttcgc aggatttttc tgagcctttt accactccag cctagcccct accccccaat + 9541 taggagggca ctggccccca acaggcatca ccccgctaaa tcccctagaa gtcccactcc + 9601 taaacacatc cgtattactc gcatcaggag tatcaatcac ctgagctcac catagtctaa + 9661 tagaaaacaa ccgaaaccaa ataattcaag cactgcttat tacaatttta ctgggtctct + 9721 attttaccct cctacaagcc tcagagtact tcgagtctcc cttcaccatt tccgacggca + 9781 tctacggctc aacatttttt gtagccacag gcttccacgg acttcacgtc attattggct + 9841 caactttcct cactatctgc ttcatccgcc aactaatatt tcactttaca tccaaacatc + 9901 actttggctt cgaagccgcc gcctgatact ggcattttgt agatgtggtt tgactatttc + 9961 tgtatgtctc catctattga tgagggtctt actcttttag tataaatagt accgttaact + 10021 tccaattaac tagttttgac aacattcaaa aaagagtaat aaacttcgcc ttaattttaa + 10081 taatcaacac cctcctagcc ttactactaa taattattac attttgacta ccacaactca + 10141 acggctacat agaaaaatcc accccttacg agtgcggctt cgaccctata tcccccgccc + 10201 gcgtcccttt ctccataaaa ttcttcttag tagctattac cttcttatta tttgatctag + 10261 aaattgccct ccttttaccc ctaccatgag ccctacaaac aactaacctg ccactaatag + 10321 ttatgtcatc cctcttatta atcatcatcc tagccctaag tctggcctat gagtgactac + 10381 aaaaaggatt agactgaacc gaattggtat atagtttaaa caaaacgaat gatttcgact + 10441 cattaaatta 
tgataatcat atttaccaaa tgcccctcat ttacataaat attatactag + 10501 catttaccat ctcacttcta ggaatactag tatatcgctc acacctcata tcctccctac + 10561 tatgcctaga aggaataata ctatcgctgt tcattatagc tactctcata accctcaaca + 10621 cccactccct cttagccaat attgtgccta ttgccatact agtctttgcc gcctgcgaag + 10681 cagcggtggg cctagcccta ctagtctcaa tctccaacac atatggccta gactacgtac + 10741 ataacctaaa cctactccaa tgctaaaact aatcgtccca acaattatat tactaccact + 10801 gacatgactt tccaaaaaac acataatttg aatcaacaca accacccaca gcctaattat + 10861 tagcatcatc cctctactat tttttaacca aatcaacaac aacctattta gctgttcccc + 10921 aaccttttcc tccgaccccc taacaacccc cctcctaata ctaactacct gactcctacc + 10981 cctcacaatc atggcaagcc aacgccactt atccagtgaa ccactatcac gaaaaaaact + 11041 ctacctctct atactaatct ccctacaaat ctccttaatt ataacattca cagccacaga + 11101 actaatcata ttttatatct tcttcgaaac cacacttatc cccaccttgg ctatcatcac + 11161 ccgatgaggc aaccagccag aacgcctgaa cgcaggcaca tacttcctat tctacaccct + 11221 agtaggctcc cttcccctac tcatcgcact aatttacact cacaacaccc taggctcact + 11281 aaacattcta ctactcactc tcactgccca agaactatca aactcctgag ccaacaactt + 11341 aatatgacta gcttacacaa tagcttttat agtaaagata cctctttacg gactccactt + 11401 atgactccct aaagcccatg tcgaagcccc catcgctggg tcaatagtac ttgccgcagt + 11461 actcttaaaa ctaggcggct atggtataat acgcctcaca ctcattctca accccctgac + 11521 aaaacacata gcctacccct tccttgtact atccctatga ggcataatta taacaagctc + 11581 catctgccta cgacaaacag acctaaaatc gctcattgca tactcttcaa tcagccacat + 11641 agccctcgta gtaacagcca ttctcatcca aaccccctga agcttcaccg gcgcagtcat + 11701 tctcataatc gcccacgggc ttacatcctc attactattc tgcctagcaa actcaaacta + 11761 cgaacgcact cacagtcgca tcataatcct ctctcaagga cttcaaactc tactcccact + 11821 aatagctttt tgatgacttc tagcaagcct cgctaacctc gccttacccc ccactattaa + 11881 cctactggga gaactctctg tgctagtaac cacgttctcc tgatcaaata tcactctcct + 11941 acttacagga ctcaacatac tagtcacagc cctatactcc ctctacatat ttaccacaac + 12001 acaatggggc tcactcaccc accacattaa caacataaaa ccctcattca cacgagaaaa + 12061 caccctcatg ttcatacacc tatcccccat tctcctccta tccctcaacc ccgacatcat + 12121 taccgggttt tcctcttgta aatatagttt aaccaaaaca tcagattgtg aatctgacaa + 12181 cagaggctta cgacccctta tttaccgaga aagctcacaa gaactgctaa ctcatgcccc + 12241 catgtctaac aacatggctt tctcaacttt taaaggataa cagctatcca ttggtcttag + 12301 gccccaaaaa ttttggtgca actccaaata aaagtaataa ccatgcacac tactataacc + 12361 accctaaccc tgacttccct aattcccccc atccttacca ccctcgttaa ccctaacaaa + 12421 aaaaactcat acccccatta tgtaaaatcc attgtcgcat ccacctttat tatcagtctc + 12481 ttccccacaa caatattcat gtgcctagac caagaagtta ttatctcgaa ctgacactga + 12541 gccacaaccc aaacaaccca gctctcccta agcttcaaac tagactactt ctccataata + 12601 ttcatccctg tagcattgtt cgttacatgg tccatcatag aattctcact gtgatatata + 12661 aactcagacc caaacattaa tcagttcttc aaatatctac tcatcttcct aattaccata + 12721 ctaatcttag ttaccgctaa caacctattc caactgttca tcggctgaga gggcgtagga + 12781 attatatcct tcttgctcat cagttgatga tacgcccgag cagatgccaa cacagcagcc + 12841 attcaagcaa tcctatacaa ccgtatcggc gatatcggtt tcatcctcgc cttagcatga + 12901 tttatcctac actccaactc atgagaccca caacaaatag cccttctaaa cgctaatcca + 12961 agcctcaccc cactactagg cctcctccta gcagcagcag gcaaatcagc ccaattaggt + 13021 ctccacccct gactcccctc agccatagaa ggccccaccc cagtctcagc cctactccac + 13081 tcaagcacta tagttgtagc aggaatcttc ttactcatcc gcttccaccc cctagcagaa + 13141 aatagcccac taatccaaac tctaacacta tgcttaggcg ctatcaccac tctgttcgca + 13201 gcagtctgcg cccttacaca aaatgacatc aaaaaaatcg tagccttctc cacttcaagt + 13261 caactaggac tcataatagt tacaatcggc atcaaccaac cacacctagc attcctgcac + 13321 atctgtaccc 
acgccttctt caaagccata ctatttatgt gctccgggtc catcatccac + 13381 aaccttaaca atgaacaaga tattcgaaaa ataggaggac tactcaaaac catacctctc + 13441 acttcaacct ccctcaccat tggcagccta gcattagcag gaataccttt cctcacaggt + 13501 ttctactcca aagaccacat catcgaaacc gcaaacatat catacacaaa cgcctgagcc + 13561 ctatctatta ctctcatcgc tacctccctg acaagcgcct atagcactcg aataattctt + 13621 ctcaccctaa caggtcaacc tcgcttcccc acccttacta acattaacga aaataacccc + 13681 accctactaa accccattaa acgcctggca gccggaagcc tattcgcagg atttctcatt + 13741 actaacaaca tttcccccgc atcccccttc caaacaacaa tccccctcta cctaaaactc + 13801 acagccctcg ctgtcacttt cctaggactt ctaacagccc tagacctcaa ctacctaacc + 13861 aacaaactta aaataaaatc cccactatgc acattttatt tctccaacat actcggattc + 13921 taccctagca tcacacaccg cacaatcccc tatctaggcc ttcttacgag ccaaaacctg + 13981 cccctactcc tcctagacct aacctgacta gaaaagctat tacctaaaac aatttcacag + 14041 caccaaatct ccacctccat catcacctca acccaaaaag gcataattaa actttacttc + 14101 ctctctttct tcttcccact catcctaacc ctactcctaa tcacataacc tattcccccg + 14161 agcaatctca attacaatat atacaccaac aaacaatgtt caaccagtaa ctactactaa + 14221 tcaacgccca taatcataca aagcccccgc accaatagga tcctcccgaa tcaaccctga + 14281 cccctctcct tcataaatta ttcagcttcc tacactatta aagtttacca caaccaccac + 14341 cccatcatac tctttcaccc acagcaccaa tcctacctcc atcgctaacc ccactaaaac + 14401 actcaccaag acctcaaccc ctgaccccca tgcctcagga tactcctcaa tagccatcgc + 14461 tgtagtatat ccaaagacaa ccatcattcc ccctaaataa attaaaaaaa ctattaaacc + 14521 catataacct cccccaaaat tcagaataat aacacacccg accacaccgc taacaatcaa + 14581 tactaaaccc ccataaatag gagaaggctt agaagaaaac cccacaaacc ccattactaa + 14641 acccacactc aacagaaaca aagcatacat cattattctc gcacggacta caaccacgac + 14701 caatgatatg aaaaaccatc gttgtatttc aactacaaga acaccaatga ccccaatacg + 14761 caaaactaac cccctaataa aattaattaa ccactcattc atcgacctcc ccaccccatc + 14821 caacatctcc gcatgatgaa acttcggctc actccttggc gcctgcctga tcctccaaat + 14881 caccacagga ctattcctag ccatgcacta ctcaccagac gcctcaaccg ccttttcatc + 14941 aatcgcccac atcactcgag acgtaaatta tggctgaatc atccgctacc ttcacgccaa + 15001 tggcgcctca atattcttta tctgcctctt cctacacatc gggcgaggcc tatattacgg + 15061 atcatttctc tactcagaaa cctgaaacat cggcattatc ctcctgcttg caactatagc + 15121 aacagccttc ataggctatg tcctcccgtg aggccaaata tcattctgag gggccacagt + 15181 aattacaaac ttactatccg ccatcccata cattgggaca gacctagttc aatgaatctg + 15241 aggaggctac tcagtagaca gtcccaccct cacacgattc tttacctttc acttcatctt + 15301 gcccttcatt attgcagccc tagcaacact ccacctccta ttcttgcacg aaacgggatc + 15361 aaacaacccc ctaggaatca cctcccattc cgataaaatc accttccacc cttactacac + 15421 aatcaaagac gccctcggct tacttctctt ccttctctcc ttaatgacat taacactatt + 15481 ctcaccagac ctcctaggcg acccagacaa ttatacccta gccaacccct taaacacccc + 15541 tccccacatc aagcccgaat gatatttcct attcgcctac acaattctcc gatccgtccc + 15601 taacaaacta ggaggcgtcc ttgccctatt actatccatc ctcatcctag caataatccc + 15661 catcctccat atatccaaac aacaaagcat aatatttcgc ccactaagcc aatcacttta + 15721 ttgactccta gccgcagacc tcctcattct aacctgaatc ggaggacaac cagtaagcta + 15781 cccttttacc atcattggac aagtagcatc cgtactatac ttcacaacaa tcctaatcct + 15841 aataccaact atctccctaa ttgaaaacaa aatactcaaa tgggcctgtc cttgtagtat + 15901 aaactaatac accagtcttg taaaccggag atgaaaacct ttttccaagg acaaatcaga + 15961 gaaaaagtct ttaactccac cattagcacc caaagctaag attctaattt aaactattct + 16021 ctgttctttc atggggaagc agatttgggt accacccaag tattgactca cccatcaaca + 16081 accgctatgt atttcgtaca ttactgccag ccaccatgaa tattgtacgg taccataaat + 16141 acttgaccac ctgtagtaca taaaaaccca atccacatca aaaccccctc cccatgctta + 16201 caagcaagta 
cagcaatcaa ccctcaacta tcacacatca actgcaactc caaagccacc + 16261 cctcacccac taggatacca acaaacctac ccacccttaa cagtacatag tacataaagc + 16321 catttaccgt acatagcaca ttacagtcaa atcccttctc gtccccatgg atgacccccc + 16381 tcagataggg gtcccttgac caccatcctc cgtgaaatca atatcccgca caagagtgct + 16441 actctcctcg ctccgggccc ataacacttg ggggtagcta aagtgaactg tatccgacat + 16501 ctggttccta cttcagggtc ataaagccta aatagcccac acgttcccct taaataagac + 16561 atcacgatg +// + diff --git a/tests/data/assocacs.gz b/tests/data/assocacs.gz new file mode 100644 index 0000000..10214c2 Binary files /dev/null and b/tests/data/assocacs.gz differ diff --git a/tests/data/exonsets.mm-exons.gz b/tests/data/exonsets.mm-exons.gz new file mode 100644 index 0000000..9cec37e Binary files /dev/null and b/tests/data/exonsets.mm-exons.gz differ diff --git a/tests/data/expected_genomic_100.exonset b/tests/data/expected_genomic_100.exonset new file mode 100644 index 0000000..172f7a0 --- /dev/null +++ b/tests/data/expected_genomic_100.exonset @@ -0,0 +1,12 @@ +tx_ac alt_ac method strand exons_se_i +NR_046018.2 NC_000001.10 splign 1 11873,12227;12612,12721;13220,14409 +NR_024540.1 NC_000001.10 splign -1 29320,29370;24737,24891;18267,18366;17914,18061;17605,17742;17232,17368;16857,17055;16606,16765;15795,15947;14969,15038;14361,14829 +NR_106918.1 NC_000001.10 splign -1 17368,17436 +NR_036051.1 NC_000001.10 splign 1 30365,30503 +NR_026818.1 NC_000001.10 splign -1 35720,36081;35276,35481;34610,35174 +NM_001005484.2 NC_000001.10 splign 1 65418,65433;65519,65573;69036,71585 +NR_039983.2 NC_000001.10 splign -1 140074,140566;139789,139847;134772,139696 +NR_028322.1 NC_000001.10 splign 1 323891,324060;324287,324345;324438,328581 +NM_001005221.2 NC_000001.10 splign 1 367658,368597 +NR_125957.1 NC_000001.10 splign -1 564298,564389;563340,563603;562759,563203 +NR_162149.1 NC_000001.10 splign -1 567994,568065 diff --git a/tests/data/genomic_100.gff.gz b/tests/data/genomic_100.gff.gz new file mode 100644 index 0000000..813ad8f Binary files /dev/null and b/tests/data/genomic_100.gff.gz differ diff --git a/tests/data/rna.NM_001396027.gbff b/tests/data/rna.NM_001396027.gbff new file mode 100644 index 0000000..da202f2 --- /dev/null +++ b/tests/data/rna.NM_001396027.gbff @@ -0,0 +1,67 @@ +LOCUS NM_001396027 696 bp mRNA linear PRI 16-APR-2022 +DEFINITION Homo sapiens family with sequence similarity 246 member C + (gene/pseudogene) (FAM246C), mRNA. +ACCESSION NM_001396027 +VERSION NM_001396027.1 +KEYWORDS RefSeq; RefSeq Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +COMMENT VALIDATED REFSEQ: This record has undergone validation or + preliminary review. The reference sequence was derived from + CP068256.2. + + Transcript Variant: This transcript contains two, common, single + nucleotide deletions relative to the GRCh38 reference assembly. + This transcript encodes a protein of 231 aa. 
+ + ##RefSeq-Attributes-START## + polymorphic pseudogene :: based on alignments, homology + RefSeq Select criteria :: based on single protein-coding transcript + ##RefSeq-Attributes-END## +PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP + 1-696 CP068256.2 19404891-19405586 +FEATURES Location/Qualifiers + source 1..696 + /organism="Homo sapiens" + /mol_type="mRNA" + /db_xref="taxon:9606" + /chromosome="22" + /map="22q11.21" + gene 1..696 + /gene="FAM246C" + /note="family with sequence similarity 246 member C + (gene/pseudogene)" + /db_xref="GeneID:117134596" + /db_xref="HGNC:HGNC:54842" + CDS 1..696 + /gene="FAM246C" + /codon_start=1 + /product="protein FAM246C" + /protein_id="NP_001382956.1" + /db_xref="GeneID:117134596" + /db_xref="HGNC:HGNC:54842" + /translation="MAESGRPWAQARSAYRASEVLRRGTGRRRDPGPQSNGPGQEDAR + APGRMARLRGQLRAEAASRSEVPRLLKLVERAGAGAAGAGERTGAHSRGSVCSVCGEP + RGGATYPAGVLEVSERRLQEGLAAVREELGAGIEALRAELRAELDALRALLPPPPSPP + ARREPRAVPRAAPRGPTLPRTLGTVSALVAASRPADDAPDGPAECGAHRAPARKNHKK + MPVPPGAPQGGGD" + exon 1..696 + /gene="FAM246C" + /inference="alignment:Splign:2.1.0" +ORIGIN + 1 atggcggagt ccggccgccc gtgggcccag gcgcgtagtg cgtacagagc cagcgaggtg + 61 ctgcggcgcg gcacgggccg ccggcgggat ccggggccgc aatccaatgg gccgggccag + 121 gaagacgccc gagccccggg ccggatggct cgcctgcgcg gccagctccg ggccgaagcg + 181 gcttcgcggt ccgaggtgcc gcggctgctg aagctggtgg agcgtgcggg ggccggggcg + 241 gcgggcgcgg gcgagaggac cggcgcgcac agccgcggct ccgtgtgctc ggtatgcggg + 301 gagccccgcg gcggggccac ctacccggcg ggggtcctgg aggtgagcga gcggcggctg + 361 caggagggcc tggcggcagt gcgcgaggag ctgggcgccg ggattgaggc gctgcgcgcg + 421 gagcttcgag cggagctgga tgccctgcgc gcgctgctgc cgccgccgcc gtccccgcct + 481 gcccgccgcg agccccgcgc cgtcccccgc gccgcgcccc gcggcccgac cctgccgcgg + 541 acgctcggca ccgtgagcgc cctggtcgcc gcctccaggc ccgcagacga cgccccggac + 601 ggcccagcag aatgcggagc gcaccgagcc ccggccagga agaaccacaa gaagatgcca + 661 gtgccgcctg gggccccgca aggtggcggg gactga +// diff --git a/tests/data/rna.NM_001996.gbff b/tests/data/rna.NM_001996.gbff new file mode 100644 index 0000000..79ff577 --- /dev/null +++ b/tests/data/rna.NM_001996.gbff @@ -0,0 +1,393 @@ +LOCUS NM_001996 2251 bp mRNA linear PRI 18-APR-2022 +DEFINITION Homo sapiens fibulin 1 (FBLN1), transcript variant C, mRNA. +ACCESSION NM_001996 +VERSION NM_001996.4 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +REFERENCE 1 (bases 1 to 2251) + AUTHORS Yang F, Shi J, Zhang L, Wang H and Li Y. + TITLE Role of fibulin-1 gene promoter methylation in the carcinogenesis + and development of tongue squamous cell carcinoma + JOURNAL Oral Surg Oral Med Oral Pathol Oral Radiol 133 (4), 432-440 (2022) + PUBMED 35153187 + REMARK GeneRIF: Role of fibulin-1 gene promoter methylation in the + carcinogenesis and development of tongue squamous cell carcinoma. +REFERENCE 2 (bases 1 to 2251) + AUTHORS Wan Y, Song Y, Chen J, Kong J, Gu C, Huang J and Zuo L. + TITLE Upregulated Fibulin-1 Increased Endometrial Stromal Cell Viability + and Migration by Repressing EFEMP1-Dependent Ferroptosis in + Endometriosis + JOURNAL Biomed Res Int 2022, 4809415 (2022) + PUBMED 35127942 + REMARK GeneRIF: Upregulated Fibulin-1 Increased Endometrial Stromal Cell + Viability and Migration by Repressing EFEMP1-Dependent Ferroptosis + in Endometriosis. 
+ Publication Status: Online-Only +REFERENCE 3 (bases 1 to 2251) + AUTHORS Xu G, Geng X, Yang F and Zhang H. + TITLE FBLN1 promotes chondrocyte proliferation by increasing + phosphorylation of Smad2 + JOURNAL J Orthop Sci 27 (1), 242-248 (2022) + PUBMED 33610427 + REMARK GeneRIF: FBLN1 promotes chondrocyte proliferation by increasing + phosphorylation of Smad2. +REFERENCE 4 (bases 1 to 2251) + AUTHORS Ustunyurt E, Dundar B, Simsek D and Temur M. + TITLE Act of fibulin-1 in preeclamptic patients: can it be a predictive + marker? + JOURNAL J Matern Fetal Neonatal Med 34 (22), 3775-3781 (2021) + PUBMED 34238097 + REMARK GeneRIF: Act of fibulin-1 in preeclamptic patients: can it be a + predictive marker? +REFERENCE 5 (bases 1 to 2251) + AUTHORS Liu XT, Liu TT, Wu MY, Chen QX, Zhuang JX and Wang Q. + TITLE Identifying FBLN1 (Gene ID: 2192) as a Potential Melanoma Biomarker + for Melanoma based on an Analysis of microRNA Expression Profiles + in the GEO and TCGA Databases + JOURNAL Genet Test Mol Biomarkers 25 (1), 68-78 (2021) + PUBMED 33470885 + REMARK GeneRIF: Identifying FBLN1 (Gene ID: 2192) as a Potential Melanoma + Biomarker for Melanoma based on an Analysis of microRNA Expression + Profiles in the GEO and TCGA Databases. +REFERENCE 6 (bases 1 to 2251) + AUTHORS Sasaki T, Gohring W, Pan TC, Chu ML and Timpl R. + TITLE Binding of mouse and human fibulin-2 to extracellular matrix + ligands + JOURNAL J Mol Biol 254 (5), 892-899 (1995) + PUBMED 7500359 +REFERENCE 7 (bases 1 to 2251) + AUTHORS Roark EF, Keene DR, Haudenschild CC, Godyna S, Little CD and + Argraves WS. + TITLE The association of human fibulin-1 with elastic fibers: an + immunohistological, ultrastructural, and RNA study + JOURNAL J Histochem Cytochem 43 (4), 401-411 (1995) + PUBMED 7534784 +REFERENCE 8 (bases 1 to 2251) + AUTHORS Balbona K, Tran H, Godyna S, Ingham KC, Strickland DK and Argraves + WS. + TITLE Fibulin binds to itself and to the carboxyl-terminal + heparin-binding region of fibronectin + JOURNAL J Biol Chem 267 (28), 20120-20125 (1992) + PUBMED 1400330 +REFERENCE 9 (bases 1 to 2251) + AUTHORS Argraves WS, Tran H, Burgess WH and Dickerson K. + TITLE Fibulin is an extracellular matrix and plasma glycoprotein with + repeated domain structure + JOURNAL J Cell Biol 111 (6 Pt 2), 3155-3164 (1990) + PUBMED 2269669 +REFERENCE 10 (bases 1 to 2251) + AUTHORS Argraves WS, Dickerson K, Burgess WH and Ruoslahti E. + TITLE Fibulin, a novel protein that interacts with the fibronectin + receptor beta subunit cytoplasmic domain + JOURNAL Cell 58 (4), 623-629 (1989) + PUBMED 2527614 +COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The + reference sequence was derived from AL021391.3, Z98047.1 and + Z95331.2. + + On May 31, 2019 this sequence version replaced NM_001996.3. + + Summary: Fibulin 1 is a secreted glycoprotein that becomes + incorporated into a fibrillar extracellular matrix. Calcium-binding + is apparently required to mediate its binding to laminin and + nidogen. It mediates platelet adhesion via binding fibrinogen. Four + splice variants which differ in the 3' end have been identified. + Each variant encodes a different isoform, but no functional + distinctions have been identified among the four variants. + [provided by RefSeq, Jul 2008]. + + Transcript Variant: This variant (C) has an alternate 3' sequence, + as compared to variant D. The encoded isoform C is shorter and has + a distinct C-terminus, as compared to isoform D. 
+ + Publication Note: This RefSeq record includes a subset of the + publications that are available for this gene. Please see the Gene + record to access additional publications. + + ##Evidence-Data-START## + Transcript exon combination :: SRR7346977.956579.1, + SRR7346977.1110405.1 [ECO:0000332] + RNAseq introns :: single sample supports all introns + SAMEA1965299, SAMEA1966682 + [ECO:0000348] + ##Evidence-Data-END## + COMPLETENESS: complete on the 3' end. +PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP + 1-182 AL021391.3 135806-135987 + 183-288 Z98047.1 13577-13682 + 289-424 Z98047.1 20438-20573 + 425-587 Z98047.1 22742-22904 + 588-647 Z98047.1 26160-26219 + 648-749 Z98047.1 27958-28059 + 750-887 Z98047.1 28656-28793 + 888-1025 Z98047.1 30095-30232 + 1026-1169 Z98047.1 36124-36267 + 1170-1298 Z98047.1 37050-37178 + 1299-1424 Z98047.1 38296-38421 + 1425-1544 Z98047.1 41980-42099 + 1545-1676 Z98047.1 43508-43639 + 1677-1800 Z98047.1 45387-45510 + 1801-2251 Z95331.2 10369-10819 +FEATURES Location/Qualifiers + source 1..2251 + /organism="Homo sapiens" + /mol_type="mRNA" + /db_xref="taxon:9606" + /chromosome="22" + /map="22q13.31" + gene 1..2251 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="fibulin 1" + /db_xref="GeneID:2192" + /db_xref="HGNC:HGNC:3600" + /db_xref="MIM:135820" + exon 1..182 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + CDS 104..2155 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="isoform C precursor is encoded by transcript + variant C" + /codon_start=1 + /product="fibulin-1 isoform C precursor" + /protein_id="NP_001987.3" + /db_xref="CCDS:CCDS14069.1" + /db_xref="GeneID:2192" + /db_xref="HGNC:HGNC:3600" + /db_xref="MIM:135820" + /translation="MERAAPSRRVPLPLLLLGGLALLAAGVDADVLLEACCADGHRMA + THQKDCSLPYATESKECRMVQEQCCHSQLEELHCATGISLANEQDRCATPHGDNASLE + ATFVKRCCHCCLLGRAAQAQGQSCEYSLMVGYQCGQVFQACCVKSQETGDLDVGGLQE + TDKIIEVEEEQEDPYLNDRCRGGGPCKQQCRDTGDEVVCSCFVGYQLLSDGVSCEDVN + ECITGSHSCRLGESCINTVGSFRCQRDSSCGTGYELTEDNSCKDIDECESGIHNCLPD + FICQNTLGSFRCRPKLQCKSGFIQDALGNCIDINECLSISAPCPIGHTCINTEGSYTC + QKNVPNCGRGYHLNEEGTRCVDVDECAPPAEPCGKGHRCVNSPGSFRCECKTGYYFDG + ISRMCVDVNECQRYPGRLCGHKCENTLGSYLCSCSVGFRLSVDGRSCEDINECSSSPC + SQECANVYGSYQCYCRRGYQLSDVDGVTCEDIDECALPTGGHICSYRCINIPGSFQCS + CPSSGYRLAPNGRNCQDIDECVTGIHNCSINETCFNIQGGFRCLAFECPENYRRSAAT + RCERLPCHENRECSKLPLRITYYHLSFPTNIQAPAVVFRMGPSSAVPGDSMQLAITGG + NEEGFFTTRKVSPHSGVVALTKPVPEPRDLLLTVKMDLSRHGTVSSFVAKLFIFVSAE + L" + sig_peptide 104..190 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="COORDINATES: ab initio prediction:SignalP:4.0" + mat_peptide 191..2152 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /product="fibulin-1 isoform C" + misc_feature 191..340 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Anaphylatoxin homologous domain; C3a, C4a and C5a + anaphylatoxins are protein fragments generated + enzymatically in serum during activation of complement + molecules C3, C4, and C5. They induce smooth muscle + contraction. These fragments are homologous to...; Region: + ANATO; cd00017" + /db_xref="CDD:237984" + misc_feature 392..559 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Anaphylatoxin homologous domain; C3a, C4a and C5a + anaphylatoxins are protein fragments generated + enzymatically in serum during activation of complement + molecules C3, C4, and C5. They induce smooth muscle + contraction. 
These fragments are homologous to...; Region: + ANATO; cd00017" + /db_xref="CDD:237984" + misc_feature 698..760 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Complement Clr-like EGF-like; Region: cEGF; + pfam12662" + /db_xref="CDD:432704" + misc_feature 749..832 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF domain; Region: EGF_CA; + pfam07645" + /db_xref="CDD:429571" + misc_feature 887..994 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF domain; Region: EGF_CA; + pfam07645" + /db_xref="CDD:429571" + misc_feature order(887..889,896..898,944..946) + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Ca2+ binding site [ion binding]; other site" + /db_xref="CDD:238011" + misc_feature 1025..1168 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF-like domain; Region: EGF_CA; + smart00179" + /db_xref="CDD:214542" + misc_feature order(1025..1027,1034..1036,1082..1084) + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Ca2+ binding site [ion binding]; other site" + /db_xref="CDD:238011" + misc_feature 1169..1270 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF-like domain; Region: EGF_CA; + smart00179" + /db_xref="CDD:214542" + misc_feature order(1169..1171,1178..1180,1226..1228) + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Ca2+ binding site [ion binding]; other site" + /db_xref="CDD:238011" + misc_feature order(1298..1300,1307..1309,1352..1354) + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Ca2+ binding site [ion binding]; other site" + /db_xref="CDD:238011" + misc_feature 1364..1435 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Complement Clr-like EGF-like; Region: cEGF; + pfam12662" + /db_xref="CDD:432704" + misc_feature 1481..1555 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Complement Clr-like EGF-like; Region: cEGF; + pfam12662" + /db_xref="CDD:432704" + misc_feature 1544..1636 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF-like domain, present in a large + number of membrane-bound and extracellular (mostly animal) + proteins. 
Many of these proteins require calcium for their + biological function and calcium-binding sites have been + found to be located at the...; Region: EGF_CA; cl21504" + /db_xref="CDD:451279" + misc_feature 1613..1687 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Complement Clr-like EGF-like; Region: cEGF; + pfam12662" + /db_xref="CDD:432704" + misc_feature 1676..1780 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="Calcium-binding EGF domain; Region: EGF_CA; + pfam07645" + /db_xref="CDD:429571" + exon 183..288 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 289..424 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 425..587 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 588..647 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 648..749 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 750..887 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 888..1025 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1026..1169 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1170..1298 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1299..1424 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1425..1544 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1545..1676 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1677..1800 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + exon 1801..2251 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /inference="alignment:Splign:2.1.0" + regulatory 2220..2225 + /regulatory_class="polyA_signal_sequence" + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="hexamer: AATAAA" + polyA_site 2251 + /gene="FBLN1" + /gene_synonym="FBLN; FIBL1" + /note="major polyA site" +ORIGIN + 1 gttggctgcc gaggctcggc cggagcgtgg agcccgcgcc gctgccccag gaccgcgccc + 61 gcgcctttgt ccgccgccgc ccaccgcccg tcgcccgccg cccatggagc gcgccgcgcc + 121 gtcgcgccgg gtcccgcttc cgctgctgct gctcggcggc cttgcgctgc tggcggccgg + 181 agtggacgcg gatgtcctcc tggaggcctg ctgtgcggac ggacaccgga tggccactca + 241 tcagaaggac tgctcgctgc catatgctac ggaatccaaa gaatgcagga tggtgcagga + 301 gcagtgctgc cacagccagc tggaggagct gcactgtgcc acgggcatca gcctggccaa + 361 cgagcaggac cgctgtgcca cgccccacgg tgacaacgcc agcctggagg ccacatttgt + 421 gaagaggtgc tgccattgct gtctgctggg gagggcggcc caggcccagg gccagagctg + 481 cgagtacagc ctcatggttg gctaccagtg tggacaggtc ttccaggcat gctgtgtcaa + 541 gagccaggag accggagatt tggatgtcgg gggcctccaa gaaacggata agatcattga + 601 ggttgaggag gaacaagagg acccatatct gaatgaccgc tgccgaggag gcgggccctg + 661 caagcagcag tgccgagaca cgggtgacga ggtggtctgc tcctgcttcg tgggctacca + 721 gctgctgtct gatggtgtct cctgtgaaga tgtcaatgaa tgcatcacgg gcagccacag + 781 ctgccggctt ggagaatcct gcatcaacac agtgggctct ttccgctgcc agcgggacag + 841 cagctgcggg actggctatg agctcacaga ggacaatagc tgcaaagata ttgacgagtg + 901 tgagagtggt attcataact gcctccccga ttttatctgt cagaatactc tgggatcctt + 961 ccgctgccga cccaagctac agtgcaagag tggctttata caagatgctc taggcaactg + 1021 tattgatatc aatgagtgtt tgagtatcag tgccccgtgc cctatcgggc atacatgcat + 1081 caacacagag ggctcctaca cgtgccagaa gaacgtgccc aactgtggcc gtggctacca + 1141 tctcaacgag 
gagggaacgc gctgtgttga tgtggacgag tgcgcgccac ctgctgagcc + 1201 ctgtgggaag ggacatcgct gcgtgaactc tcccggcagt ttccgctgcg aatgcaagac + 1261 gggttactat tttgacggca tcagcaggat gtgtgtcgat gtcaacgagt gccagcgcta + 1321 ccccgggcgc ctgtgtggcc acaagtgcga gaacacgctg ggctcctacc tctgcagctg + 1381 ttccgtgggc ttccggctct ctgtggatgg caggtcatgt gaagacatca atgagtgcag + 1441 cagcagcccc tgtagccagg agtgtgccaa cgtctacggc tcctaccagt gttactgccg + 1501 gcgaggctac cagctcagcg atgtggatgg agtcacctgt gaagacatcg acgagtgcgc + 1561 cctgcccacc gggggccaca tctgctccta ccgctgcatc aacatccctg gaagcttcca + 1621 gtgcagctgc ccctcgtctg gctacaggct ggcccccaat ggccgcaact gccaagacat + 1681 tgatgagtgt gtgactggca tccacaactg ctccatcaac gagacctgct tcaacatcca + 1741 gggcggcttc cgctgcctgg ccttcgagtg ccctgagaac taccgccgct ccgcagccac + 1801 ccgctgtgag cgcttgcctt gccatgagaa tcgggagtgc tccaagctgc ctctgagaat + 1861 aacctactac cacctctctt tccccaccaa catccaagcg cccgcggtgg ttttccgcat + 1921 gggcccctcc agtgctgtcc ccggggacag catgcagctg gccatcaccg gcggcaatga + 1981 ggagggcttt ttcaccaccc ggaaggtgag cccccacagt ggggtggtgg ccctcaccaa + 2041 gcctgtcccc gagcccaggg acttgctcct gaccgtcaag atggatctct ctcgccacgg + 2101 caccgtcagc tcctttgtgg ccaagctttt catctttgtg tctgcagagc tctgagcact + 2161 cgcttcgcgt cgcggggtct ccctcctgtt gctttcctaa ccctgccctc cggggcgtta + 2221 ataaagtctt agcaagcgtc ccacacagtg a +// diff --git a/tests/data/rna.NR_173080.gbff b/tests/data/rna.NR_173080.gbff new file mode 100644 index 0000000..f6b2486 --- /dev/null +++ b/tests/data/rna.NR_173080.gbff @@ -0,0 +1,61 @@ +LOCUS NR_173080 1073 bp RNA linear PRI 20-JUL-2023 +DEFINITION Homo sapiens uncharacterized LOC122455341 (LOC122455341), + transcript variant 1, long non-coding RNA. +ACCESSION NR_173080 +VERSION NR_173080.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +COMMENT VALIDATED REFSEQ: This record has undergone validation or + preliminary review. The reference sequence was derived from + AC007326.28. + + ##Evidence-Data-START## + RNAseq introns :: single sample supports all introns SAMEA2159764 + [ECO:0000348] + ##Evidence-Data-END## + COMPLETENESS: full length. 
+PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP + 1-93 AC007326.28 35576-35668 + 94-170 AC007326.28 37874-37950 + 171-1073 AC007326.28 44992-45894 +FEATURES Location/Qualifiers + source 1..1073 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="22" + /map="22q11.21" + gene 1..1073 + /gene="LOC122455341" + /note="uncharacterized LOC122455341" + /db_xref="GeneID:122455341" + ncRNA 1..1073 + /ncRNA_class="lncRNA" + /gene="LOC122455341" + /product="uncharacterized LOC122455341, transcript variant + 1" + /db_xref="GeneID:122455341" +ORIGIN + 1 acaagttgta tggttcgttt tatatgaaga gttcagaata gacaaatcga tagagacaga + 61 agtcacacga agacaagccc atagtggagc ctgggtgaag gtacgctcga gcgtggtcat + 121 tgaggacaag tcgacgagag atcccgagta cgtctacagt cagccttacg tctgcaggtg + 181 tacccaacag ctccgaagag acagcgacca tcgagaacgg gccatgatga cgatggcggt + 241 tttgtcgaaa agaaaagggg gaaatgtggg gaaaagcaag agagatcaga ttgttactgt + 301 gtctgtgtag aaagaagtag acataggaga ctccattttg ttatgtgcta agaaaaattc + 361 ttctgccttg agattctgtt aatctataac cttaccccca accccgtgct ctctgaaacg + 421 tgtgctgtgt caactcagag ttaaatggat taagggcggt gcaggatgtg ctttgttaaa + 481 cagatgcttg aaggcagcat gctccttaag agtcatcacc actccctaat ctcaagtacc + 541 cagggacaca aaaactgcgg aaggccgcag ggacctctgc ctaggaaagc caggtattgt + 601 ccaaggtttc tccccatgtg atagtctgaa atatggcctc gtgggaaggg aaagacctga + 661 ccgtccccca gcccgacacc aagggtctgt gctgaggagg attagtaaaa gaggaaggaa + 721 tgcctcttgc agttgagaca agaggaaggc atctgtctcc tgcctgtccc tgggcaatgg + 781 aatgtctcgg tataaaaccc gattgtatgc tccatctact gagataggga aaaaccgcct + 841 tagggctgga ggtgggacct gcgggcagca atactgcttt gtaaagcatt gagatgttta + 901 tgtgtatgca tatctaaaag cacagcactt aatcctttac attgtctatg atgcaaagac + 961 ctttgttcac gtgtttgtct gctgaccctc tccccacaat tgtcttgtga ccctgacaca + 1021 tccccctctt cgagaaacac ccacaagtga tgaataaata ctaagggaac tca +// diff --git a/tests/data/rna.NR_173148.gbff b/tests/data/rna.NR_173148.gbff new file mode 100644 index 0000000..91312ca --- /dev/null +++ b/tests/data/rna.NR_173148.gbff @@ -0,0 +1,57 @@ +LOCUS NR_173148 698 bp RNA linear PRI 17-SEP-2021 +DEFINITION Homo sapiens family with sequence similarity 246 member C + (gene/pseudogene) (FAM246C), non-coding RNA. +ACCESSION NR_173148 +VERSION NR_173148.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. +COMMENT VALIDATED REFSEQ: This record has undergone validation or + preliminary review. The reference sequence was derived from + AC000095.5. + + Transcript Variant: This transcript matches the GRCh38 reference + assembly. Compared to other members of the FAM246 gene family, it + has a premature stop codon which makes it a non-coding transcript. 
+ + ##RefSeq-Attributes-START## + polymorphic pseudogene :: based on alignments, homology + ##RefSeq-Attributes-END## +PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP + 1-698 AC000095.5 28241-28938 +FEATURES Location/Qualifiers + source 1..698 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="22" + /map="22q11.21" + gene 1..698 + /gene="FAM246C" + /note="family with sequence similarity 246 member C + (gene/pseudogene)" + /db_xref="GeneID:117134596" + /db_xref="HGNC:HGNC:54842" + misc_RNA 1..698 + /gene="FAM246C" + /product="family with sequence similarity 246 member C + (gene/pseudogene)" + /db_xref="GeneID:117134596" + /db_xref="HGNC:HGNC:54842" +ORIGIN + 1 atggcggagc ccggccgccc gtgggcccag gcgcgtagtg cgtacagagc cagcgaggtg + 61 ctgcggcgcg gcacgggccg ccggcgggat ccggggccgc aatccaatgg gccgggccag + 121 gaagacgccc gagccccggg ccggatggct cgcctgcgcg gccagctccg ggccgaagcg + 181 gcttcgcggt ccgaggtgcc gcggctgctg aagctggtgg agcgtgcggg ggccggggcg + 241 gccgggcgcg ggcgagagga ccggcgcgca cagccgcggg ctccgtgtgc tcggtatgcg + 301 gggagccccg cggcggggcc acctacccgg cgggggtcct ggaggtgagc gagcggcggc + 361 tgcaggaggg cctggcggca gtgcgcgagg agctgggcgc cgggattgag gcgctgcgcg + 421 cggagcttcg agcggagctg gatgccctgc gcgcgctgct gccgccgccg ccgtccccgc + 481 ctgcccgccg cgagccccgc gccgtccccc gcgccgcgcc ccgcggcccg accctgccgc + 541 ggacgctcgg caccgtgagc gccctggtcg ccgcctccag gcccgcagac gacgccccgg + 601 acggcccagc agaatgcgga gcgcaccgag ccccggccag gaagaaccac aagaagatgc + 661 cagtgccgcc tggggccccg caaggtggcg gggactga +// diff --git a/tests/data/txinfo.gz b/tests/data/txinfo.gz new file mode 100644 index 0000000..8b33153 Binary files /dev/null and b/tests/data/txinfo.gz differ diff --git a/tests/test_coalesce_exonsets.py b/tests/test_coalesce_exonsets.py new file mode 100644 index 0000000..757ec0b --- /dev/null +++ b/tests/test_coalesce_exonsets.py @@ -0,0 +1,48 @@ +import contextlib +import io +import sys +import unittest +from tempfile import NamedTemporaryFile +from unittest.mock import patch + +from sbin.coalesce_exonsets import coalesce_exonsets +from uta.formats.exonset import ExonSetWriter + + +class TestCoalesceExonsets(unittest.TestCase): + + def _create_temporary_file(self, lines): + with NamedTemporaryFile(delete=False) as temp_exonsets: + with open(temp_exonsets.name, "wt") as f: + for line in lines: + f.write(line) + temp_exonsets.seek(0) + return temp_exonsets.name + + @patch('sbin.coalesce_exonsets.logger') + def test_coalesce_exonsets(self, mock_logger): + lines_1 = [ + "tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n", + "NM_145660.2\tNC_000022.10\tsplign\t-1\t36600673,36600879;36598038,36598101;36595375,36595422;36591356,36591483;36585175,36587958\n", + "NM_000348.4\tNC_000002.11\tsplign\t-1\t31805689,31806007;31758672,31758836;31756440,31756542;31754376,31754527;31747549,31751332\n" + ] + lines_2 = [ + "tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n", + "NM_145660.2\tNC_000022.10\tsplign\t-1\t36600673,36600879;36598038,36598101;36595375,36595422;36591356,36591483;36587846,36587958;36585175,36587845\n", + "NM_145660.2\tNC_000022.11\tsplign\t-1\t36204627,36204833;36201992,36202055;36199329,36199376;36195310,36195437;36189127,36191912\n", + "NM_001005484.2\tNC_000001.10\tsplign\t1\t65418,65433;65519,65573;69036,71585\n" + ] + temp_exonsets_1_fn = self._create_temporary_file(lines_1) + temp_exonsets_2_fn = self._create_temporary_file(lines_2) + + # the first record in lines_2 (NM_145660.2, NC_000022.10) will be skipped, as it is already passed to the output + 
expected_output = lines_1 + lines_2[2:] + stdout = io.StringIO() + + with contextlib.redirect_stdout(stdout): + coalesce_exonsets([temp_exonsets_1_fn, temp_exonsets_2_fn]) + + output = stdout.getvalue() + self.assertEqual(output, ''.join(expected_output)) + + mock_logger.warning.assert_called_with(f" - exon set for transcript NM_145660.2/NC_000022.10 already seen in {temp_exonsets_1_fn}. Skipping.") diff --git a/tests/test_filter_exonset_transcripts.py b/tests/test_filter_exonset_transcripts.py new file mode 100644 index 0000000..8bc7995 --- /dev/null +++ b/tests/test_filter_exonset_transcripts.py @@ -0,0 +1,46 @@ +import contextlib +import io +import unittest +from tempfile import NamedTemporaryFile +from unittest.mock import patch + +from sbin.filter_exonset_transcripts import filter_exonset + + +class TestFilterExonsetTranscripts(unittest.TestCase): + + @patch('sbin.filter_exonset_transcripts.logger') + def test_filter_exonset(self, mock_logger): + # Test NR_046571.1 is filtered out + lines = [ + "tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n", + "NR_122113.1\tNC_000022.10\tsplign\t-1\t16192905,16193009;16190680,16190791;16189263,16189378;16189031,16189143;16187164,16187302;16186810,16186953;16162396,16162487;16150528,16151821\n", + "NR_133911.1\tNC_000022.10\tsplign\t1\t16157078,16157342;16164481,16164569;16171951,16172265\n", + "NR_046571.1\tNC_000022.10\tsplign\t1\t16274608,16275003;16276480,16277577\n" + ] + with NamedTemporaryFile(delete=False) as temp_exonsets: + with open(temp_exonsets.name, "wt") as f: + for line in lines: + f.write(line) + temp_exonsets.seek(0) + missing_ids_file = NamedTemporaryFile() + + transcript_ids = {"NR_122113.1", "NR_133911.1"} + stdout = io.StringIO() + with contextlib.redirect_stdout(stdout): + filter_exonset(temp_exonsets.name, transcript_ids, missing_ids_file.name) + + # Assert the record for NR_046571.1 is filtered out + self.assertEqual(stdout.getvalue(), ''.join(lines[0:3])) + + # Confirm filtered transcript is present in missing_ids_file + with open(missing_ids_file.name, 'r') as f: + contents = f.read() + self.assertEqual(contents, 'NR_046571.1\n') + + mock_logger.debug.assert_called_with('Exon set transcript NR_046571.1 not found in txinfo file. 
Filtering out.') + mock_logger.info.assert_called_with('Filtered out exon sets for 1 transcript(s)') + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_ncbi_parse_genomic_gff.py b/tests/test_ncbi_parse_genomic_gff.py new file mode 100644 index 0000000..4a0b638 --- /dev/null +++ b/tests/test_ncbi_parse_genomic_gff.py @@ -0,0 +1,247 @@ +import gzip +import os +import subprocess +import unittest +from tempfile import NamedTemporaryFile + +from sbin.ncbi_parse_genomic_gff import ( + get_zero_based_exon_ranges, + GFFRecord, + parse_gff_files, + parse_gff_record, +) + +CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) +BASE_DIR = os.path.dirname(CURRENT_DIR) + + +def sample_line(**params): + defaults = { + "seqid": "NC_000001.10", + "source": "BestRefSeq", + "feature": "exon", + "start": 11874, + "stop": 12227, + "score": ".", + "strand": "1", + "phase": ".", + "attributes_str": "ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=NR_046018.2", + } + defaults.update(params) + return "\t".join(map(str, defaults.values())) + "\n" + + +class TestGFFParsing(unittest.TestCase): + def setUp(self): + with NamedTemporaryFile(delete=False) as temp_gff: + with open(temp_gff.name, "wt") as f: + f.write( + "NC_000001.10\tBestRefSeq\texon\t11874\t12227\t.\t+\t.\tID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.10\tBestRefSeq\texon\t12613\t12721\t.\t+\t.\tID=exon-NR_046018.2-2;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.10\tBestRefSeq\texon\t13221\t14409\t.\t+\t.\tID=exon-NR_046018.2-3;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.11\tBestRefSeq\texon\t15874\t16227\t.\t+\t.\tID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.11\tBestRefSeq\texon\t16613\t16721\t.\t+\t.\tID=exon-NR_046018.2-2;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + f.write( + "NC_000001.11\tBestRefSeq\texon\t17221\t18409\t.\t+\t.\tID=exon-NR_046018.2-3;Parent=rna-NR_046018.2;transcript_id=NR_046018.2\n" + ) + temp_gff.seek(0) + self.temp_gff = temp_gff + self.gff_records = [ + GFFRecord( + seqid="NC_000001.10", + start=11874, + end=12227, + strand="+", + exon_number=1, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.10", + start=12613, + end=12721, + strand="+", + exon_number=2, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.10", + start=13221, + end=14409, + strand="+", + exon_number=3, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.11", + start=15874, + end=16227, + strand="+", + exon_number=1, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.11", + start=16613, + end=16721, + strand="+", + exon_number=2, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + GFFRecord( + seqid="NC_000001.11", + start=17221, + end=18409, + strand="+", + exon_number=3, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ), + ] + + def tearDown(self): + os.remove(self.temp_gff.name) + + def test_parse_gff_record(self): + # Test parsing a single GFF record + line = sample_line() + expected_record = GFFRecord( + seqid="NC_000001.10", + start=11874, + end=12227, + strand="1", + exon_number=1, + parent_id="rna-NR_046018.2", + transcript_id="NR_046018.2", + ) + parsed_record = parse_gff_record(line) + 
self.assertEqual(parsed_record, expected_record) + self.assertEqual(parsed_record.key, f"{expected_record.transcript_id}:{expected_record.seqid}") + + def test_parse_gff_record_skips_non_exon_records(self): + # We exclude non-exon records + line = sample_line(feature="pseudogene") + expected_record = None + parsed_record = parse_gff_record(line) + self.assertEqual(parsed_record, expected_record) + + def test_parse_gff_record_skips_missing_parent_field(self): + # We exclude alignments missing a parent field + line = sample_line( + attributes_str="ID=exon-NR_046018.2-1;transcript_id=NR_046018.2" + ) # Parent missing from attributes + expected_record = None + parsed_record = parse_gff_record(line) + self.assertEqual(parsed_record, expected_record) + + def test_parse_gff_record_skips_missing_transcript_id(self): + # We exclude alignments missing transcript_id + line = sample_line( + attributes_str="ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2" + ) # transcript_id missing from attributes + expected_record = None + parsed_record = parse_gff_record(line) + self.assertEqual(parsed_record, expected_record) + + def test_parse_gff_record_skips_non_NM_NR_transcripts(self): + # We only care about transcripts that start with NM_ or NR_ + line = sample_line( + attributes_str="ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=somethingelse" + ) # transcript_id is not an NM_/NR_ accession + expected_record = None + parsed_record = parse_gff_record(line) + self.assertEqual(parsed_record, expected_record) + + def test_parse_gff_record_unexpected_number_of_fields(self): + # Raise an exception if there are not exactly 9 fields in a non-comment line + line = "NC_000001.10\tID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;transcript_id=NR_046018\n" # only 2 fields + with self.assertRaises(ValueError) as context: + parse_gff_record(line) + + self.assertEqual( + str(context.exception), "Expected 9 tab-separated fields, got 2" + ) + + def test_parse_gff_record_raises_non_int_start_stop(self): + # Raise an exception if either start or stop is not an integer + lines = [sample_line(start="a string"), sample_line(stop="another string")] + for line in lines: + with self.assertRaises(ValueError): + parse_gff_record(line) + + def test_parse_gff_record_raises_unparseable_id(self): + # Raise an exception if we cannot parse the exon number from the ID + line = sample_line( + attributes_str="ID=unexpected_id;Parent=rna-NR_046018.2;transcript_id=NR_046018" + ) + with self.assertRaises(ValueError) as context: + parse_gff_record(line) + + self.assertEqual( + str(context.exception), "Failed to parse exon number from unexpected_id" + ) + + def test_parse_gff_file(self): + # Test parsing the entire uncompressed GFF file + expected_result = { + "NR_046018.2:NC_000001.10": self.gff_records[:3], + "NR_046018.2:NC_000001.11": self.gff_records[3:], + } + parsed_result = parse_gff_files([self.temp_gff.name]) + self.assertEqual(parsed_result, expected_result) + + def test_parse_gff_file_accepts_gzipped_files(self): + # Create a gzipped version of the temp_gff file + with gzip.open(self.temp_gff.name + ".gz", "wb") as f_out: + with open(self.temp_gff.name, "rb") as f_in: + f_out.write(f_in.read()) + + # Test parsing the gzipped GFF file + expected_result = { + "NR_046018.2:NC_000001.10": self.gff_records[:3], + "NR_046018.2:NC_000001.11": self.gff_records[3:], + } + parsed_result = parse_gff_files([self.temp_gff.name + ".gz"]) + self.assertEqual(parsed_result, expected_result) + + def test_get_zero_based_exon_ranges(self): + # Test
converting exon ranges to 0-based half-open format yields expected values + exon_ranges = get_zero_based_exon_ranges(self.gff_records[:3]) + assert exon_ranges == "11873,12227;12612,12721;13220,14409" + + def test_script_output(self): + # Run the script from the command line + input_gff_file = os.path.join(CURRENT_DIR, "data", f"genomic_100.gff.gz") + script_path = os.path.join(BASE_DIR, "sbin", "ncbi_parse_genomic_gff.py") + + command = ["python", script_path, input_gff_file] + completed_process = subprocess.run( + command, check=True, capture_output=True, text=True + ) + stdout_content = completed_process.stdout + expected_file_path = os.path.join( + CURRENT_DIR, "data", "expected_genomic_100.exonset" + ) + with open(expected_file_path, "r") as expected_file: + expected_content = expected_file.read() + + assert ( + stdout_content == expected_content + ), "Output content doesn't match expected." + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_ncbi_process_mito.py b/tests/test_ncbi_process_mito.py new file mode 100644 index 0000000..6e8ef27 --- /dev/null +++ b/tests/test_ncbi_process_mito.py @@ -0,0 +1,279 @@ +import os +import unittest +from unittest.mock import MagicMock, patch + +from Bio.SeqRecord import SeqRecord + +from sbin.ncbi_process_mito import ( + download_mito_files, + get_mito_genes, + parse_db_xrefs, + parse_nomenclature_value, +) + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) + + +class TestNcbiProcessMito(unittest.TestCase): + def verify_mito_gene_attributes(self, mito_gene, expected_values): + for k, v in expected_values.items(): + try: + self.assertEqual(getattr(mito_gene, k), v) + except AssertionError: + print( + f"Test failure on mito gene {mito_gene.gene_symbol} ({mito_gene.gene_id}) " + f'attribute "{k}" with value "{v}" not equal to "{getattr(mito_gene, k)}"' + ) + raise + + @patch("sbin.ncbi_process_mito.download_from_eutils") + def test_download_mito_files(self, mock_download): + output_dir = "test_dir" + accession = "test_accession" + result = download_mito_files(output_dir, accession) + self.assertEqual( + result, + { + "gbff": "test_dir/test_accession.gbff", + "fna": "test_dir/test_accession.fna", + }, + ) + mock_download.assert_any_call(accession, "gb", f"{output_dir}/{accession}.gbff") + mock_download.assert_any_call( + accession, "fasta", f"{output_dir}/{accession}.fna" + ) + + def test_db_xrefs(self): + gb_feature_mock = MagicMock(spec=SeqRecord) + gb_feature_mock.qualifiers = { + "db_xref": ["GeneID:4558", "HGNC:HGNC:7481", "MIM:590070"] + } + + result = parse_db_xrefs(gb_feature_mock) + self.assertEqual( + result, {"GeneID": "4558", "HGNC": "HGNC:7481", "MIM": "590070"} + ) + + def test_db_xrefs_empty(self): + gb_feature_mock = MagicMock(spec=SeqRecord) + gb_feature_mock.qualifiers = {} + + result = parse_db_xrefs(gb_feature_mock) + self.assertEqual(result, {}) + + def test_parse_nomenclature_value(self): + gb_feature_mock = MagicMock(spec=SeqRecord) + gb_feature_mock.qualifiers = { + "nomenclature": [ + "Official Symbol: MT-TF | Name: mitochondrially encoded tRNA phenylalanine | Provided by: HGNC:HGNC:7481" + ] + } + + result = parse_nomenclature_value(gb_feature_mock) + self.assertEqual( + result, + { + "Official Symbol": "MT-TF", + "Name": "mitochondrially encoded tRNA phenylalanine", + "Provided by": "HGNC:HGNC:7481", + }, + ) + + def test_parse_nomenclature_value_empty(self): + gb_feature_mock = MagicMock(spec=SeqRecord) + gb_feature_mock.qualifiers = {} + + result = parse_nomenclature_value(gb_feature_mock) + 
self.assertEqual(result, {}) + + def test_get_mito_genes(self): + mito_genbank_filepath = f"{BASE_DIR}/data/NC_012920.1.gbff" + results = [_ for _ in get_mito_genes(mito_genbank_filepath)] + expected_gene_ids = [ + 4508, + 4509, + 4511, + 4512, + 4513, + 4514, + 4519, + 4535, + 4536, + 4537, + 4538, + 4539, + 4540, + 4541, + 4549, + 4550, + 4553, + 4555, + 4556, + 4558, + 4563, + 4564, + 4565, + 4566, + 4567, + 4568, + 4569, + 4570, + 4571, + 4572, + 4573, + 4574, + 4575, + 4576, + 4577, + 4578, + 4579, + ] + expected_gene_symbols = [ + "MT-ATP6", + "MT-ATP8", + "MT-CO1", + "MT-CO2", + "MT-CO3", + "MT-CYB", + "MT-ND1", + "MT-ND2", + "MT-ND3", + "MT-ND4", + "MT-ND4L", + "MT-ND5", + "MT-ND6", + "MT-RNR1", + "MT-RNR2", + "MT-TA", + "MT-TC", + "MT-TD", + "MT-TE", + "MT-TF", + "MT-TG", + "MT-TH", + "MT-TI", + "MT-TK", + "MT-TL1", + "MT-TL2", + "MT-TM", + "MT-TN", + "MT-TP", + "MT-TQ", + "MT-TR", + "MT-TS1", + "MT-TS2", + "MT-TT", + "MT-TV", + "MT-TW", + "MT-TY", + ] + expected_origin = "NCBI" + expected_aln_method = "splign" + + self.assertEqual(len(results), 37) + self.assertEqual(sorted([r.gene_id for r in results]), expected_gene_ids) + self.assertEqual( + sorted([r.gene_symbol for r in results]), expected_gene_symbols + ) + self.assertEqual([r.origin for r in results], [expected_origin] * 37) + self.assertEqual( + [r.alignment_method for r in results], [expected_aln_method] * 37 + ) + + results_by_gene = {mg.gene_id: mg for mg in results} + + # Expected results for "MT-TV" non-coding tRNA gene on the plus strand + expected_mg4577_values = { + "gene_symbol": "MT-TV", + "name": "mitochondrially encoded tRNA valine", + "tx_ac": "NC_012920.1_01601_01670", + "tx_seq": "CAGAGTGTAGCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCAACTTAACTTGACCGCTCTGA", + "tx_start": 0, + "tx_end": 69, + "alt_ac": "NC_012920.1", + "alt_start": 1601, + "alt_end": 1670, + "strand": 1, + "transl_table": None, + "transl_except": None, + "pro_ac": None, + "pro_seq": None, + } + self.verify_mito_gene_attributes(results_by_gene[4577], expected_mg4577_values) + + # Expected results for "MT-TQ" tRNA gene on the minus strand + expected_mg4572_values = { + "gene_symbol": "MT-TQ", + "name": "mitochondrially encoded tRNA glutamine", + "tx_ac": "NC_012920.1_04328_04400", + "tx_seq": "TAGGATGGGGTGTGATAGGTGGCACGGAGAATTTTGGATTCTCAGGGATGGGTTCGATTCTCATAGTCCTAG", + "tx_start": 0, + "tx_end": 72, + "alt_ac": "NC_012920.1", + "alt_start": 4328, + "alt_end": 4400, + "strand": -1, + "transl_table": None, + "transl_except": None, + "pro_ac": None, + "pro_seq": None, + } + self.verify_mito_gene_attributes(results_by_gene[4572], expected_mg4572_values) + + # Expected results for "MT-CO2" coding gene on the plus strand + expected_mg4513_values = { + "gene_symbol": "MT-CO2", + "name": "mitochondrially encoded cytochrome c oxidase II", + "tx_ac": "NC_012920.1_07585_08269", + "tx_seq": "ATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTT" + "TCCTTATCTGCTTCCTAGTCCTGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAAATAGAAACCGTCTGAACT" + "ATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGG" + "CCACCAATGGTACTGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCC" + "TTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTA" + "AAAACAGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAG" + 
"TTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAG", + "tx_start": 0, + "tx_end": 684, + "alt_ac": "NC_012920.1", + "alt_start": 7585, + "alt_end": 8269, + "strand": 1, + "transl_table": "2", + "transl_except": None, + "pro_ac": "YP_003024029.1", + "pro_seq": "MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFLTLTTKLTNTNISDAQEMETVWTILPAIILVLIALPSLRILYMTDEVNDP" + "SLTIKSIGHQWYWTYEYTDYGGLIFNSYMLPPLFLEPGDLRLLDVDNRVVLPIEAPIRMMITSQDVLHSWAVPTLGLKTDAIPGRLNQTTFTATRPGVYYGQCS" + "EICGANHSFMPIVLELIPLKIFEMGPVFTL", + } + self.verify_mito_gene_attributes(results_by_gene[4513], expected_mg4513_values) + + # Expected results for "MT-ND1" coding gene on the minus strand with a transl_except + expected_mg4535_values = { + "gene_symbol": "MT-ND1", + "name": "mitochondrially encoded NADH dehydrogenase 1", + "tx_ac": "NC_012920.1_03306_04262", + "tx_seq": "ATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAATCGCAATGGCATTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATAC" + "AACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGGCTACTACAACCCTTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCACATCT" + "ACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCGCTCTTCTACTATGAACCCCCCTCCCCATACCCAACCCCCTGGTCAACCTCAACCTAGG" + "CCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACTCAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCGCACTGCGAGCAG" + "TAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATCATTCTACTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCACCCTTATCACAACACAAGAA" + "CACCTCTGATTACTCCTGCCATCATGACCCTTGGCCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTCGACCTTGCCGAAGGGGAGTC" + "CGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATTATTATAATAAACACCCTCACCACTA" + "CAATCTTCCTAGGAACAACATATGACGCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCCTACTTCTAACCTCCCTGTTCTTATGAATTCGA" + "ACAGCATACCCCCGATTCCGCTACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTAGCATTACTTATATGATATGTCTCCATACCCAT" + "TACAATCTCCAGCATTCCCCCTCAAACCTA", + "tx_start": 0, + "tx_end": 956, + "alt_ac": "NC_012920.1", + "alt_start": 3306, + "alt_end": 4262, + "strand": 1, + "transl_table": "2", + "transl_except": ["(pos:4261..4262,aa:TERM)"], + "pro_ac": "YP_003024026.1", + "pro_seq": "MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPN" + "PLVNLNLGLLFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYEVTLAIILLSTLLMSGSFNLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTP" + "FDLAEGESELVSGFNIEYAAGPFALFFMAEYTNIIMMNTLTTTIFLGTTYDALSPELYTTYFVTKTLLLTSLFLWIRTAYPRFRYDQLMHLLWKNFLPLTLALL" + "MWYVSMPITISSIPPQT", + } + self.verify_mito_gene_attributes(results_by_gene[4535], expected_mg4535_values) diff --git a/tests/test_uta_formats_txinfo.py b/tests/test_uta_formats_txinfo.py new file mode 100644 index 0000000..b2e3e09 --- /dev/null +++ b/tests/test_uta_formats_txinfo.py @@ -0,0 +1,10 @@ +import unittest +from uta.formats.txinfo import TxInfo + + +class TestUtaFormats(unittest.TestCase): + + def test_txinfo_serialize_transl_except(self): + self.assertIsNone(TxInfo.serialize_transl_except(None)) + self.assertEqual(TxInfo.serialize_transl_except([]), '') + self.assertEqual(TxInfo.serialize_transl_except(['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)']), '(pos:333..335,aa:Sec);(pos:1017,aa:TERM)') diff --git a/tests/test_uta_loading.py b/tests/test_uta_loading.py new file mode 100644 index 0000000..210d2ab --- /dev/null +++ b/tests/test_uta_loading.py @@ -0,0 +1,445 @@ +import configparser +import signal +import unittest +from unittest.mock import Mock, patch + +import sqlalchemy as sa +import testing.postgresql + +import uta +import uta.loading as ul +import uta.models as usam + + +class 
TestUtaLoading(unittest.TestCase): + + def setUp(self): + # setup test database + self.db = testing.postgresql.Postgresql() + self.session = uta.connect(self.db.url()) + + admin_role = 'uta_admin' + self.session.execute(sa.text(f'create user {admin_role}')) + self.session.execute(sa.text(f'grant all privileges on database test to {admin_role}')) + + self.cf = configparser.ConfigParser() + self.cf.add_section('uta') + self.cf.set('uta', 'admin_role', 'uta_admin') + + ul.create_schema(self.session, {}, self.cf) + ul.grant_permissions(self.session, {}, self.cf) + + def tearDown(self): + self.session.close() + self.db.stop(_signal=signal.SIGKILL) + self.db.cleanup() + + def test_meta_data(self): + """ + Metadata should exist, then be updated when update_meta_data is called. + """ + # the schema_version should match existing values in UTA models + expected_schema_version = usam.schema_version + md_schema_version = self.session.query(usam.Meta).filter(usam.Meta.key == 'schema_version').one() + self.assertEqual(md_schema_version.value, expected_schema_version) + + new_schema_version = '9.9' + with patch('uta.models.schema_version', new_schema_version): + ul.update_meta_data(self.session, {}, self.cf) + + md_schema_version = self.session.query(usam.Meta).filter(usam.Meta.key == 'schema_version').one() + self.assertEqual(md_schema_version.value, new_schema_version) + + md_updated_at = self.session.query(usam.Meta).filter(usam.Meta.key == 'updated on').one_or_none() + self.assertIsNotNone(md_updated_at) + + def test_load_assoc_ac(self): + """ + Loading file tests/data/assocacs.gz should create associated_accessions records in the database. + A row will be created in associated_accessions even when the transcript or origin does not exist in the database. + This is only the case until tx_ac and origin are converted to foreign keys.
+ """ + + # insert origins referenced in data file + o1 = usam.Origin( + name='NCBI', + url='http://bogus.com/ncbi', + url_ac_fmt='http://bogus.com/ncbi/{ac}', + ) + self.session.add(o1) + + # insert genes required for transcripts + g1 = usam.Gene( + gene_id='49', + hgnc='ACR', + symbol='ACR', + maploc='22q13.33', + descr='acrosin', + summary='acrosin', + aliases='SPGF87', + type='protein-coding', + xrefs='MIM:102480,HGNC:HGNC:126,Ensembl:ENSG00000100312,AllianceGenome:HGNC:126', + ) + g2 = usam.Gene( + gene_id=50, + hgnc='ACO2', + symbol='ACO2', + maploc='22q13.2', + descr='aconitase 2', + summary='aconitase 2', + aliases='ACONM,HEL-S-284,ICRD,OCA8,OPA9', + type='protein-coding', + xrefs='MIM:100850,HGNC:HGNC:118,Ensembl:ENSG00000100412,AllianceGenome:HGNC:118', + ) + self.session.add(g1) + self.session.add(g2) + + # insert transcripts referenced in data file + t1 = usam.Transcript( + ac='NM_001097.3', + origin=o1, + gene_id=g1.gene_id, + cds_start_i=0, + cds_end_i=1, + cds_md5='a', + ) + t2 = usam.Transcript( + ac='NM_001098.3', + origin=o1, + gene_id=g2.gene_id, + cds_start_i=2, + cds_end_i=3, + cds_md5='b', + ) + self.session.add(t1) + self.session.add(t2) + + # pre-add one of the associated_acessions from the test data file + # to demonstrate get-or-insert behavior + p = usam.AssociatedAccessions( + tx_ac='NM_001097.3', + pro_ac='NP_001088.2', + origin='NCBI', + ) + self.session.add(p) + + self.session.commit() + + cf = configparser.ConfigParser() + cf.add_section('uta') + cf.set('uta', 'admin_role', 'uta_admin') + + ul.load_assoc_ac(self.session, {'FILE': 'tests/data/assocacs.gz'}, cf) + + # associated_accessions table should contain one record per line in file + aa = self.session.query(usam.AssociatedAccessions).order_by(usam.AssociatedAccessions.tx_ac).all() + aa_list = [{'tx_ac': aa.tx_ac, 'pro_ac': aa.pro_ac, 'origin_name': aa.origin} for aa in aa] + expected_aa_list = [ + { + 'tx_ac': 'DummyTx', + 'pro_ac': 'DummyProtein', + 'origin_name': 'DummyOrigin', + }, + { + 'tx_ac': 'NM_001097.3', + 'pro_ac': 'NP_001088.2', + 'origin_name': 'NCBI', + }, + { + 'tx_ac': 'NM_001098.3', + 'pro_ac': 'NP_001089.1', + 'origin_name': 'NCBI', + }, + ] + self.assertEqual(aa_list, expected_aa_list) + + def test_load_txinfo(self): + """ + Loading file tests/data/txinfo.gz should create transcript, exon_set, exon, and translation_exception records in the database. 
+ """ + + # insert origins referenced in data file + o1 = usam.Origin( + name='NCBI', + url='http://bogus.com/ncbi', + url_ac_fmt='http://bogus.com/ncbi/{ac}', + ) + self.session.add(o1) + + # insert genes required for transcripts + g1 = usam.Gene( + gene_id='140606', + hgnc='SELENOM', + symbol='SELENOM', + maploc='22q12.2', + descr='selenoprotein M', + summary='selenoprotein M', + aliases='SELM,SEPM', + type='protein-coding', + xrefs='MIM:610918,HGNC:HGNC:30397,Ensembl:ENSG00000198832,AllianceGenome:HGNC:30397', + ) + g2 = usam.Gene( + gene_id='4514', + hgnc='MT-CO3', + symbol='MT-CO3', + maploc=None, + descr='mitochondrially encoded cytochrome c oxidase III', + summary='mitochondrially encoded cytochrome c oxidase III', + aliases='COIII,MTCO3', + type='protein-coding', + xrefs='GeneID:4514,HGNC:HGNC:7422,MIM:516050', + ) + self.session.add(g1) + self.session.add(g2) + self.session.commit() + + cf = configparser.ConfigParser() + cf.add_section('uta') + cf.set('uta', 'admin_role', 'uta_admin') + + with patch('uta.loading._get_seqfetcher', return_value=Mock(fetch=Mock(return_value='FAKESEQUENCE'))): + ul.load_txinfo(self.session, {'FILE': 'tests/data/txinfo.gz'}, cf) + + transcript = self.session.query(usam.Transcript).filter(usam.Transcript.ac == 'NM_080430.4').one() + self.assertEqual( + { + 'ac': transcript.ac, + 'gene_id': transcript.gene_id, + 'cds_start_i': transcript.cds_start_i, + 'cds_end_i': transcript.cds_end_i, + 'codon_table': transcript.codon_table, + }, + { + 'ac': 'NM_080430.4', + 'gene_id': '140606', + 'cds_start_i': 63, + 'cds_end_i': 501, + 'codon_table': '1', + }, + ) + + transcript = self.session.query(usam.Transcript).filter(usam.Transcript.ac == 'NC_012920.1_09206_09990').one() + self.assertEqual( + { + 'ac': transcript.ac, + 'gene_id': transcript.gene_id, + 'cds_start_i': transcript.cds_start_i, + 'cds_end_i': transcript.cds_end_i, + 'codon_table': transcript.codon_table, + }, + { + 'ac': 'NC_012920.1_09206_09990', + 'gene_id': '4514', + 'cds_start_i': 0, + 'cds_end_i': 784, + 'codon_table': '2', + }, + ) + + exon_set = self.session.query(usam.ExonSet).filter(usam.ExonSet.tx_ac == 'NM_080430.4').one() + exons = self.session.query(usam.Exon).filter(usam.Exon.exon_set_id == exon_set.exon_set_id).all() + self.assertEqual(len(exons), 5) + + translation_exception = self.session.query(usam.TranslationException).filter(usam.TranslationException.tx_ac == 'NM_080430.4').one() + self.assertEqual( + { + 'tx_ac': translation_exception.tx_ac, + 'start_position': translation_exception.start_position, + 'end_position': translation_exception.end_position, + 'amino_acid': translation_exception.amino_acid, + }, + { + 'tx_ac': 'NM_080430.4', + 'start_position': 204, + 'end_position': 207, + 'amino_acid': 'Sec', + }, + ) + + def test_load_exonset_with_exon_structure_mismatch(self): + """ + Loading the test file tests/data/exonsets-mm-exons.gz should not raise an exception, exon alignments without + a mismatch should load, those with a mismatch should be skipped and logged as such. The input file has + alignments for 4 transcripts against NC_000001.11, but only 2 of them have the correct number of exons. + We only expect the alignmets for NM_000911.4 and NM_001005277.1 to be loaded. 
+ """ + # setup + # insert origins referenced in data file + o1 = usam.Origin( + name="NCBI", + url="http://bogus.com/ncbi", + url_ac_fmt="http://bogus.com/ncbi/{ac}", + ) + self.session.add(o1) + self.session.flush() + + for gene_data in [ + { + "gene_id": "3352", + "hgnc": "HTR1D", + "symbol": "HTR1D", + "type": "protein-coding", + }, + { + "gene_id": "4985", + "hgnc": "OPRD1", + "symbol": "OPRD1", + "type": "protein-coding", + }, + { + "gene_id": "81399", + "hgnc": "OR4F16", + "symbol": "OR4F16", + "type": "protein-coding", + }, + { + "gene_id": "79501", + "hgnc": "OR4F5", + "symbol": "OR4F5", + "type": "protein-coding", + }, + ]: + gene = usam.Gene(**gene_data) + self.session.add(gene) + + for tx_data in [ + { + "ac": "NM_000864.5", + "origin_id": o1.origin_id, + "gene_id": "3352", + "cds_start_i": 994, + "cds_end_i": 2128, + "cds_md5": "a", + }, + { + "ac": "NM_000911.4", + "origin_id": o1.origin_id, + "gene_id": "4985", + "cds_start_i": 214, + "cds_end_i": 1333, + "cds_md5": "b", + }, + { + "ac": "NM_001005277.1", + "origin_id": o1.origin_id, + "gene_id": "81399", + "cds_start_i": 0, + "cds_end_i": 939, + "cds_md5": "c", + }, + { + "ac": "NM_001005484.2", + "origin_id": o1.origin_id, + "gene_id": "79501", + "cds_start_i": 60, + "cds_end_i": 1041, + "cds_md5": "d", + }, + ]: + tx = usam.Transcript(**tx_data) + self.session.add(tx) + es = usam.ExonSet( + tx_ac=tx.ac, + alt_ac=tx.ac, + alt_strand=1, + alt_aln_method="transcript", + ) + self.session.add(es) + self.session.flush() + + for exon_data in [ + ("NM_000864.5", 1, 0, 3319), # exons for NM_000864.5 are 0,212;212,3319 + ("NM_000911.4", 1, 0, 441), + ("NM_000911.4", 2, 441, 791), + ("NM_000911.4", 3, 791, 9317), + ("NM_001005277.1", 1, 0, 939), + ("NM_001005484.2", 1, 0, 15), + ("NM_001005484.2", 2, 15, 69), + ( + "NM_001005484.2", + 3, + 69, + 1041, + ), # exons for NM_001005484.2 are 0,15;15,69;69,2618 + ("NM_001005484.2", 4, 1041, 2618), + ]: + es = ( + self.session.query(usam.ExonSet) + .filter( + usam.ExonSet.tx_ac == exon_data[0], usam.ExonSet.alt_ac == exon_data[0] + ) + .one() + ) + exon = usam.Exon( + exon_set_id=es.exon_set_id, + start_i=exon_data[2], + end_i=exon_data[3], + ord=exon_data[1], + ) + self.session.add(exon) + self.session.commit() + + cf = configparser.ConfigParser() + cf.add_section("uta") + cf.set("uta", "admin_role", "uta_admin") + + # load data from test exonsets file. 
+ with patch( + "uta.loading._get_seqfetcher", + return_value=Mock(fetch=Mock(return_value="FAKESEQUENCE")), + ), patch("uta.loading.logger") as mock_logger: + ul.load_exonset(self.session, {"FILE": "tests/data/exonsets.mm-exons.gz"}, cf) + + mock_logger.warning.assert_any_call( + "Exon structure mismatch: 4 exons in transcript NM_001005484.2; 3 in alignment NC_000001.11" + ) + mock_logger.warning.assert_any_call( + "Exon structure mismatch: 1 exons in transcript NM_000864.5; 2 in alignment NC_000001.11" + ) + + # check that the exons for NM_000864.5 and NM_001005484.2 were not loaded, + # and NM_000911.4 and NM_001005277.1 were loaded + for tx_ac, expected_exon_count in [("NM_000911.4", 3), ("NM_001005277.1", 1)]: + exon_set = ( + self.session.query(usam.ExonSet) + .filter( + usam.ExonSet.tx_ac == tx_ac, + usam.ExonSet.alt_ac == "NC_000001.11", + usam.ExonSet.alt_aln_method == "splign", + ) + .one() + ) + exons = ( + self.session.query(usam.Exon) + .filter(usam.Exon.exon_set_id == exon_set.exon_set_id) + .all() + ) + self.assertEqual(len(exons), expected_exon_count) + + for tx_ac in ["NM_000864.5", "NM_001005484.2"]: + with self.assertRaises(sa.orm.exc.NoResultFound): + self.session.query(usam.ExonSet).filter( + usam.ExonSet.tx_ac == tx_ac, + usam.ExonSet.alt_ac == "NC_000001.11", + usam.ExonSet.alt_aln_method == "splign", + ).one() + + +class TestUtaLoadingFunctions(unittest.TestCase): + def test__create_translation_exceptions(self): + transl_except_list = ['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)'] + translation_exceptions = ul._create_translation_exceptions(transcript='dummy_tx', transl_except_list=transl_except_list) + self.assertEqual(translation_exceptions, [ + { + 'tx_ac': 'dummy_tx', + 'start_position': 332, + 'end_position': 335, + 'amino_acid': 'Sec', + }, + { + 'tx_ac': 'dummy_tx', + 'start_position': 1016, + 'end_position': 1017, + 'amino_acid': 'TERM', + }, + ]) diff --git a/tests/test_uta_models.py b/tests/test_uta_models.py index 86eafea..d619dfc 100644 --- a/tests/test_uta_models.py +++ b/tests/test_uta_models.py @@ -1,7 +1,8 @@ -import os +import datetime import unittest import sqlalchemy +from sqlalchemy import text import testing.postgresql import uta @@ -16,6 +17,11 @@ 'g_strand': -1, 'g_starts_i': [26721603, 26627221], 'g_ends_i': [26722922, 26628183], 'g_cds_start_i': 26627665, 'g_cds_end_i': 26722486, + 'pro_ac': 'NP_000671.2', + 'translation_exceptions': [ + {'start_position': 333, 'end_position': 335, 'amino_acid': 'Sec'}, + {'start_position': 589, 'end_position': 589, 'amino_acid': 'TERM'}, + ], }, 'NM_033302.2': { 'seq':
'gaattccgaatcatgtgcagaatgctgaatcttcccccagccaggacgaataagacagcgcggaaaagcagattctcgtaattctggaattgcatgttgcaaggagtctcctggatcttcgcacccagcttcgggtagggagggagtccgggtcccgggctaggccagcccggcaggtggagagggtccccggcagccccgcgcgcccctggccatgtctttaatgccctgccccttcatgtggccttctgagggttcccagggctggccagggttgtttcccacccgcgcgcgcgctctcacccccagccaaacccacctggcagggctccctccagccgagaccttttgattcccggctcccgcgctcccgcctccgcgccagcccgggaggtggccctggacagccggacctcgcccggccccggctgggaccatggtgtttctctcgggaaatgcttccgacagctccaactgcacccaaccgccggcaccggtgaacatttccaaggccattctgctcggggtgatcttggggggcctcattcttttcggggtgctgggtaacatcctagtgatcctctccgtagcctgtcaccgacacctgcactcagtcacgcactactacatcgtcaacctggcggtggccgacctcctgctcacctccacggtgctgcccttctccgccatcttcgaggtcctaggctactgggccttcggcagggtcttctgcaacatctgggcggcagtggatgtgctgtgctgcaccgcgtccatcatgggcctctgcatcatctccatcgaccgctacatcggcgtgagctacccgctgcgctacccaaccatcgtcacccagaggaggggtctcatggctctgctctgcgtctgggcactctccctggtcatatccattggacccctgttcggctggaggcagccggcccccgaggacgagaccatctgccagatcaacgaggagccgggctacgtgctcttctcagcgctgggctccttctacctgcctctggccatcatcctggtcatgtactgccgcgtctacgtggtggccaagagggagagccggggcctcaagtctggcctcaagaccgacaagtcggactcggagcaagtgacgctccgcatccatcggaaaaacgccccggcaggaggcagcgggatggccagcgccaagaccaagacgcacttctcagtgaggctcctcaagttctcccgggagaagaaagcggccaaaacgctgggcatcgtggtcggctgcttcgtcctctgctggctgccttttttcttagtcatgcccattgggtctttcttccctgatttcaagccctctgaaacagtttttaaaatagtattttggctcggatatctaaacagctgcatcaaccccatcatatacccatgctccagccaagagttcaaaaaggcctttcagaatgtcttgagaatccagtgtctctgcagaaagcagtcttccaaacatgccctgggctacaccctgcacccgcccagccaggccgtggaagggcaacacaaggacatggtgcgcatccccgtgggatcaagagagaccttctacaggatctccaagacggatggcgtttgtgaatggaaatttttctcttccatgccccgtggatctgccaggattacagtgtccaaagaccaatcctcctgtaccacagcccggggacacacacccatgacatgaagccagcttcccgtccacgactgttgtccttactgcccaaggaaggggagcatgaaacccaccactggtcctgcgacccactgtctttggaatccaccccaggagcccaggagccttgcctgacacttggatttacttctttatcaagcatccatctgactaaggcacaaatccaacatgttactgttactgatacaggaaaaacagtaacttaaggaatgatcatgaatgcaaagggaaagaggaaaagagccttcagggacaaatagctcgattttttgtaaatcagtttcatacaacctccctcccccatttcattcttaaaagttaattgagaatcatcagccacgtgtagggtgtgag', @@ -24,6 +30,7 @@ 'g_strand': -1, 'g_starts_i': [26721603, 26627797, 26613912], 'g_ends_i': [26722922, 26628183, 26614296], 'g_cds_start_i': 26614275, 'g_cds_end_i': 26722486, + 'pro_ac': 'NP_150645.2', }, 'NM_033303.3': { 'seq': 
'gaattccgaatcatgtgcagaatgctgaatcttcccccagccaggacgaataagacagcgcggaaaagcagattctcgtaattctggaattgcatgttgcaaggagtctcctggatcttcgcacccagcttcgggtagggagggagtccgggtcccgggctaggccagcccggcaggtggagagggtccccggcagccccgcgcgcccctggccatgtctttaatgccctgccccttcatgtggccttctgagggttcccagggctggccagggttgtttcccacccgcgcgcgcgctctcacccccagccaaacccacctggcagggctccctccagccgagaccttttgattcccggctcccgcgctcccgcctccgcgccagcccgggaggtggccctggacagccggacctcgcccggccccggctgggaccatggtgtttctctcgggaaatgcttccgacagctccaactgcacccaaccgccggcaccggtgaacatttccaaggccattctgctcggggtgatcttggggggcctcattcttttcggggtgctgggtaacatcctagtgatcctctccgtagcctgtcaccgacacctgcactcagtcacgcactactacatcgtcaacctggcggtggccgacctcctgctcacctccacggtgctgcccttctccgccatcttcgaggtcctaggctactgggccttcggcagggtcttctgcaacatctgggcggcagtggatgtgctgtgctgcaccgcgtccatcatgggcctctgcatcatctccatcgaccgctacatcggcgtgagctacccgctgcgctacccaaccatcgtcacccagaggaggggtctcatggctctgctctgcgtctgggcactctccctggtcatatccattggacccctgttcggctggaggcagccggcccccgaggacgagaccatctgccagatcaacgaggagccgggctacgtgctcttctcagcgctgggctccttctacctgcctctggccatcatcctggtcatgtactgccgcgtctacgtggtggccaagagggagagccggggcctcaagtctggcctcaagaccgacaagtcggactcggagcaagtgacgctccgcatccatcggaaaaacgccccggcaggaggcagcgggatggccagcgccaagaccaagacgcacttctcagtgaggctcctcaagttctcccgggagaagaaagcggccaaaacgctgggcatcgtggtcggctgcttcgtcctctgctggctgccttttttcttagtcatgcccattgggtctttcttccctgatttcaagccctctgaaacagtttttaaaatagtattttggctcggatatctaaacagctgcatcaaccccatcatatacccatgctccagccaagagttcaaaaaggcctttcagaatgtcttgagaatccagtgtctctgcagaaagcagtcttccaaacatgccctgggctacaccctgcacccgcccagccaggccgtggaagggcaacacaaggacatggtgcgcatccccgtgggatcaagagagaccttctacaggatctccaagacggatggcgtttgtgaatggaaatttttctcttccatgccccgtggatctgccaggattacagtgtccaaagaccaatcctcctgtaccacagcccggacgaagtctcgctctgtcaccaggctggagtgcagtggcatgatcttggctcactgcaacctccgcctcccgggttcaagagattctcctgcctcagcctcccaagcagctgggactacagggatgtgccaccaggccgacgccaccaggcccagctaatttttgtatttttagtagagacggggtttcaccatgttggccaggatgatctcgatctcttgacctcatgatctgcctgcctcagcctcccaaagtgctgggattacaggcgtgagccaccgtgcccggcccaactattttttttttttatcttttttaacagtgcaatcctttctgtggatgaaatcttgctcagaagctcaatatgcaaaagaaagaaaaacagcagggctggacggatgttgggagtggggtaagaccccaaccactcagaaccacccccccaacacacacacacattctctccatggtgactggtgaggggcctctagagggtacatagtacaccatggagcacggtttaagcaccactggactacacattcttctgtggcagttatcttaccttcccatagacacccagcccatagccattggtt', @@ -32,6 +39,7 @@ 'g_strand': -1, 'g_starts_i': [26721603, 26627797, 26605666], 'g_ends_i': [26722922, 26628183, 26606265], 'g_cds_start_i': 26606106, 'g_cds_end_i': 26722486, + 'pro_ac': 'NP_150646.3', }, 'NM_033304.2': { 'seq': 
'gaattccgaatcatgtgcagaatgctgaatcttcccccagccaggacgaataagacagcgcggaaaagcagattctcgtaattctggaattgcatgttgcaaggagtctcctggatcttcgcacccagcttcgggtagggagggagtccgggtcccgggctaggccagcccggcaggtggagagggtccccggcagccccgcgcgcccctggccatgtctttaatgccctgccccttcatgtggccttctgagggttcccagggctggccagggttgtttcccacccgcgcgcgcgctctcacccccagccaaacccacctggcagggctccctccagccgagaccttttgattcccggctcccgcgctcccgcctccgcgccagcccgggaggtggccctggacagccggacctcgcccggccccggctgggaccatggtgtttctctcgggaaatgcttccgacagctccaactgcacccaaccgccggcaccggtgaacatttccaaggccattctgctcggggtgatcttggggggcctcattcttttcggggtgctgggtaacatcctagtgatcctctccgtagcctgtcaccgacacctgcactcagtcacgcactactacatcgtcaacctggcggtggccgacctcctgctcacctccacggtgctgcccttctccgccatcttcgaggtcctaggctactgggccttcggcagggtcttctgcaacatctgggcggcagtggatgtgctgtgctgcaccgcgtccatcatgggcctctgcatcatctccatcgaccgctacatcggcgtgagctacccgctgcgctacccaaccatcgtcacccagaggaggggtctcatggctctgctctgcgtctgggcactctccctggtcatatccattggacccctgttcggctggaggcagccggcccccgaggacgagaccatctgccagatcaacgaggagccgggctacgtgctcttctcagcgctgggctccttctacctgcctctggccatcatcctggtcatgtactgccgcgtctacgtggtggccaagagggagagccggggcctcaagtctggcctcaagaccgacaagtcggactcggagcaagtgacgctccgcatccatcggaaaaacgccccggcaggaggcagcgggatggccagcgccaagaccaagacgcacttctcagtgaggctcctcaagttctcccgggagaagaaagcggccaaaacgctgggcatcgtggtcggctgcttcgtcctctgctggctgccttttttcttagtcatgcccattgggtctttcttccctgatttcaagccctctgaaacagtttttaaaatagtattttggctcggatatctaaacagctgcatcaaccccatcatatacccatgctccagccaagagttcaaaaaggcctttcagaatgtcttgagaatccagtgtctctgcagaaagcagtcttccaaacatgccctgggctacaccctgcacccgcccagccaggccgtggaagggcaacacaaggacatggtgcgcatccccgtgggatcaagagagaccttctacaggatctccaagacggatggcgtttgtgaatggaaatttttctcttccatgccccgtggatctgccaggattacagtgtccaaagaccaatcctcctgtaccacagcccggaggggaatggattgtagatatttcaccaagaattgcagagagcatatcaagcatgtgaattttatgatgccaccgtggagaaagggttcagaatgctgatctccaggtagctggagacctaggcagtctgcaaatgaggagtcagctggaagctatggctatgtattatgtgacatcgcttgttcctaagtgaaaactggatatcccaaccttctggcccagtaggtttcatggttaagacctggtagtgagaacattttaggaactatttgcttgggcaggcaatttttcactct', @@ -40,19 +48,22 @@ 'g_strand': -1, 'g_starts_i': [26721603, 26627797, 26623370], 'g_ends_i': [26722922, 26628183, 26623666], 'g_cds_start_i': 26623567, 'g_cds_end_i': 26722486, + 'pro_ac': 'NP_150647.2', }, } -class Test_uta_models(unittest.TestCase): +class TestUtaModels(unittest.TestCase): @classmethod def setUpClass(cls): cls._postgresql = testing.postgresql.Postgresql() engine = sqlalchemy.create_engine(cls._postgresql.url()) - engine.execute('drop schema if exists {schema} cascade'.format(schema=usam.schema_name)) - engine.execute('create schema {schema}'.format(schema=usam.schema_name)) + with engine.connect() as connection: + connection.execute(text('drop schema if exists {schema} cascade'.format(schema=usam.schema_name))) + connection.execute(text('create schema {schema}'.format(schema=usam.schema_name))) + connection.commit() engine.dispose() cls.session = uta.connect(cls._postgresql.url()) @@ -62,14 +73,17 @@ def setUpClass(cls): # http://www.ncbi.nlm.nih.gov/nuccore/NM_033304.2 o = usam.Origin( - name='Testing (originally NCBI, via Eutils)', + name='NCBI', + descr='Testing (originally NCBI, via Eutils)', url='http://bogus.com/', url_ac_fmt='http://bogus.com/{ac}', ) cls.session.add(o) g = usam.Gene( + gene_id='148', hgnc='ADRA1A', + symbol='ADRA1A', maploc='8p21.2', descr='adrenoceptor alpha 1A', summary='''Alpha-1-adrenergic receptors (alpha-1-ARs) are @@ -113,13 +127,25 @@ def setUpClass(cls): t = usam.Transcript( ac=ac, origin=o, - hgnc=g.hgnc, + gene_id=g.gene_id, cds_start_i=tx_info['t_cds_start_i'], 
cds_end_i=tx_info['t_cds_end_i'], cds_md5='d41d8cd98f00b204e9800998ecf8427e', ) cls.session.add(t) + if 'translation_exceptions' in tx_info: + for te in tx_info['translation_exceptions']: + te = usam.TranslationException(tx_ac=ac, **te) + cls.session.add(te) + + p = usam.AssociatedAccessions( + tx_ac=ac, + pro_ac=tx_info['pro_ac'], + origin=o.name, + ) + cls.session.add(p) + # ExonSet and Exons on Transcript seq t_es = usam.ExonSet( tx_ac=ac, @@ -161,7 +187,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - # sqlalchemy is keeping connections open and I can't figure out where + cls.session.close() # kill the database (we started it) import signal cls._postgresql.stop(_signal=signal.SIGKILL) @@ -172,9 +198,9 @@ def test_origin(self): self.assertEqual(len(all_origins), 1) o = all_origins[0] - self.assertRegexpMatches(o.name, 'Testing') - self.assertEquals(o.url, 'http://bogus.com/') - self.assertEquals(o.url_ac_fmt, 'http://bogus.com/{ac}') + self.assertEqual(o.name, 'NCBI') + self.assertEqual(o.url, 'http://bogus.com/') + self.assertEqual(o.url_ac_fmt, 'http://bogus.com/{ac}') # NM_000680.2, NM_033302.2, NM_033303.3, NM_033304.2 self.assertEqual(len(o.transcripts), 4) @@ -201,23 +227,23 @@ def test_dnaseq(self): n = self.session.query(usam.SeqAnno).filter( usam.SeqAnno.ac == 'NC_000008.10').one() - self.assertEquals(n.ac, u'NC_000008.10') + self.assertEqual(n.ac, u'NC_000008.10') # self.assertTrue(len(n.exon_sets),2) - self.assertRegexpMatches(n.origin.name, '^Testing') - #self.assertEquals(len(n.transcripts), 0) + self.assertEqual(n.origin.name, 'NCBI') + #self.assertEqual(len(n.transcripts), 0) n = self.session.query(usam.SeqAnno).filter( usam.SeqAnno.ac == 'NM_000680.2').one() - self.assertEquals(n.ac, u'NM_000680.2') + self.assertEqual(n.ac, u'NM_000680.2') # self.assertTrue(len(n.exon_sets),1) - self.assertRegexpMatches(n.origin.name, '^Testing') + self.assertEqual(n.origin.name, 'NCBI') n = self.session.query(usam.Seq).join(usam.Seq.aliases).filter( usam.SeqAnno.ac == 'NM_000680.2').one() - self.assertEquals(len(n.seq), 2281) + self.assertEqual(len(n.seq), 2281) self.assertTrue(n.seq.startswith('gaattccgaa')) self.assertTrue(n.seq.endswith('gacatttatg')) - #self.assertEquals(len(n.transcripts), 1) + #self.assertEqual(len(n.transcripts), 1) def test_exon_set(self): all_exon_sets = self.session.query(usam.Seq).all() @@ -228,12 +254,12 @@ def test_exon_set(self): # http://www.ncbi.nlm.nih.gov/nuccore/NM_000680.2 ## es = [ es for es in exon_sets if es.is_primary ][0] - ## self.assertEquals( (es.cds_start_i,es.cds_end_i), (436, 1837) ) - ## self.assertEquals( len(es.exons), 2 ) - ## self.assertEquals( es.is_primary, True ) - ## self.assertEquals( es.ref_dnaseq.ac, 'NM_000680.2' ) - ## self.assertEquals( es.strand, 1 ) - ## self.assertEquals( es.transcript.ac, 'NM_000680.2' ) + ## self.assertEqual( (es.cds_start_i,es.cds_end_i), (436, 1837) ) + ## self.assertEqual( len(es.exons), 2 ) + ## self.assertEqual( es.is_primary, True ) + ## self.assertEqual( es.ref_dnaseq.ac, 'NM_000680.2' ) + ## self.assertEqual( es.strand, 1 ) + ## self.assertEqual( es.transcript.ac, 'NM_000680.2' ) # seq_gene.md.gz: # 9606 8 26627222 26627665 - NT_167187.1 14485368 14485811 - NM_000680.2 GeneID:148 UTR GRCh37.p10-Primary Assembly NM_000680.2 - @@ -241,12 +267,12 @@ def test_exon_set(self): # 9606 8 26721604 26722486 - NT_167187.1 14579750 14580632 - NP_000671.2 GeneID:148 CDS GRCh37.p10-Primary Assembly NM_000680.2 - # 9606 8 26722487 26722922 - NT_167187.1 14580633 14581068 - NM_000680.2 
GeneID:148 UTR GRCh37.p10-Primary Assembly NM_000680.2 - ## es = [ es for es in exon_sets if not es.is_primary ][0] - ## self.assertEquals( (es.cds_start_i,es.cds_end_i), (26627665, 26722486) ) - ## self.assertEquals( len(es.exons), 2 ) - ## self.assertEquals( es.is_primary, False ) - ## self.assertEquals( es.ref_dnaseq.ac, 'NC_000008.10' ) - ## self.assertEquals( es.strand, -1 ) - ## self.assertEquals( es.transcript.ac, 'NM_000680.2' ) + ## self.assertEqual( (es.cds_start_i,es.cds_end_i), (26627665, 26722486) ) + ## self.assertEqual( len(es.exons), 2 ) + ## self.assertEqual( es.is_primary, False ) + ## self.assertEqual( es.ref_dnaseq.ac, 'NC_000008.10' ) + ## self.assertEqual( es.strand, -1 ) + ## self.assertEqual( es.transcript.ac, 'NM_000680.2' ) def test_exon(self): t = self.session.query(usam.Transcript).filter( @@ -255,6 +281,63 @@ def test_exon(self): #self.assertEqual( (es.exons[0].start_i,es.exons[0].end_i) , (0,1319) ) #self.assertEqual( (es.exons[1].start_i,es.exons[1].end_i) , (1319,2281) ) + def test_associated_accessions(self): + all_aa = self.session.query(usam.AssociatedAccessions).all() + self.assertEqual(len(all_aa), 4) + # check values in one row: + aa = self.session.query(usam.AssociatedAccessions).filter_by(tx_ac='NM_000680.2').one() + self.assertIsInstance(aa.associated_accession_id, int) + self.assertIsInstance(aa.added, datetime.datetime) + self.assertEqual(aa.tx_ac, 'NM_000680.2') + self.assertEqual(aa.pro_ac, 'NP_000671.2') + self.assertEqual(aa.origin, 'NCBI') + + def test_associated_accessions_transcript_not_in_database(self): + """ + Should create row in associated_accessions even for transcripts not in database. + This is only the case until associated_accessions.tx_ac is converted to a transcript foreign key. + """ + p = usam.AssociatedAccessions( + tx_ac='dummy_transcript', + pro_ac='dummy_protein', + origin='dummy_origin', + ) + self.session.add(p) + self.session.commit() + aa = self.session.query(usam.AssociatedAccessions).filter_by(tx_ac='dummy_transcript').one() + self.assertEqual(aa.tx_ac, 'dummy_transcript') + self.assertEqual(aa.pro_ac, 'dummy_protein') + self.assertEqual(aa.origin, 'dummy_origin') + + def test_translation_exception(self): + """ + Should create rows in translation_exception table. + """ + translation_exceptions = self.session.query(usam.TranslationException).filter_by(tx_ac='NM_000680.2').all() + self.assertEqual(len(translation_exceptions), 2) + + def test_translation_exception_start_not_greater_than_end(self): + """ + Should not create row in translation_exception table if start is greater than end. 
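+ The commit is expected to fail with an IntegrityError, after which the session is rolled back.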
+ """ + te = usam.TranslationException( + tx_ac='NM_033302.2', + start_position=100, + end_position=99, + amino_acid='dummy_aa', + ) + self.session.add(te) + + with self.assertRaises(sqlalchemy.exc.IntegrityError): + self.session.commit() + + # allow session to be used after failure + self.session.rollback() + + # translation exception should not exist because transaction failed + translation_exceptions = self.session.query(usam.TranslationException).filter_by(tx_ac='NM_033302.2').all() + self.assertEqual(translation_exceptions, []) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_uta_parsers_seqrecord.py b/tests/test_uta_parsers_seqrecord.py new file mode 100644 index 0000000..469e1c5 --- /dev/null +++ b/tests/test_uta_parsers_seqrecord.py @@ -0,0 +1,202 @@ +import os +import unittest +from unittest.mock import Mock, PropertyMock, patch + +from Bio import SeqIO +from parameterized import param, parameterized + +from uta.parsers.seqrecord import SeqRecordFacade, SeqRecordFeatureError + + +class TestSeqRecordFacade(unittest.TestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), 'data') + + @parameterized.expand([ + param( + 'NM_001396027 - single exon feature', + file_name='rna.NM_001396027.gbff', + expected_id='NM_001396027.1', + expected_gene_symbol='FAM246C', + expected_gene_synonyms=[], + expected_gene_type="protein-coding", + expected_gene_id='117134596', + expected_db_xrefs=["GeneID:117134596", "HGNC:HGNC:54842"], + expected_cds_se_i=(0, 696), + expected_cds_product="protein FAM246C", + expected_cds_protein_id="NP_001382956.1", + expected_cds_translation="MAESGRPWAQARSAYRASEVLRRGTGRRRDPGPQSNGPGQEDARAPGRMARLRGQLRAEAASRSEVPRLLKLVERAGAG" \ + "AAGAGERTGAHSRGSVCSVCGEPRGGATYPAGVLEVSERRLQEGLAAVREELGAGIEALRAELRAELDALRALLPPPPSPPARREPRAVPRAAPRGPTLP" \ + "RTLGTVSALVAASRPADDAPDGPAECGAHRAPARKNHKKMPVPPGAPQGGGD", + expected_codon_table="1", + expected_exons_se_i=[(0, 696)], + ), + param( + 'NM_001396027 - multiple exon features', + file_name='rna.NM_001996.gbff', + expected_id='NM_001996.4', + expected_gene_symbol='FBLN1', + expected_gene_synonyms=["FBLN","FIBL1"], + expected_gene_type="protein-coding", + expected_gene_id="2192", + expected_db_xrefs=["GeneID:2192", "HGNC:HGNC:3600", "MIM:135820"], + expected_cds_se_i=(103, 2155), + expected_cds_product="fibulin-1 isoform C precursor", + expected_cds_protein_id="NP_001987.3", + expected_cds_translation="MERAAPSRRVPLPLLLLGGLALLAAGVDADVLLEACCADGHRMATHQKDCSLPYATESKECRMVQEQCCHSQLEELHCA" \ + "TGISLANEQDRCATPHGDNASLEATFVKRCCHCCLLGRAAQAQGQSCEYSLMVGYQCGQVFQACCVKSQETGDLDVGGLQETDKIIEVEEEQEDPYLNDR" \ + "CRGGGPCKQQCRDTGDEVVCSCFVGYQLLSDGVSCEDVNECITGSHSCRLGESCINTVGSFRCQRDSSCGTGYELTEDNSCKDIDECESGIHNCLPDFIC" \ + "QNTLGSFRCRPKLQCKSGFIQDALGNCIDINECLSISAPCPIGHTCINTEGSYTCQKNVPNCGRGYHLNEEGTRCVDVDECAPPAEPCGKGHRCVNSPGS" \ + "FRCECKTGYYFDGISRMCVDVNECQRYPGRLCGHKCENTLGSYLCSCSVGFRLSVDGRSCEDINECSSSPCSQECANVYGSYQCYCRRGYQLSDVDGVTC" \ + "EDIDECALPTGGHICSYRCINIPGSFQCSCPSSGYRLAPNGRNCQDIDECVTGIHNCSINETCFNIQGGFRCLAFECPENYRRSAATRCERLPCHENREC" \ + "SKLPLRITYYHLSFPTNIQAPAVVFRMGPSSAVPGDSMQLAITGGNEEGFFTTRKVSPHSGVVALTKPVPEPRDLLLTVKMDLSRHGTVSSFVAKLFIFV" \ + "SAEL", + expected_codon_table="1", + expected_exons_se_i=[ + (0, 182), + (182, 288), + (288, 424), + (424, 587), + (587, 647), + (647, 749), + (749, 887), + (887, 1025), + (1025, 1169), + (1169, 1298), + (1298, 1424), + (1424, 1544), + (1544, 1676), + (1676, 1800), + (1800, 2251), + ], + ), + param( + 'NR_173080 - no exon features, ncRNA', + file_name='rna.NR_173080.gbff', + expected_id='NR_173080.1', + 
expected_gene_symbol='LOC122455341', + expected_gene_synonyms=[], + expected_gene_type="ncRNA", + expected_gene_id='122455341', + expected_db_xrefs=["GeneID:122455341"], + expected_cds_se_i=None, + expected_cds_product=None, + expected_cds_protein_id=None, + expected_cds_translation=None, + expected_codon_table=None, + expected_exons_se_i=[], + ), + param( + 'NR_173148 - no exon features, misc_RNA', + file_name='rna.NR_173148.gbff', + expected_id='NR_173148.1', + expected_gene_symbol='FAM246C', + expected_gene_synonyms=[], + expected_gene_type="misc_RNA", + expected_gene_id='117134596', + expected_db_xrefs=["GeneID:117134596", "HGNC:HGNC:54842"], + expected_cds_se_i=None, + expected_cds_product=None, + expected_cds_protein_id=None, + expected_cds_translation=None, + expected_codon_table=None, + expected_exons_se_i=[], + ), + ]) + def test_seq_record_facade( + self, + test_name, + file_name, + expected_id, + expected_gene_symbol, + expected_gene_synonyms, + expected_gene_type, + expected_gene_id, + expected_db_xrefs, + expected_cds_se_i, + expected_cds_product, + expected_cds_protein_id, + expected_cds_translation, + expected_codon_table, + expected_exons_se_i, + ): + gbff_file = os.path.join(self.test_data_dir, file_name) + seq_record = [sr for sr in SeqIO.parse(gbff_file, 'gb')][0] + self.seq_record_facade = SeqRecordFacade(seq_record) + assert self.seq_record_facade.id == expected_id + assert self.seq_record_facade.gene_symbol == expected_gene_symbol + assert self.seq_record_facade.gene_synonyms == expected_gene_synonyms + assert self.seq_record_facade.gene_type == expected_gene_type + assert self.seq_record_facade.gene_id == expected_gene_id + assert self.seq_record_facade.db_xrefs == expected_db_xrefs + assert self.seq_record_facade.cds_se_i == expected_cds_se_i + assert self.seq_record_facade.cds_product == expected_cds_product + assert self.seq_record_facade.cds_protein_id == expected_cds_protein_id + assert self.seq_record_facade.cds_translation == expected_cds_translation + assert self.seq_record_facade.codon_table == expected_codon_table + assert self.seq_record_facade.exons_se_i == expected_exons_se_i + + @parameterized.expand([ + param("no gene feature", gene_feature_mock={}), + param("gene feature is None", gene_feature_mock={"gene": None}), + param("gene feature is empty", gene_feature_mock={"gene": []}), + param("gene feature has more than one", gene_feature_mock={"gene": [Mock(), Mock()]}), + ]) + def test_validate_gene_feature(self, test_name, gene_feature_mock): + with patch('uta.parsers.seqrecord.SeqRecordFacade.features_by_type', + new_callable=PropertyMock) as mock_features_by_type: + mock_features_by_type.return_value = gene_feature_mock + srf = SeqRecordFacade(seqrecord=Mock()) + with self.assertRaises(SeqRecordFeatureError): + _ = srf.gene_feature + + def test_cds_feature_validation_error(self): + with patch('uta.parsers.seqrecord.SeqRecordFacade.features_by_type', + new_callable=PropertyMock) as mock_features_by_type: + mock_cds_feature = Mock() + mock_cds_feature.qualifiers = {"protein_id": "NP_fake", "translation": "MNBVCXZ"} + mock_features_by_type.return_value = {'CDS': [mock_cds_feature, mock_cds_feature]} + srf = SeqRecordFacade(seqrecord=Mock()) + with self.assertRaises(SeqRecordFeatureError): + _ = srf.cds_feature + + def test_cds_feature(self): + with patch('uta.parsers.seqrecord.SeqRecordFacade.features_by_type', new_callable=PropertyMock) as mock_features_by_type: + # no CDS feature + mock_features_by_type.return_value = {} + srf = 
SeqRecordFacade(seqrecord=Mock()) + self.assertIsNone(srf.cds_feature) + + # one CDS feature + desired_cds_feature = Mock() + desired_cds_feature.qualifiers = {"protein_id": "NP_fake", "translation": "MNBVCXZ"} + mock_features_by_type.return_value = {'CDS': [desired_cds_feature]} + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertIs(srf.cds_feature, desired_cds_feature) + + # more than one CDS feature, but only one is returned + extra_cds_feature = Mock() + extra_cds_feature.qualifiers = {"other_key": "NP_fake", "second_key": "MNBVCXZ"} + mock_features_by_type.return_value = {'CDS': [desired_cds_feature, extra_cds_feature]} + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertIs(srf.cds_feature, desired_cds_feature) + + def test_transl_except(self): + with patch('uta.parsers.seqrecord.SeqRecordFacade.cds_feature', new_callable=PropertyMock) as mock_cds_feature: + # no CDS feature + mock_cds_feature.return_value = None + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertIsNone(srf.transl_except) + + # one CDS feature without transl_except + mock_cds_feature.return_value = Mock(qualifiers={}) + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertIsNone(srf.transl_except) + + # one CDS feature with transl_except + mock_cds_feature.return_value = Mock(qualifiers={'transl_except': ['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)']}) + srf = SeqRecordFacade(seqrecord=Mock()) + self.assertEqual(srf.transl_except, ['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)']) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_uta_tools_eutils.py b/tests/test_uta_tools_eutils.py new file mode 100644 index 0000000..c875a7f --- /dev/null +++ b/tests/test_uta_tools_eutils.py @@ -0,0 +1,47 @@ +import os +import unittest +from unittest.mock import Mock, patch + +from uta import EutilsDownloadError +from uta.tools.eutils import download_from_eutils, NcbiFileFormatEnum + + +class TestEutils(unittest.TestCase): + URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi' + + def setUp(self): + self.output_file = 'test_output.fa' + + def tearDown(self): + if os.path.exists(self.output_file): + os.remove(self.output_file) + + @patch('requests.get') + def test_successful_download(self, mock_get): + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = 'file content' + mock_get.return_value = mock_response + + download_from_eutils('accession', NcbiFileFormatEnum.FASTA, self.output_file) + + mock_get.assert_called_once_with( + self.URL, + params={ + 'db': 'nuccore', + 'id': 'accession', + 'retmode': 'text', + 'rettype': 'fasta' + } + ) + + with open(self.output_file, 'r') as file: + self.assertEqual(file.read(), 'file content') + + @patch('requests.get') + def test_unsuccessful_download(self, mock_get): + mock_response = Mock() + mock_response.status_code = 404 + mock_get.return_value = mock_response + with self.assertRaises(EutilsDownloadError): + download_from_eutils('accession', NcbiFileFormatEnum.FASTA, self.output_file)