Skip to content

Commit

Permalink
refactor: get accessions from yaml file when building catalog source (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
hunterckx authored Jan 23, 2025
1 parent 529622c commit bb79f88
Show file tree
Hide file tree
Showing 17 changed files with 142 additions and 1,043 deletions.
3 changes: 3 additions & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,6 @@ venv

# releases
/CHANGELOG.md

# catalog
/catalog
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ Create a Python virtual environment and install requirements:
```shell
python3 -m venv ./venv
source ./venv/bin/activate
pip install -r ./files/requirements.txt
pip install -r ./catalog-build/requirements.txt
```

Then run the script:

```shell
python3 ./files/build-genomes-files.py
python3 ./catalog-build/build-files-from-ncbi.py
```

The environment can be deactivated by running `deactivate`, and re-activated by running `source ./venv/bin/activate`
Expand All @@ -38,5 +38,5 @@ again.
Using the Python environment described above, run the script:

```shell
python3 ./files/build-files-from-ncbi.py
python3 ./catalog-build/build-files-from-ncbi.py
```
2 changes: 0 additions & 2 deletions app/apis/catalog/brc-analytics-catalog/common/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ export interface BRCDataCatalogGenome {
species: string;
speciesTaxonomyId: string;
strain: string | null;
tags: string[];
ucscBrowserUrl: string | null;
}

Expand All @@ -36,7 +35,6 @@ export interface BRCDataCatalogOrganism {
genomes: BRCDataCatalogGenome[];
ncbiTaxonomyId: string;
species: string;
tags: string[];
}

export interface EntitiesResponse<R> {
Expand Down
8 changes: 3 additions & 5 deletions files/build-catalog.ts → catalog-build/build-catalog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import {
} from "../app/apis/catalog/brc-analytics-catalog/common/entities";
import { SourceGenome } from "./entities";

const SOURCE_PATH_GENOMES = "files/source/genomes-from-ncbi.tsv";
const SOURCE_PATH_GENOMES = "catalog-build/source/genomes-from-ncbi.tsv";

buildCatalog();

Expand All @@ -15,10 +15,10 @@ async function buildCatalog(): Promise<void> {
const organisms = buildOrganisms(genomes);

console.log("Genomes:", genomes.length);
await saveJson("files/out/genomes.json", genomes);
await saveJson("catalog/genomes.json", genomes);

console.log("Organisms:", genomes.length);
await saveJson("files/out/organisms.json", organisms);
await saveJson("catalog/organisms.json", organisms);

console.log("Done");
}
Expand All @@ -43,7 +43,6 @@ async function buildGenomes(): Promise<BRCDataCatalogGenome[]> {
species: row.species,
speciesTaxonomyId: row.speciesTaxonomyId,
strain: parseStringOrNull(row.strain),
tags: row.CustomTags.split(/,\s*/),
ucscBrowserUrl: parseStringOrNull(row.ucscBrowser),
};
});
Expand Down Expand Up @@ -77,7 +76,6 @@ function buildOrganism(
genomes: [...(organism?.genomes ?? []), genome],
ncbiTaxonomyId: genome.speciesTaxonomyId,
species: genome.species,
tags: Array.from(new Set([...(organism?.tags ?? []), ...genome.tags])),
};
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@
import requests
import urllib.parse
import re
import yaml

SOURCE_LIST_URL = "https://docs.google.com/spreadsheets/d/1Gg9sw2Qw765tOx2To53XkTAn-RAMiBtqYrfItlLXXrc/gviz/tq?tqx=out:csv&sheet=Sheet1.csv"
ASSEMBLIES_PATH = "catalog-build/source/assemblies.yml"

ASSEMBLIES_URL = "https://hgdownload.soe.ucsc.edu/hubs/BRC/assemblyList.json"
UCSC_ASSEMBLIES_URL = "https://hgdownload.soe.ucsc.edu/hubs/BRC/assemblyList.json"

GENOMES_OUTPUT_PATH = "files/source/genomes-from-ncbi.tsv"
GENOMES_OUTPUT_PATH = "catalog-build/source/genomes-from-ncbi.tsv"

def read_assemblies():
    """Load the assembly list from the YAML file at ASSEMBLIES_PATH.

    Parses the file with ``yaml.safe_load`` and returns the value stored
    under its top-level ``assemblies`` key wrapped in a pandas DataFrame.
    """
    with open(ASSEMBLIES_PATH) as assemblies_file:
        parsed_document = yaml.safe_load(assemblies_file)
    assembly_entries = parsed_document["assemblies"]
    return pd.DataFrame(assembly_entries)

def get_paginated_ncbi_results(base_url, query_description):
page = 1
Expand Down Expand Up @@ -93,19 +98,15 @@ def add_gene_model_url(genomes_df: pd.DataFrame):
def build_files():
print("Building files")

source_list_df = pd.read_csv(SOURCE_LIST_URL, keep_default_na=False)
source_list_df = read_assemblies()

base_genomes_df = get_genomes_df(source_list_df["Reference"])
base_genomes_df = get_genomes_df(source_list_df["accession"])

species_df = get_species_df(base_genomes_df["taxonomyId"])

genomes_with_species_df = (
base_genomes_df
.merge(source_list_df[["Reference", "CustomTags"]], how="left", left_on="accession", right_on="Reference").drop(columns=["Reference"])
.merge(species_df, how="left", on="taxonomyId")
)
genomes_with_species_df = base_genomes_df.merge(species_df, how="left", on="taxonomyId")

assemblies_df = pd.DataFrame(requests.get(ASSEMBLIES_URL).json()["data"])[["ucscBrowser", "genBank", "refSeq"]]
assemblies_df = pd.DataFrame(requests.get(UCSC_ASSEMBLIES_URL).json()["data"])[["ucscBrowser", "genBank", "refSeq"]]

gen_bank_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="genBank")
ref_seq_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="refSeq")
Expand Down
File renamed without changes.
1 change: 0 additions & 1 deletion files/entities.ts → catalog-build/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ export interface SourceGenome {
annotationStatus: string;
chromosomeCount: string;
coverage: string;
CustomTags: string;
gcPercent: string;
geneModelUrl: string;
isRef: string;
Expand Down
1 change: 1 addition & 0 deletions files/requirements.txt → catalog-build/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ numpy==2.1.0
pandas==2.2.2
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.2
requests==2.32.3
six==1.16.0
tzdata==2024.1
Expand Down
70 changes: 70 additions & 0 deletions catalog-build/source/assemblies.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
assemblies:
# plasmodium falciparum
- accession: GCF_000002765.6
source: plasmodb.org
# plasmodium vivax
- accession: GCF_000002415.2
ncbi_dataset: true
- accession: GCA_900093555.2
source: plasmodb.org
# plasmodium yoelii
- accession: GCF_900002385.2
source: plasmodb.org
# plasmodium vinckei
- accession: GCF_900681995.1
source: plasmodb.org
# culex p. pallens
- accession: GCF_016801865.2
source: vectorbase.org
# culex p. quinquefasciatus
- accession: GCF_015732765.1
source: vectorbase.org
# anopheles gambiae
- accession: GCF_943734735.2
ncbi_dataset: true
- accession: GCF_000005575.2
source: vectorbase.org
# toxoplasma gondii
- accession: GCF_000006565.2
source: toxodb.org
# mycobacterium tuberculosis
- accession: GCF_000195955.2
# coccidioides posadasii
- accession: GCF_018416015.2
ncbi_dataset: true
- accession: GCA_000170175.2
source: fungidb.org
# coccidioides immitis
- accession: GCF_000149335.2
source: fungidb.org
# trypanosoma cruzi
- accession: GCF_000209065.1
source: tritrypdb.org
# trypanosoma brucei
- accession: GCF_000002445.2
source: tritrypdb.org
# leishmania major
- accession: GCF_000002725.2
source: tritrypdb.org
# leishmania donovani
- accession: GCF_000227135.1
source: tritrypdb.org
# leishmania braziliensis
# danielle's favorite
- accession: GCF_000002845.2
source: tritrypdb.org
# covid
- accession: GCF_009858895.2
# mpox
- accession: GCF_000857045.1
# aspergillus fumigatus
- accession: GCF_000002655.1
source: fungidb.org
# candida albicans
- accession: GCF_000182965.3
source: fungidb.org
# cryptococcus neoformans
- accession: GCF_000091045.1
ncbi_dataset: true
- accession: GCA_000149245.3
source: fungidb.org
26 changes: 26 additions & 0 deletions catalog-build/source/genomes-from-ncbi.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
strain taxonomyId accession isRef level chromosomeCount length scaffoldCount scaffoldN50 scaffoldL50 coverage gcPercent annotationStatus pairedAccession species speciesTaxonomyId ucscBrowser genBank refSeq geneModelUrl
H37Rv 83332 GCF_000195955.2 True Complete Genome 1.0 4411532 1 4411532 1 65.5 GCA_000195955.2 Mycobacterium tuberculosis 1773 https://genome.ucsc.edu/h/GCF_000195955.2 GCA_000195955.2 GCF_000195955.2
36329 GCF_000002765.6 True Complete Genome 14.0 23292622 14 1687656 5 100.0x 19.5 Full annotation GCA_000002765.3 Plasmodium falciparum 5833 https://genome.ucsc.edu/h/GCF_000002765.6 GCA_000002765.3 GCF_000002765.6 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/765/GCF_000002765.6/genes/GCF_000002765.6_GCA_000002765.ncbiRefSeq.gtf.gz
Friedlin 347515 GCF_000002725.2 True Complete Genome 36.0 32855089 36 1091540 11 59.5 Full annotation GCA_000002725.2 Leishmania major 5664 https://genome.ucsc.edu/h/GCF_000002725.2 GCA_000002725.2 GCF_000002725.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/725/GCF_000002725.2/genes/GCF_000002725.2_ASM272v2.ncbiRefSeq.gtf.gz
17X 5861 GCF_900002385.2 True Complete Genome 14.0 23043114 14 2046250 5 100.0x 21.5 Full annotation GCA_900002385.2 Plasmodium yoelii 5861 https://genome.ucsc.edu/h/GCF_900002385.2 GCA_900002385.2 GCF_900002385.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/900/002/385/GCF_900002385.2/genes/GCF_900002385.2_GCA_900002385.ncbiRefSeq.gtf.gz
Silveira 443226 GCF_018416015.2 True Complete Genome 9.0 28193268 9 8079863 2 475.0x 46.5 Full annotation GCA_018416015.2 Coccidioides posadasii 199306 https://genome.ucsc.edu/h/GCF_018416015.2 GCA_018416015.2 GCF_018416015.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/018/416/015/GCF_018416015.2/genes/GCF_018416015.2_ASM1841601v2.ncbiRefSeq.gtf.gz
54757 GCF_900681995.1 True Chromosome 14.0 18338688 14 1692345 5 155.0x 23.0 Full annotation GCA_900681995.1 Plasmodium vinckei 5860 https://genome.ucsc.edu/h/GCF_900681995.1 GCA_900681995.1 GCF_900681995.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/900/681/995/GCF_900681995.1/genes/GCF_900681995.1_PVVCY_v1.ncbiRefSeq.gtf.gz
SC5314 237561 GCF_000182965.3 True Chromosome 8.0 14282666 8 2231883 3 700.0x 33.5 Full annotation GCA_000182965.3 Candida albicans 5476 https://genome.ucsc.edu/h/GCF_000182965.3 GCA_000182965.3 GCF_000182965.3 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/182/965/GCF_000182965.3/genes/GCF_000182965.3_ASM18296v3.ncbiRefSeq.gtf.gz
JEC21 214684 GCF_000091045.1 True Chromosome 14.0 19051922 14 1438950 6 48.5 Full annotation GCA_000091045.1 Cryptococcus neoformans 5207 https://genome.ucsc.edu/h/GCF_000091045.1 GCA_000091045.1 GCF_000091045.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/091/045/GCF_000091045.1/genes/GCF_000091045.1_ASM9104v1.ncbiRefSeq.gtf.gz
BPK282A1 5661 GCF_000227135.1 True Chromosome 36.0 32444968 36 1024085 11 59.5 Full annotation GCA_000227135.2 Leishmania donovani 5661 https://genome.ucsc.edu/h/GCF_000227135.1 GCA_000227135.2 GCF_000227135.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/227/135/GCF_000227135.1/genes/GCF_000227135.1_ASM22713v2.ncbiRefSeq.gtf.gz
Af293 330879 GCF_000002655.1 True Chromosome 8.0 29384958 8 3948441 4 50.0 Full annotation GCA_000002655.1 Aspergillus fumigatus 746128 https://genome.ucsc.edu/h/GCF_000002655.1 GCA_000002655.1 GCF_000002655.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/655/GCF_000002655.1/genes/GCF_000002655.1_ASM265v1.ncbiRefSeq.gtf.gz
ME49 508771 GCF_000006565.2 True Chromosome 14.0 65633124 2276 4973582 6 26.5x 52.5 GCA_000006565.2 Toxoplasma gondii 5811 https://genome.ucsc.edu/h/GCF_000006565.2 GCA_000006565.2 GCF_000006565.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/006/565/GCF_000006565.2/genes/GCF_000006565.2_TGA4.ncbiRefSeq.gtf.gz
185431 GCF_000002445.2 True Chromosome 11.0 26075494 12 2481190 4 46.5 Full annotation GCA_000002445.1 Trypanosoma brucei 5691 https://genome.ucsc.edu/h/GCF_000002445.2 GCA_000002445.1 GCF_000002445.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/445/GCF_000002445.2/genes/GCF_000002445.2_ASM244v1.ncbiRefSeq.gtf.gz
7165 GCF_943734735.2 True Chromosome 3.0 264451381 190 99149756 2 54.0x 44.5 Full annotation GCA_943734735.2 Anopheles gambiae 7165 https://genome.ucsc.edu/h/GCF_943734735.2 GCA_943734735.2 GCF_943734735.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/943/734/735/GCF_943734735.2/genes/GCF_943734735.2_idAnoGambNW_F1_1.ncbiRefSeq.gtf.gz
Salvador I 5855 GCF_000002415.2 True Chromosome 14.0 27007701 2747 1678596 6 42.5 Full annotation GCA_000002415.2 Plasmodium vivax 5855 https://genome.ucsc.edu/h/GCF_000002415.2 GCA_000002415.2 GCF_000002415.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/415/GCF_000002415.2/genes/GCF_000002415.2_ASM241v2.ncbiRefSeq.gtf.gz
JHB 7176 GCF_015732765.1 True Chromosome 3.0 573214445 56 201550677 2 76.0x 37.0 Full annotation GCA_015732765.1 Culex quinquefasciatus 7176 https://genome.ucsc.edu/h/GCF_015732765.1 GCA_015732765.1 GCF_015732765.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/015/732/765/GCF_015732765.1/genes/GCF_015732765.1_VPISU_Cqui_1.0_pri_paternal.ncbiRefSeq.gtf.gz
42434 GCF_016801865.2 True Chromosome 3.0 566339288 289 186194774 2 250.0x 37.0 Full annotation GCA_016801865.2 Culex pipiens 7175 https://genome.ucsc.edu/h/GCF_016801865.2 GCA_016801865.2 GCF_016801865.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/016/801/865/GCF_016801865.2/genes/GCF_016801865.2_TS_CPP_V2.ncbiRefSeq.gtf.gz
MHOM/BR/75/M2904 420245 GCF_000002845.2 True Chromosome 35.0 32068771 138 992961 11 58.0 GCA_000002845.2 Leishmania braziliensis 5660 https://genome.ucsc.edu/h/GCF_000002845.2 GCA_000002845.2 GCF_000002845.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/845/GCF_000002845.2/genes/GCF_000002845.2_ASM284v2.ncbiRefSeq.gtf.gz
CL Brener 5693 GCF_000209065.1 True Scaffold 89937456 29495 88624 212 51.5 Full annotation GCA_000209065.1 Trypanosoma cruzi 5693 https://genome.ucsc.edu/h/GCF_000209065.1 GCA_000209065.1 GCF_000209065.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/209/065/GCF_000209065.1/genes/GCF_000209065.1_ASM20906v1.ncbiRefSeq.gtf.gz
RS 246410 GCF_000149335.2 True Scaffold 28947925 6 4323945 3 46.0 Full annotation GCA_000149335.2 Coccidioides immitis 5501 https://genome.ucsc.edu/h/GCF_000149335.2 GCA_000149335.2 GCF_000149335.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/149/335/GCF_000149335.2/genes/GCF_000149335.2_ASM14933v2.ncbiRefSeq.gtf.gz
2697049 GCF_009858895.2 False Complete Genome 1.0 29903 1 29903 1 38.0 GCA_009858895.3 Severe acute respiratory syndrome-related coronavirus 694009 https://genome.ucsc.edu/h/GCF_009858895.2 GCA_009858895.3 GCF_009858895.2
Zaire-96-I-16 10244 GCF_000857045.1 False Complete Genome 1.0 196858 1 196858 1 33.0 GCA_000857045.1 Monkeypox virus 10244 https://genome.ucsc.edu/h/GCF_000857045.1 GCA_000857045.1 GCF_000857045.1
H99 235443 GCA_000149245.3 False Chromosome 14.0 18891193 14 1422463 6 11.8x 48.0 GCF_000149245.1 Cryptococcus neoformans 5207 https://genome.ucsc.edu/h/GCF_000149245.1 GCA_000149245.3 GCF_000149245.1
5855 GCA_900093555.2 False Chromosome 16.0 29040213 242 1761288 6 100.0x 40.0 Plasmodium vivax 5855 https://genome.ucsc.edu/h/GCA_900093555.2 GCA_900093555.2 https://hgdownload.soe.ucsc.edu/hubs/GCA/900/093/555/GCA_900093555.2/genes/GCA_900093555.2_GCA_900093555.augustus.gtf.gz
Silveira 443226 GCA_000170175.2 False Scaffold 27583241 54 1220107 8 5.0x 46.5 Coccidioides posadasii 199306 https://genome.ucsc.edu/h/GCA_000170175.2 GCA_000170175.2 https://hgdownload.soe.ucsc.edu/hubs/GCA/000/170/175/GCA_000170175.2/genes/GCA_000170175.2_CPS2.augustus.gtf.gz
PEST 180454 GCF_000005575.2 False Chromosome 6.0 265011681 8144 12309988 9 44.5 Full annotation GCA_000005575.1 Anopheles gambiae 7165 https://genome.ucsc.edu/h/GCF_000005575.2 GCA_000005575.1 GCF_000005575.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/005/575/GCF_000005575.2/genes/GCF_000005575.2_AgamP3.ncbiRefSeq.gtf.gz
Loading

0 comments on commit bb79f88

Please sign in to comment.