From 278fdbd2d0af451c5269cc26a0fa9cebd84903a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 18 Dec 2023 14:38:28 +0100 Subject: [PATCH 001/107] lib: download polygenic scores (PGS), #TASK-5406, #TASK-5387 --- .../app/cli/admin/AdminCliOptionsParser.java | 6 +- .../executors/DownloadCommandExecutor.java | 5 +- .../core/config/DownloadProperties.java | 11 ++- .../src/main/resources/configuration.yml | 5 ++ .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../cellbase/lib/download/Downloader.java | 5 ++ .../lib/download/PgsDownloadManager.java | 88 +++++++++++++++++++ 7 files changed, 116 insertions(+), 5 deletions(-) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PgsDownloadManager.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 6049ef9b4b..47bc4e8186 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -87,9 +87,9 @@ public class DownloadCommandOptions { @ParametersDelegate public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, " - + "variation, variation_functional_score, regulation, protein, conservation, " - + "clinical_variants, repeats, svs, pubmed and 'all' to download everything", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, variation," + + " variation_functional_score, regulation, protein, conservation, clinical_variants, repeats, svs, pubmed," + + " pharmacogenomics, polygenic_score; and 'all' to download everything", required = true, 
arity = 1) public String data; @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index f8197e6558..dfbad479a2 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -105,10 +105,13 @@ public void execute() { case EtlCommons.PHARMACOGENOMICS_DATA: downloadFiles.addAll(downloader.downloadPharmKGB()); break; + case EtlCommons.PGS_DATA: + downloadFiles.addAll(downloader.downloadPolygenicScores()); + break; default: System.out.println("Value \"" + data + "\" is not allowed for the data parameter. Allowed values" + " are: {genome, gene, gene_disease_association, variation, variation_functional_score," - + " regulation, protein, conservation, clinical_variants, ontology, pubmed}"); + + " regulation, protein, conservation, clinical_variants, ontology, pubmed, polygenic_score}"); break; } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index ee4216f560..d994f2af4b 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -74,6 +74,7 @@ public class DownloadProperties { private URLProperties revel; private URLProperties pubmed; private URLProperties pharmGKB; + private URLProperties pgs; public EnsemblProperties getEnsembl() { return ensembl; @@ -485,6 +486,15 @@ public DownloadProperties setPharmGKB(URLProperties pharmGKB) { return this; } + public 
URLProperties getPgs() { + return pgs; + } + + public DownloadProperties setPgs(URLProperties pgs) { + this.pgs = pgs; + return this; + } + public DownloadProperties setRefSeqProteinFasta(URLProperties refSeqProteinFasta) { this.refSeqProteinFasta = refSeqProteinFasta; return this; @@ -579,6 +589,5 @@ public URLProperties setFiles(List files) { this.files = files; return this; } - } } diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 0f8d199118..c779bb3874 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -177,6 +177,11 @@ download: - https://api.pharmgkb.org/v1/download/file/data/clinicalVariants.zip - https://api.pharmgkb.org/v1/download/file/data/drugLabels.zip - https://api.pharmgkb.org/v1/download/file/data/relationships.zip + pgs: + host: https://www.pgscatalog.org/ + version: "Dec. 15, 2023" + files: + - https://ftp.ebi.ac.uk/pub/databases/spot/pgs/metadata/pgs_all_metadata_scores.csv species: vertebrates: - id: hsapiens diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 4396f0c2f1..51038fafb3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -47,6 +47,7 @@ public class EtlCommons { public static final String CONSERVATION_DATA = "conservation"; public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; + public static final String PGS_DATA = "polygenic_score"; public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; public static final String PHARMGKB_NAME = "PharmGKB"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java index 17022cae4b..49023d89cd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java @@ -98,4 +98,9 @@ public List downloadPharmKGB() throws IOException, CellBaseExcepti PharmGKBDownloadManager manager = new PharmGKBDownloadManager(species, assembly, outputDirectory, configuration); return manager.download(); } + + public List downloadPolygenicScores() throws IOException, CellBaseException, InterruptedException { + PgsDownloadManager manager = new PgsDownloadManager(species, assembly, outputDirectory, configuration); + return manager.download(); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PgsDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PgsDownloadManager.java new file mode 100644 index 0000000000..018baff1f0 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PgsDownloadManager.java @@ -0,0 +1,88 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.download; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.commons.utils.FileUtils; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class PgsDownloadManager extends AbstractDownloadManager { + + public PgsDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) + throws IOException, CellBaseException { + super(species, assembly, targetDirectory, configuration); + } + + @Override + public List download() throws IOException, InterruptedException { + logger.info("Downloading PGS files..."); + + DownloadProperties.URLProperties pgsUrlProperties = configuration.getDownload().getPgs(); + + Path pgsFolder = downloadFolder.resolve("pgs"); + Files.createDirectories(pgsFolder); + + List urls = new ArrayList<>(); + urls.add(pgsUrlProperties.getHost()); + + String urlAllMeta = pgsUrlProperties.getFiles().get(0); + urls.add(urlAllMeta); + + String filename = new File(urlAllMeta).getName(); + + // Downloads PGS files + List list = new ArrayList<>(); + System.out.println(urlAllMeta); + list.add(downloadFile(urlAllMeta, pgsFolder.resolve(filename).toString())); + + String baseUrl = urlAllMeta.replace(filename, "").replace("metadata", "scores"); + BufferedReader br = FileUtils.newBufferedReader(pgsFolder.resolve(filename)); + // Skip first line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] field = line.split(","); + String pgsId = field[0]; + + String url = baseUrl + pgsId + "/Metadata/" + pgsId + "_metadata.tar.gz"; + logger.info("Downloading file {}", url); + list.add(downloadFile(url, 
pgsFolder.resolve(new File(url).getName()).toString())); + + url = baseUrl + pgsId + "/ScoringFiles/Harmonized/" + pgsId + "_hmPOS_GRCh38.txt.gz"; + logger.info("Downloading file {}", url); + list.add(downloadFile(url, pgsFolder.resolve(new File(url).getName()).toString())); + } + br.close(); + + // Save version file + saveVersionData(EtlCommons.PGS_DATA, EtlCommons.PGS_DATA, pgsUrlProperties.getVersion(), getTimeStamp(), urls, + pgsFolder.resolve("pgsVersion.json")); + + logger.info("Done. Downloaded PGS files!"); + + return list; + } +} From d327c47887c63fafd3fd16273f8deed2e19281e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 19 Dec 2023 14:20:28 +0100 Subject: [PATCH 002/107] lib: update the build command to support PGS data, #TASK-5407, #TASK-5387 --- .../app/cli/admin/AdminCliOptionsParser.java | 3 +- .../admin/executors/BuildCommandExecutor.java | 23 + .../core/config/DownloadProperties.java | 10 + .../src/main/resources/configuration.yml | 1 + cellbase-lib/pom.xml | 5 + .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/builders/PolygenicScoreBuilder.java | 472 ++++++++++++++++++ .../lib/download/PgsDownloadManager.java | 7 +- 8 files changed, 517 insertions(+), 5 deletions(-) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 47bc4e8186..3468cf7eeb 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -104,7 +104,8 @@ public class BuildCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, " + "gene, variation, variation_functional_score, 
regulation, protein, ppi, conservation, drug, " - + "clinical_variants, repeats, svs, splice_score, pubmed. 'all' builds everything.", required = true, arity = 1) + + "clinical_variants, repeats, svs, splice_score, pubmed, pharmacogenomics, polygenic_score; 'all' builds everything.", + required = true, arity = 1) public String data; @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 8c0d477023..7586afd7aa 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -38,6 +38,7 @@ import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.PGS_DATA; import static org.opencb.cellbase.lib.EtlCommons.PHARMGKB_DATA; /** @@ -168,6 +169,9 @@ public void execute() { case EtlCommons.PHARMACOGENOMICS_DATA: parser = buildPharmacogenomics(); break; + case EtlCommons.PGS_DATA: + parser = buildPolygenicScores(); + break; default: logger.error("Build option '" + buildCommandOptions.data + "' is not valid"); break; @@ -437,4 +441,23 @@ private CellBaseBuilder buildPharmacogenomics() throws IOException { CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder); return new PharmGKBBuilder(inFolder, serializer); } + + private CellBaseBuilder buildPolygenicScores() throws IOException { + Path inFolder = downloadFolder.resolve(EtlCommons.PGS_DATA); + Path outFolder = buildFolder.resolve(EtlCommons.PGS_DATA); + if (!outFolder.toFile().exists()) { + outFolder.toFile().mkdirs(); + } + + logger.info("Copying PGS version file..."); + if 
(inFolder.resolve(PGS_DATA).resolve(EtlCommons.PGS_VERSION_FILENAME).toFile().exists()) { + Files.copy(inFolder.resolve(PGS_DATA).resolve(EtlCommons.PGS_VERSION_FILENAME), + outFolder.resolve(EtlCommons.PGS_VERSION_FILENAME), StandardCopyOption.REPLACE_EXISTING); + } + + String basename = PolygenicScoreBuilder.VARIANT_POLYGENIC_SCORE_FILENAME.split("\\.")[0]; + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder, basename); + return new PolygenicScoreBuilder(configuration.getDownload().getPgs().getSourceName(), + configuration.getDownload().getPgs().getVersion(), inFolder, serializer); + } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index d994f2af4b..89eead210f 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -561,6 +561,7 @@ public void setUrl(URLProperties url) { public static class URLProperties { private String host; + private String sourceName; private String version; private List files; @@ -572,6 +573,15 @@ public void setHost(String host) { this.host = host; } + public String getSourceName() { + return sourceName; + } + + public URLProperties setSourceName(String sourceName) { + this.sourceName = sourceName; + return this; + } + public String getVersion() { return version; } diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index c779bb3874..494bdf8782 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -179,6 +179,7 @@ download: - https://api.pharmgkb.org/v1/download/file/data/relationships.zip pgs: host: https://www.pgscatalog.org/ + sourceName: "PGS Catalog" version: "Dec. 
15, 2023" files: - https://ftp.ebi.ac.uk/pub/databases/spot/pgs/metadata/pgs_all_metadata_scores.csv diff --git a/cellbase-lib/pom.xml b/cellbase-lib/pom.xml index f76602ad3e..8e07b6652b 100644 --- a/cellbase-lib/pom.xml +++ b/cellbase-lib/pom.xml @@ -185,6 +185,11 @@ junit-platform-engine test + + org.apache.commons + commons-csv + 1.0 + diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 51038fafb3..d172cf3438 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -48,6 +48,7 @@ public class EtlCommons { public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; public static final String PGS_DATA = "polygenic_score"; + public static final String PGS_VERSION_FILENAME = "pgsVersion.json"; public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; public static final String PHARMGKB_NAME = "PharmGKB"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java new file mode 100644 index 0000000000..a3af85cdf2 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -0,0 +1,472 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.builders; + +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import com.fasterxml.jackson.databind.ObjectWriter; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.pgs.*; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.commons.utils.FileUtils; +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; +import org.rocksdb.RocksIterator; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +public class PolygenicScoreBuilder extends CellBaseBuilder { + + private String source; + private String version; + + private Path pgsDir; + private CellBaseFileSerializer fileSerializer; + + protected RocksDB rdb; + + protected static ObjectMapper mapper; + protected static ObjectReader varPgsReader; + protected static ObjectWriter jsonObjectWriter; + + public static final String COMMON_POLYGENIC_SCORE_FILENAME = "common_polygenic_score.json.gz"; + public static final String VARIANT_POLYGENIC_SCORE_FILENAME = "variant_polygenic_score.json.gz"; + + private static final String RSID = "rsID"; + private static final String CHR_NAME = "chr_name"; + private static final String EFFECT_ALLELE = "effect_allele"; + private static final String OTHER_ALLELE = "other_allele"; + private static final String EFFECT_WEIGHT = "effect_weight"; + private static final String 
ALLELEFREQUENCY_EFFECT = "allelefrequency_effect"; + private static final String LOCUS_NAME = "locus_name"; + private static final String OR = "OR"; + private static final String HM_SOURCE = "hm_source"; + private static final String HM_RSID = "hm_rsID"; + private static final String HM_CHR = "hm_chr"; + private static final String HM_POS = "hm_pos"; + private static final String HM_INFEROTHERALLELE = "hm_inferOtherAllele"; + + static { + mapper = new ObjectMapper(); + mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); + varPgsReader = mapper.readerFor(VariantPolygenicScore.class); + jsonObjectWriter = mapper.writer(); + } + + public PolygenicScoreBuilder(String source, String version, Path pgsDir, CellBaseFileSerializer serializer) { + super(serializer); + + this.source = source; + this.version = version; + + this.fileSerializer = serializer; + this.pgsDir = pgsDir; + + logger = LoggerFactory.getLogger(PolygenicScoreBuilder.class); + } + + @Override + public void parse() throws Exception { + // Check input folder + FileUtils.checkPath(pgsDir); + + logger.info("Parsing polygenic score (PGS) files..."); + + Object[] dbConnection = getDBConnection(pgsDir.resolve("rdb.idx").toString(), true); + rdb = (RocksDB) dbConnection[0]; + Options dbOption = (Options) dbConnection[1]; + String dbLocation = (String) dbConnection[2]; + + BufferedWriter bw = FileUtils.newBufferedWriter(serializer.getOutdir().resolve(COMMON_POLYGENIC_SCORE_FILENAME)); + + for (File file : pgsDir.toFile().listFiles()) { + if (file.isFile()) { + if (file.getName().endsWith(".txt.gz")) { + logger.info("Processing PGS file: {}", file.getName()); + + String pgsId = null; + Map labelPos = new HashMap<>(); + + BufferedReader br = FileUtils.newBufferedReader(file.toPath()); + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + if (line.startsWith("#pgs_id=")) { + pgsId = line.split("=")[1].trim(); + // Sanity check + if (!file.getName().startsWith(pgsId)) 
{ + throw new CellBaseException("Error parsing file " + file.getName() + ": pgs_id mismatch"); + } + } + } else if (line.startsWith(RSID) || line.startsWith(CHR_NAME)) { + String[] fields = line.split("\t"); + for (int i = 0; i < fields.length; i++) { + labelPos.put(fields[i], i); + } + } else { + // Sanity check + if (pgsId == null) { + throw new CellBaseException("Error parsing file " + file.getName() + ": pgs_id is null"); + } + saveVariantPolygenicScore(line, labelPos, pgsId); + } + } + br.close(); + } else if (file.getName().endsWith("_metadata.tar.gz")) { + processPgsMetadataFile(file, bw); + } + } + } + + // Serialize/write the saved variant polygenic scores in the RocksDB + serializeRDB(rdb); + closeIndex(rdb, dbOption, dbLocation); + serializer.close(); + + // Close PGS file (with common attributes) + bw.close(); + + logger.info("Parsing PGS files finished."); + } + + private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws IOException, CellBaseException { + String pgsId = metadataFile.getName().split("_")[0]; + + Path tmp = pgsDir.resolve("tmp"); + if (!tmp.toFile().exists()) { + tmp.toFile().mkdirs(); + } + + String command = "tar -xzf " + metadataFile.getAbsolutePath() + " -C " + tmp.toAbsolutePath(); + Process process = Runtime.getRuntime().exec(command); + + // Wait for the process to complete + int exitCode; + try { + exitCode = process.waitFor(); + } catch (InterruptedException e) { + throw new IOException("Error waiting for the process to complete.", e); + } + + // Check the exit code + if (exitCode != 0) { + throw new IOException("Error executing the command. 
Exit code: " + exitCode); + } + + // Create PGS object, with the common fields + CommonPolygenicScore pgs = new CommonPolygenicScore(); + pgs.setId(pgsId); + pgs.setSource(source); + pgs.setVersion(version); + + String line; + String[] field; + BufferedReader br; + // PGSxxxxx_metadata_publications.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_publications.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 4 5 6 + // PGS Publication/Study (PGP) ID First Author Title Journal Name Publication Date Release Date Authors + // 7 8 + // digital object identifier (doi) PubMed ID (PMID) + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + pgs.getPubmedIds().add(csvParser.getRecords().get(0).get(8)); + } + + // PGSxxxxx_metadata_efo_traits.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_efo_traits.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 + // Ontology Trait ID Ontology Trait Label Ontology Trait Description Ontology URL + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + pgs.getEfoTraits().add(new EfoTrait(strings.get(0), strings.get(1), strings.get(2), strings.get(3))); + } + + // PGSxxxxx_metadata_scores.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_scores.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 4 + // Polygenic Score (PGS) ID PGS Name Reported Trait Mapped Trait(s) (EFO label) Mapped Trait(s) (EFO ID) + // 5 6 7 8 + // PGS Development Method PGS Development Details/Relevant Parameters Original Genome Build Number of Variants + // 9 10 11 12 13 + // Number of Interaction Terms Type of Variant Weight PGS Publication (PGP) ID 
Publication (PMID) Publication (doi) + // 14 15 + // Score and results match the original publication Ancestry Distribution (%) - Source of Variant Associations (GWAS) + // 16 17 18 19 + // Ancestry Distribution (%) - Score Development/Training Ancestry Distribution (%) - PGS Evaluation FTP link Release Date + // 19 + // License/Terms of Use + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + // Sanity check + if (!pgsId.equals(strings.get(0))) { + throw new CellBaseException("Mismatch PGS ID when parsing file " + pgsId + "_metadata_scores.csv"); + } + if (StringUtils.isNotEmpty(pgs.getName())) { + throw new CellBaseException("More than one PGS in file " + pgsId + "_metadata_scores.csv"); + } + pgs.setName(strings.get(1)); + } + + // PGSxxxxx_metadata_score_development_samples.csv + // 0 1 2 3 4 + // Polygenic Score (PGS) ID Stage of PGS Development Number of Individuals Number of Cases Number of Controls + // 5 6 7 8 + // Percent of Participants Who are Male Sample Age Broad Ancestry Category "Ancestry (e.g. French, Chinese)" + // 9 10 11 12 + // Country of Recruitment Additional Ancestry Description Phenotype Definitions and Methods Followup Time + // 13 13 14 15 16 + // GWAS Catalog Study ID (GCST...) 
Source PubMed ID (PMID) Source DOI Cohort(s) Additional Sample/Cohort Information + + // PGSxxxxx_metadata_performance_metrics.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_performance_metrics.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 4 + // PGS Performance Metric (PPM) ID Evaluated Score PGS Sample Set (PSS) PGS Publication (PGP) ID Reported Trait + // 5 6 7 8 + // Covariates Included in the Model PGS Performance: Other Relevant Information Publication (PMID) Publication (doi) + // 9 10 11 12 + // Hazard Ratio (HR) Odds Ratio (OR) Beta Area Under the Receiver-Operating Characteristic Curve (AUROC) + // 13 14 + // Concordance Statistic (C-index) Other Metric(s) + + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + + // Sanity check + if (!pgsId.equals(strings.get(1))) { + continue; + } + + PerformanceMetrics metrics = new PerformanceMetrics(); + metrics.setId(strings.get(0)); + if (StringUtils.isNotEmpty(strings.get(9))) { + metrics.setHazardRatio(strings.get(9)); + } + if (StringUtils.isNotEmpty(strings.get(10))) { + metrics.setOddsRatio(strings.get(10)); + } + if (StringUtils.isNotEmpty(strings.get(11))) { + metrics.setBeta(strings.get(11)); + } + if (StringUtils.isNotEmpty(strings.get(12))) { + metrics.setAuroc(strings.get(12)); + } + if (StringUtils.isNotEmpty(strings.get(13))) { + metrics.setcIndex(strings.get(13)); + } + if (StringUtils.isNotEmpty(strings.get(14))) { + metrics.setOtherMetrics(strings.get(14)); + } + pgs.getPerformanceMetrics().add(metrics); + } + + // PGSxxxxx_metadata_evaluation_sample_sets.csv + // 0 1 2 3 4 + // PGS Sample Set (PSS) Polygenic Score (PGS) ID Number of Individuals Number of Cases Number of Controls + // 5 6 7 + // Percent of Participants Who are Male Sample Age,Broad Ancestry Category "Ancestry (e.g.French, Chinese)" 
+ // 8 9 10 11 + // Country of Recruitment Additional Ancestry Description Phenotype Definitions and Methods Followup Time + // 12 13 14 15 16 + // GWAS Catalog Study ID (GCST...) Source PubMed ID (PMID) Source DOI Cohort(s) Additional Sample/Cohort Information + + // PGSxxxxx_metadata_cohorts.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_cohorts.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 + // Cohort ID Cohort Name Previous/other/additional names + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + pgs.getCohorts().add(new Cohort(strings.get(0), strings.get(1))); + } + + + // Create PGS object, with the common fields + bw.write(jsonObjectWriter.writeValueAsString(pgs)); + bw.write("\n"); + + // Clean tmp folder + for (File tmpFile : tmp.toFile().listFiles()) { + tmpFile.delete(); + } + } + + private void saveVariantPolygenicScore(String line, Map labelPos, String pgsId) + throws RocksDBException, IOException { + String chrom; + int position; + String effectAllele; + String otherAllele; + + String[] field = line.split("\t", -1); + + if (labelPos.containsKey(HM_CHR)) { + chrom = field[labelPos.get(HM_CHR)]; + } else { + logger.warn("Missing field '{}', skipping line: {}", HM_CHR, line); + return; + } + if (labelPos.containsKey(HM_POS)) { + position = Integer.parseInt(field[labelPos.get(HM_POS)]); + } else { + logger.warn("Missing field '{}', skipping line: {}", HM_POS, line); + return; + } + if (labelPos.containsKey(EFFECT_ALLELE)) { + effectAllele = field[labelPos.get(EFFECT_ALLELE)]; + } else { + logger.warn("Missing field '{}', skipping line: {}", EFFECT_ALLELE, line); + return; + } + if (labelPos.containsKey(HM_INFEROTHERALLELE) && StringUtils.isNotEmpty(field[labelPos.get(HM_INFEROTHERALLELE)])) { + otherAllele = field[labelPos.get(HM_INFEROTHERALLELE)]; + } 
else if (labelPos.containsKey(OTHER_ALLELE)) { + otherAllele = field[labelPos.get(OTHER_ALLELE)]; + } else { + logger.warn("Missing fields '{}' and '{}' (at least one is mandatory), skipping line: {}", HM_INFEROTHERALLELE, OTHER_ALLELE, + line); + return; + } + + // Create polygenic score + PolygenicScore pgs = new PolygenicScore(); + pgs.setId(pgsId); + if (labelPos.containsKey(EFFECT_WEIGHT)) { + pgs.setEffectWeight(Double.parseDouble(field[labelPos.get(EFFECT_WEIGHT)])); + } + if (labelPos.containsKey(ALLELEFREQUENCY_EFFECT)) { + pgs.setAlleleFrequencyEffect(Double.parseDouble(field[labelPos.get(ALLELEFREQUENCY_EFFECT)])); + } + if (labelPos.containsKey(OR)) { + pgs.setOr(Double.parseDouble(field[labelPos.get(OR)])); + } + if (labelPos.containsKey(LOCUS_NAME)) { + pgs.setLocusName(field[labelPos.get(LOCUS_NAME)]); + } + + // Creating and/or updating variant polygenic score + VariantPolygenicScore varPgs; + String key = chrom + ":" + position + ":" + otherAllele + ":" + effectAllele; + byte[] dbContent = rdb.get(key.getBytes()); + if (dbContent == null) { + varPgs = new VariantPolygenicScore(chrom, position, otherAllele, effectAllele, + Collections.singletonList(pgs)); + } else { + varPgs = varPgsReader.readValue(dbContent); + varPgs.getPolygenicScores().add(pgs); + } + rdb.put(key.getBytes(), jsonObjectWriter.writeValueAsBytes(varPgs)); + } + + private void serializeRDB(RocksDB rdb) throws IOException { + // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's + // named "iterator" + RocksIterator rocksIterator = rdb.newIterator(); + + logger.info("Reading from RocksDB index and serializing to {}.json.gz", serializer.getOutdir().resolve(serializer.getFileName())); + int counter = 0; + for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { + logger.info("variant = {}", new String(rocksIterator.key())); + VariantPolygenicScore varPgs = varPgsReader.readValue(rocksIterator.value()); 
+ logger.info("variant PGS: {}", varPgs.toString()); + serializer.serialize(varPgs); + counter++; + if (counter % 10000 == 0) { + logger.info("{} written", counter); + } + } + serializer.close(); + logger.info("Done."); + } + + private void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws IOException { + if (rdb != null) { + rdb.close(); + } + if (dbOption != null) { + dbOption.dispose(); + } + if (dbLocation != null && Files.exists(Paths.get(dbLocation))) { + org.apache.commons.io.FileUtils.deleteDirectory(new File(dbLocation)); + } + } + + private Object[] getDBConnection(String dbLocation, boolean forceCreate) { + boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation)); + // a static method that loads the RocksDB C++ library. + RocksDB.loadLibrary(); + // the Options class contains a set of configurable DB options + // that determines the behavior of a database. + Options options = new Options().setCreateIfMissing(true); + +// options.setMaxBackgroundCompactions(4); +// options.setMaxBackgroundFlushes(1); +// options.setCompressionType(CompressionType.NO_COMPRESSION); +// options.setMaxOpenFiles(-1); +// options.setIncreaseParallelism(4); +// options.setCompactionStyle(CompactionStyle.LEVEL); +// options.setLevelCompactionDynamicLevelBytes(true); + + RocksDB db = null; + try { + // a factory method that returns a RocksDB instance + if (indexingNeeded) { + db = RocksDB.open(options, dbLocation); + } else { + db = RocksDB.openReadOnly(options, dbLocation); + } + // do something + } catch (RocksDBException e) { + // do some error handling + e.printStackTrace(); + System.exit(1); + } + + return new Object[]{db, options, dbLocation, indexingNeeded}; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PgsDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PgsDownloadManager.java index 018baff1f0..8bc34e1a2f 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PgsDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PgsDownloadManager.java @@ -43,7 +43,7 @@ public List download() throws IOException, InterruptedException { DownloadProperties.URLProperties pgsUrlProperties = configuration.getDownload().getPgs(); - Path pgsFolder = downloadFolder.resolve("pgs"); + Path pgsFolder = downloadFolder.resolve(EtlCommons.PGS_DATA); Files.createDirectories(pgsFolder); List urls = new ArrayList<>(); @@ -56,7 +56,6 @@ public List download() throws IOException, InterruptedException { // Downloads PGS files List list = new ArrayList<>(); - System.out.println(urlAllMeta); list.add(downloadFile(urlAllMeta, pgsFolder.resolve(filename).toString())); String baseUrl = urlAllMeta.replace(filename, "").replace("metadata", "scores"); @@ -78,8 +77,8 @@ public List download() throws IOException, InterruptedException { br.close(); // Save version file - saveVersionData(EtlCommons.PGS_DATA, EtlCommons.PGS_DATA, pgsUrlProperties.getVersion(), getTimeStamp(), urls, - pgsFolder.resolve("pgsVersion.json")); + saveVersionData(EtlCommons.PGS_DATA, pgsUrlProperties.getSourceName(), pgsUrlProperties.getVersion(), getTimeStamp(), urls, + pgsFolder.resolve(EtlCommons.PGS_VERSION_FILENAME)); logger.info("Done. 
Downloaded PGS files!"); From fd6ac92493a28c1c16443866d98c1dead29f89d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 19 Dec 2023 14:46:02 +0100 Subject: [PATCH 003/107] lib: copy PGS version file from download to generated json directory, #TASK-5407, #TASK-5387 --- .../app/cli/admin/executors/BuildCommandExecutor.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 7586afd7aa..6f8fa3db0b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -38,7 +38,6 @@ import java.util.Collections; import java.util.List; -import static org.opencb.cellbase.lib.EtlCommons.PGS_DATA; import static org.opencb.cellbase.lib.EtlCommons.PHARMGKB_DATA; /** @@ -450,9 +449,9 @@ private CellBaseBuilder buildPolygenicScores() throws IOException { } logger.info("Copying PGS version file..."); - if (inFolder.resolve(PGS_DATA).resolve(EtlCommons.PGS_VERSION_FILENAME).toFile().exists()) { - Files.copy(inFolder.resolve(PGS_DATA).resolve(EtlCommons.PGS_VERSION_FILENAME), - outFolder.resolve(EtlCommons.PGS_VERSION_FILENAME), StandardCopyOption.REPLACE_EXISTING); + if (inFolder.resolve(EtlCommons.PGS_VERSION_FILENAME).toFile().exists()) { + Files.copy(inFolder.resolve(EtlCommons.PGS_VERSION_FILENAME), outFolder.resolve(EtlCommons.PGS_VERSION_FILENAME), + StandardCopyOption.REPLACE_EXISTING); } String basename = PolygenicScoreBuilder.VARIANT_POLYGENIC_SCORE_FILENAME.split("\\.")[0]; From 53aa886552530ecbce98c9ddd54aa29051023130 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 19 Dec 2023 18:03:54 +0100 Subject: [PATCH 
004/107] lib: update the load command to support PGS data, #TASK-5410, #TASK-5387 --- .../admin/executors/LoadCommandExecutor.java | 51 ++++++++++++++++++- .../src/main/resources/mongodb-indexes.json | 5 ++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 5a8fd9417b..bc6f722dc0 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -24,6 +24,7 @@ import org.opencb.cellbase.core.models.DataRelease; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.cellbase.lib.builders.PolygenicScoreBuilder; import org.opencb.cellbase.lib.impl.core.CellBaseDBAdaptor; import org.opencb.cellbase.lib.indexer.IndexManager; import org.opencb.cellbase.lib.loader.LoadRunner; @@ -81,7 +82,7 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA, - EtlCommons.PHARMACOGENOMICS_DATA}; + EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PGS_DATA}; } else { loadOptions = loadCommandOptions.data.split(","); } @@ -299,6 +300,11 @@ public void execute() throws CellBaseException { loadPharmacogenomica(); break; } + case EtlCommons.PGS_DATA: { + // Load data, create index and update release + loadPolygenicScores(); + break; + } default: logger.warn("Not valid 'data'. 
We should not reach this point"); break; @@ -589,6 +595,49 @@ private void loadPharmacogenomica() throws IOException, CellBaseException { dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PHARMACOGENOMICS_DATA, sources); } + private void loadPolygenicScores() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, + IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { + Path pgsPath = input.resolve(EtlCommons.PGS_DATA); + + if (!Files.exists(pgsPath)) { + logger.warn("Polygenic scores (PGS) folder {} not found to load", pgsPath); + return; + } + + // Load common polygenic scores data + Path pathToLoad = pgsPath.resolve(PolygenicScoreBuilder.COMMON_POLYGENIC_SCORE_FILENAME); + logger.info("Loading file '{}'", pathToLoad.toFile().getName()); + try { + loadRunner.load(pathToLoad, "common_polygenic_score", dataRelease); + } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException + | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException + | LoaderException e) { + logger.error("Error loading file '{}': {}", pathToLoad.toFile().getName(), e.toString()); + } + + // Load variant polygenic scores data + pathToLoad = pgsPath.resolve(PolygenicScoreBuilder.VARIANT_POLYGENIC_SCORE_FILENAME); + logger.info("Loading file '{}'", pathToLoad.toFile().getName()); + try { + loadRunner.load(pathToLoad, "variant_polygenic_score", dataRelease); + } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException + | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException + | LoaderException e) { + logger.error("Error loading file '{}': {}", pathToLoad.toFile().getName(), e.toString()); + } + + // Create index + createIndex("variant_polygenic_score"); + 
createIndex("common_polygenic_score"); + + // Update release (collection and sources) + List sources = new ArrayList<>(Arrays.asList( + input.resolve(EtlCommons.PGS_DATA + "/" + EtlCommons.PGS_VERSION_FILENAME) + )); + dataReleaseManager.update(dataRelease, "variant_polygenic_score", EtlCommons.PGS_DATA, sources); + dataReleaseManager.update(dataRelease, "common_polygenic_score", null, null); + } + private void createIndex(String collection) { if (!createIndexes) { return; diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json b/cellbase-lib/src/main/resources/mongodb-indexes.json index de81c7b83b..160427bcdd 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -145,3 +145,8 @@ {"collection": "pharmacogenomics", "fields": {"variants.phenotypeType": 1}, "options": {"background": true}} {"collection": "pharmacogenomics", "fields": {"variants.confidence": 1}, "options": {"background": true}} {"collection": "pharmacogenomics", "fields": {"variants.evidences.pubmed": 1}, "options": {"background": true}} + +{"collection": "common_polygenic_score", "fields": {"id": 1}, "options": {"background": true}} +{"collection": "variant_polygenic_score", "fields": {"_chunkIds": 1}, "options": {"background": true}} +{"collection": "variant_polygenic_score", "fields": {"chromosome": 1, "position": 1}, "options": {"background": true}} +{"collection": "variant_polygenic_score", "fields": {"polygenicScores.id": 1}, "options": {"background": true}} From ecfee15b7b603e2df6c230a6f750f4b1af2eccfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 20 Dec 2023 17:57:34 +0100 Subject: [PATCH 005/107] lib: update CellBase downloader to take into account the AlphaMissense predictions, #TASK-5418, #TASK-5388 --- .../app/cli/admin/AdminCliOptionsParser.java | 2 +- .../executors/DownloadCommandExecutor.java | 3 + .../core/config/DownloadProperties.java | 10 +++ 
.../src/main/resources/configuration.yml | 5 ++ .../org/opencb/cellbase/lib/EtlCommons.java | 3 + .../AlphaMissenseDownloadManager.java | 61 +++++++++++++++++++ .../cellbase/lib/download/Downloader.java | 5 ++ 7 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 6049ef9b4b..57a7e29041 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -89,7 +89,7 @@ public class DownloadCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, " + "variation, variation_functional_score, regulation, protein, conservation, " - + "clinical_variants, repeats, svs, pubmed and 'all' to download everything", required = true, arity = 1) + + "clinical_variants, repeats, svs, pubmed, alphamissense; and 'all' to download everything", required = true, arity = 1) public String data; @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index f8197e6558..d905f83f98 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -105,6 +105,9 @@ public void execute() { case EtlCommons.PHARMACOGENOMICS_DATA: 
downloadFiles.addAll(downloader.downloadPharmKGB()); break; + case EtlCommons.ALPHAMISSENSE_DATA: + downloadFiles.addAll(downloader.downloadAlphaMissense()); + break; default: System.out.println("Value \"" + data + "\" is not allowed for the data parameter. Allowed values" + " are: {genome, gene, gene_disease_association, variation, variation_functional_score," diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index ee4216f560..06e79c031e 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -74,6 +74,7 @@ public class DownloadProperties { private URLProperties revel; private URLProperties pubmed; private URLProperties pharmGKB; + private URLProperties alphaMissense; public EnsemblProperties getEnsembl() { return ensembl; @@ -485,6 +486,15 @@ public DownloadProperties setPharmGKB(URLProperties pharmGKB) { return this; } + public URLProperties getAlphaMissense() { + return alphaMissense; + } + + public DownloadProperties setAlphaMissense(URLProperties alphaMissense) { + this.alphaMissense = alphaMissense; + return this; + } + public DownloadProperties setRefSeqProteinFasta(URLProperties refSeqProteinFasta) { this.refSeqProteinFasta = refSeqProteinFasta; return this; diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 0f8d199118..cef9cf79a0 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -177,6 +177,11 @@ download: - https://api.pharmgkb.org/v1/download/file/data/clinicalVariants.zip - https://api.pharmgkb.org/v1/download/file/data/drugLabels.zip - https://api.pharmgkb.org/v1/download/file/data/relationships.zip + alphaMissense: + host: 
https://github.com/google-deepmind/alphamissense + version: "Aug. 3, 2023" + files: + - https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz species: vertebrates: - id: hsapiens diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 4396f0c2f1..fa3105b29a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -92,6 +92,9 @@ public class EtlCommons { public static final String PUBMED_DATA = "pubmed"; + public static final String ALPHAMISSENSE_DATA = "alphamissense"; + public static final String ALPHAMISSENSE_VERSION_FILENAME = "alphamissenseVersion.json"; + // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java new file mode 100644 index 0000000000..9f4b43fbfb --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java @@ -0,0 +1,61 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.download; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.lib.EtlCommons; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import static org.opencb.cellbase.lib.EtlCommons.ALPHAMISSENSE_VERSION_FILENAME; + +public class AlphaMissenseDownloadManager extends AbstractDownloadManager { + + public AlphaMissenseDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) + throws IOException, CellBaseException { + super(species, assembly, targetDirectory, configuration); + } + + @Override + public List download() throws IOException, InterruptedException { + logger.info("Downloading AlphaMissense file..."); + + // Downloads AlphaMissense file + DownloadProperties.URLProperties alphaMissenseUrlProps = configuration.getDownload().getAlphaMissense(); + + List list = new ArrayList<>(); + for (String file : alphaMissenseUrlProps.getFiles()) { + String filename = new File(file).getName(); + logger.info("\tDownloading file " + filename); + list.add(downloadFile(file, downloadFolder.resolve(filename).toAbsolutePath().toString())); + } + + // Save version + saveVersionData(EtlCommons.ALPHAMISSENSE_DATA, EtlCommons.ALPHAMISSENSE_DATA, alphaMissenseUrlProps.getVersion(), getTimeStamp(), + alphaMissenseUrlProps.getFiles(), downloadFolder.resolve(ALPHAMISSENSE_VERSION_FILENAME)); + + logger.info("Downloaded AlphaMissense file. 
Done!"); + + return list; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java index 17022cae4b..65f91a06d6 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java @@ -98,4 +98,9 @@ public List downloadPharmKGB() throws IOException, CellBaseExcepti PharmGKBDownloadManager manager = new PharmGKBDownloadManager(species, assembly, outputDirectory, configuration); return manager.download(); } + + public List downloadAlphaMissense() throws IOException, CellBaseException, InterruptedException { + AlphaMissenseDownloadManager manager = new AlphaMissenseDownloadManager(species, assembly, outputDirectory, configuration); + return manager.download(); + } } From 967d4cc8e7ef8a0e9754fb1cf4db22aa59a8c0b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 21 Dec 2023 16:55:24 +0100 Subject: [PATCH 006/107] lib: update builder to build AlphaMissense predictions, #TASK-5419, #TASK-5388 --- .../app/cli/admin/AdminCliOptionsParser.java | 2 +- .../admin/executors/BuildCommandExecutor.java | 22 +++ .../lib/builders/AlphaMissenseBuilder.java | 184 ++++++++++++++++++ .../lib/builders/utils/RocksDBUtils.java | 68 +++++++ 4 files changed, 275 insertions(+), 1 deletion(-) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/utils/RocksDBUtils.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 57a7e29041..d3ab054ddb 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -104,7 +104,7 @@ public class BuildCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, " + "gene, variation, variation_functional_score, regulation, protein, ppi, conservation, drug, " - + "clinical_variants, repeats, svs, splice_score, pubmed. 'all' builds everything.", required = true, arity = 1) + + "clinical_variants, repeats, svs, splice_score, pubmed and alphamissense; and 'all' builds everything.", required = true, arity = 1) public String data; @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 8c0d477023..bde6c86c2a 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -168,6 +168,9 @@ public void execute() { case EtlCommons.PHARMACOGENOMICS_DATA: parser = buildPharmacogenomics(); break; + case EtlCommons.ALPHAMISSENSE_DATA: + parser = buildAlphaMissense(); + break; default: logger.error("Build option '" + buildCommandOptions.data + "' is not valid"); break; @@ -437,4 +440,23 @@ private CellBaseBuilder buildPharmacogenomics() throws IOException { CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder); return new PharmGKBBuilder(inFolder, serializer); } + + private CellBaseBuilder buildAlphaMissense() throws IOException { + Path pubmedInputFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); + Path pubmedOutputFolder = buildFolder.resolve(EtlCommons.PUBMED_DATA); + if 
(!pubmedOutputFolder.toFile().exists()) { + pubmedOutputFolder.toFile().mkdirs(); + } + + logger.info("Copying AlphaMissense version file..."); + if (downloadFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME).toFile().exists()) { + Files.copy(downloadFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME), + buildFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME), StandardCopyOption.REPLACE_EXISTING); + } + + String alphaMissenseFilename = new File(configuration.getDownload().getAlphaMissense().getFiles().get(0)).getName(); + File alphaMissenseFile = downloadFolder.resolve(alphaMissenseFilename).toFile(); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.ALPHAMISSENSE_DATA); + return new AlphaMissenseBuilder(alphaMissenseFile, serializer); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java new file mode 100644 index 0000000000..c4144df58d --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java @@ -0,0 +1,184 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.builders; + +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import com.fasterxml.jackson.databind.ObjectWriter; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; +import org.opencb.biodata.models.core.ProteinSubstitutionScore; +import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.builders.utils.RocksDBUtils; +import org.opencb.commons.utils.FileUtils; +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksIterator; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class AlphaMissenseBuilder extends CellBaseBuilder { + + private File alphaMissenseFile; + private CellBaseFileSerializer fileSerializer; + + private RocksDB rdb; + + private String AA_CHANGE_PATTERN = "^([A-Z])(\\d+)([A-Z])$"; + private Pattern aaChangePattern = Pattern.compile(AA_CHANGE_PATTERN); + + private static ObjectMapper mapper; + private static ObjectReader predictionReader; + private static ObjectWriter jsonObjectWriter; + + static { + mapper = new ObjectMapper(); + mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); + predictionReader = mapper.readerFor(ProteinSubstitutionPrediction.class); + jsonObjectWriter = mapper.writer(); + } + + public AlphaMissenseBuilder(File alphaMissenseFile, CellBaseFileSerializer serializer) { + super(serializer); + + this.fileSerializer = serializer; + this.alphaMissenseFile = alphaMissenseFile; + + logger = LoggerFactory.getLogger(AlphaMissenseBuilder.class); + } + + @Override + public void parse() throws Exception { + logger.info("Parsing AlphaMissense file: {} ...", 
alphaMissenseFile.getName()); + + // Sanity check + FileUtils.checkFile(alphaMissenseFile.toPath()); + + Object[] dbConnection = RocksDBUtils.getDBConnection(serializer.getOutdir().resolve("rdb.idx").toString(), true); + rdb = (RocksDB) dbConnection[0]; + Options dbOption = (Options) dbConnection[1]; + String dbLocation = (String) dbConnection[2]; + + // AlphaMissense file reader + BufferedReader br = FileUtils.newBufferedReader(alphaMissenseFile.toPath()); + String line; + int counter = 0; + while ((line = br.readLine()) != null) { + if (!line.startsWith("#")) { + // 0 1 2 3 4 5 6 7 8 9 + // CHROM POS REF ALT genome uniprot_id transcript_id protein_variant am_pathogenicity am_class + String[] split = line.split("\t", -1); + + String transcriptId; + String uniprotId; + int position; + String aaReference; + String aaAlternate; + + if (StringUtils.isNotEmpty(split[6])) { + transcriptId = split[6]; + } else { + logger.warn("Missing field 'transcript_id', skipping line: {}", line); + return; + } + if (StringUtils.isNotEmpty(split[5])) { + uniprotId = split[5]; + } else { + logger.warn("Missing field 'uniprot_id', skipping line: {}", line); + return; + } + if (StringUtils.isNotEmpty(split[7])) { + Matcher matcher = aaChangePattern.matcher(split[7]); + if (matcher.matches()) { + aaReference = matcher.group(1); + position = Integer.parseInt(matcher.group(2)); + aaAlternate = matcher.group(3); + } else { + logger.warn("Error parsing field 'protein_variant' = {}, skipping line: {}", split[7], line); + return; + } + } else { + logger.warn("Missing field 'protein_variant', skipping line: {}", line); + return; + } + + // Create protein substitution score + ProteinSubstitutionScore score = new ProteinSubstitutionScore(); + score.setAaAlternate(aaAlternate); + if (StringUtils.isNotEmpty(split[8])) { + score.setScore(Double.parseDouble(split[8])); + } + if (StringUtils.isNotEmpty(split[9])) { + score.setEffect(split[9]); + } + + // Creating and/or updating protein substitution 
prediction + ProteinSubstitutionPrediction prediction; + String key = transcriptId + "_" + uniprotId + "_" + position + "_" + aaReference; + byte[] dbContent = rdb.get(key.getBytes()); + if (dbContent == null) { + prediction = new ProteinSubstitutionPrediction(transcriptId, uniprotId, position, aaReference, "AlphaMissense", + Collections.singletonList(score)); + } else { + prediction = predictionReader.readValue(dbContent); + prediction.getScores().add(score); + } + rdb.put(key.getBytes(), jsonObjectWriter.writeValueAsBytes(prediction)); + + // Log messages + counter++; + if (counter % 10000 == 0) { + logger.info("{} AlphaMissense predictions parsed", counter); + } + } + } + + // Serialize/write the saved variant polygenic scores in the RocksDB + serializeRDB(rdb); + RocksDBUtils.closeIndex(rdb, dbOption, dbLocation); + serializer.close(); + + logger.info("Parsed AlphaMissense file: {}. Done!", alphaMissenseFile.getName()); + } + + private void serializeRDB(RocksDB rdb) throws IOException { + // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's + // named "iterator" + RocksIterator rocksIterator = rdb.newIterator(); + + logger.info("Reading from RocksDB index and serializing to {}.json.gz", serializer.getOutdir().resolve(serializer.getFileName())); + int counter = 0; + for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { + logger.info("variant = {}", new String(rocksIterator.key())); + ProteinSubstitutionPrediction prediction = predictionReader.readValue(rocksIterator.value()); + serializer.serialize(prediction); + counter++; + if (counter % 10000 == 0) { + logger.info("{} written", counter); + } + } + serializer.close(); + logger.info("Done."); + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/utils/RocksDBUtils.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/utils/RocksDBUtils.java new file mode 100644 index 
0000000000..f6183e3040 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/utils/RocksDBUtils.java @@ -0,0 +1,68 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.builders.utils; + +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + +public class RocksDBUtils { + + public static void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws IOException { + if (rdb != null) { + rdb.close(); + } + if (dbOption != null) { + dbOption.dispose(); + } + if (dbLocation != null && Files.exists(Paths.get(dbLocation))) { + org.apache.commons.io.FileUtils.deleteDirectory(new File(dbLocation)); + } + } + + public static Object[] getDBConnection(String dbLocation, boolean forceCreate) throws RocksDBException { + boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation)); + // a static method that loads the RocksDB C++ library. + RocksDB.loadLibrary(); + // the Options class contains a set of configurable DB options + // that determines the behavior of a database. 
+ Options options = new Options().setCreateIfMissing(true); + +// options.setMaxBackgroundCompactions(4); +// options.setMaxBackgroundFlushes(1); +// options.setCompressionType(CompressionType.NO_COMPRESSION); +// options.setMaxOpenFiles(-1); +// options.setIncreaseParallelism(4); +// options.setCompactionStyle(CompactionStyle.LEVEL); +// options.setLevelCompactionDynamicLevelBytes(true); + + RocksDB db; + // a factory method that returns a RocksDB instance + if (indexingNeeded) { + db = RocksDB.open(options, dbLocation); + } else { + db = RocksDB.openReadOnly(options, dbLocation); + } + + return new Object[]{db, options, dbLocation, indexingNeeded}; + } +} From 345cd1c6d0828172ed0c2a76165012cdaa763d47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 22 Dec 2023 09:41:01 +0100 Subject: [PATCH 007/107] lib: update PGS builder according to PGS data models changes, #TASK-5407, #TASK-5387 --- .../lib/builders/PolygenicScoreBuilder.java | 196 ++++++++++++------ 1 file changed, 134 insertions(+), 62 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java index a3af85cdf2..a534244c3b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -24,6 +24,8 @@ import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.OntologyTermAnnotation; +import org.opencb.biodata.models.core.PubmedReference; import org.opencb.biodata.models.core.pgs.*; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; @@ -59,19 +61,53 @@ public class PolygenicScoreBuilder extends CellBaseBuilder { 
public static final String COMMON_POLYGENIC_SCORE_FILENAME = "common_polygenic_score.json.gz"; public static final String VARIANT_POLYGENIC_SCORE_FILENAME = "variant_polygenic_score.json.gz"; - private static final String RSID = "rsID"; - private static final String CHR_NAME = "chr_name"; - private static final String EFFECT_ALLELE = "effect_allele"; - private static final String OTHER_ALLELE = "other_allele"; - private static final String EFFECT_WEIGHT = "effect_weight"; - private static final String ALLELEFREQUENCY_EFFECT = "allelefrequency_effect"; - private static final String LOCUS_NAME = "locus_name"; - private static final String OR = "OR"; - private static final String HM_SOURCE = "hm_source"; - private static final String HM_RSID = "hm_rsID"; - private static final String HM_CHR = "hm_chr"; - private static final String HM_POS = "hm_pos"; - private static final String HM_INFEROTHERALLELE = "hm_inferOtherAllele"; + private static final String RSID_COL = "rsID"; + private static final String CHR_NAME_COL = "chr_name"; + private static final String EFFECT_ALLELE_COL = "effect_allele"; + private static final String OTHER_ALLELE_COL = "other_allele"; + private static final String EFFECT_WEIGHT_COL = "effect_weight"; + private static final String ALLELEFREQUENCY_EFFECT_COL = "allelefrequency_effect"; + private static final String ODDS_RATIO_COL = "OR"; + private static final String HAZARD_RATIO_COL = "HR"; + private static final String LOCUS_NAME_COL = "locus_name"; + private static final String IS_HAPLOTYPE_COL = "is_haplotype"; + private static final String IS_DIPLOTYPE_COL = "is_diplotype"; + private static final String IMPUTATION_METHOD_COL = "imputation_method"; + private static final String VARIANT_DESCRIPTION_COL = "variant_description"; + private static final String INCLUSION_CRITERIA_COL = "inclusion_criteria"; + private static final String IS_INTERACTION_COL = "is_interaction"; + private static final String IS_DOMINANT_COL = "is_dominant"; + private 
static final String IS_RECESSIVE_COL = "is_recessive"; + private static final String DOSAGE_0_WEIGHT_COL = "dosage_0_weight"; + private static final String DOSAGE_1_WEIGHT_COL = "dosage_1_weight"; + private static final String DOSAGE_2_WEIGHT_COL = "dosage_2_weight"; + private static final String HM_RSID_COL = "hm_rsID"; + private static final String HM_CHR_COL = "hm_chr"; + private static final String HM_POS_COL = "hm_pos"; + private static final String HM_INFEROTHERALLELE_COL = "hm_inferOtherAllele"; + + public static final String SAMPLE_SET_KEY = "Sample Set"; + public static final String ODDS_RATIO_KEY = "Odds ratio"; + public static final String HAZARD_RATIO_KEY = "Hazard ratio"; + public static final String BETA_KEY = "Beta"; + public static final String AUROC_KEY = "AUROC"; // Area Under the Receiver-Operating Characteristic Curve (AUROC) + public static final String CINDEX_KEY = "C-index"; // Concordance Statistic (C-index) + public static final String OTHER_KEY = "Other metric"; + private static final String EFFECT_WEIGHT_KEY = "Effect weight"; + private static final String ALLELE_FREQUENCY_EFFECT_KEY = "Allele frequency effect"; + private static final String LOCUS_NAME_KEY = "Locus name"; + private static final String IS_HAPLOTYPE_KEY = "Haplotype"; + private static final String IS_DIPLOTYPE_KEY = "Diplotype"; + private static final String IMPUTATION_METHOD_KEY = "Imputation method"; + private static final String VARIANT_DESCRIPTION_KEY = "Variant description"; + private static final String INCLUSION_CRITERIA_KEY = "Score inclusion criteria"; + private static final String IS_INTERACTION_KEY = "Interaction"; + private static final String IS_DOMINANT_KEY = "Dominant inheritance model"; + private static final String IS_RECESSIVE_KEY = "Recessive inheritance model"; + private static final String DOSAGE_0_WEIGHT_KEY = "Effect weight with 0 copy of the effect allele"; + private static final String DOSAGE_1_WEIGHT_KEY = "Effect weight with 1 copy of the effect 
allele"; + private static final String DOSAGE_2_WEIGHT_KEY = "Effect weight with 2 copies of the effect allele"; + static { mapper = new ObjectMapper(); @@ -112,7 +148,7 @@ public void parse() throws Exception { logger.info("Processing PGS file: {}", file.getName()); String pgsId = null; - Map labelPos = new HashMap<>(); + Map columnPos = new HashMap<>(); BufferedReader br = FileUtils.newBufferedReader(file.toPath()); String line; @@ -125,17 +161,17 @@ public void parse() throws Exception { throw new CellBaseException("Error parsing file " + file.getName() + ": pgs_id mismatch"); } } - } else if (line.startsWith(RSID) || line.startsWith(CHR_NAME)) { + } else if (line.startsWith(RSID_COL) || line.startsWith(CHR_NAME_COL)) { String[] fields = line.split("\t"); for (int i = 0; i < fields.length; i++) { - labelPos.put(fields[i], i); + columnPos.put(fields[i], i); } } else { // Sanity check if (pgsId == null) { throw new CellBaseException("Error parsing file " + file.getName() + ": pgs_id is null"); } - saveVariantPolygenicScore(line, labelPos, pgsId); + saveVariantPolygenicScore(line, columnPos, pgsId); } } br.close(); @@ -200,7 +236,8 @@ private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws // digital object identifier (doi) PubMed ID (PMID) StringReader stringReader = new StringReader(line); CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); - pgs.getPubmedIds().add(csvParser.getRecords().get(0).get(8)); + CSVRecord strings = csvParser.getRecords().get(0); + pgs.getPubmedRefs().add(new PubmedReference(strings.get(8), strings.get(2), strings.get(3), strings.get(4), null)); } // PGSxxxxx_metadata_efo_traits.csv @@ -213,7 +250,8 @@ private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws StringReader stringReader = new StringReader(line); CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); CSVRecord strings = csvParser.getRecords().get(0); - pgs.getEfoTraits().add(new EfoTrait(strings.get(0), 
strings.get(1), strings.get(2), strings.get(3))); + pgs.getTraits().add(new OntologyTermAnnotation(strings.get(0), strings.get(1), strings.get(2), "EFO", strings.get(3), + new HashMap<>())); } // PGSxxxxx_metadata_scores.csv @@ -246,7 +284,7 @@ private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws pgs.setName(strings.get(1)); } - // PGSxxxxx_metadata_score_development_samples.csv + // TODO: PGSxxxxx_metadata_score_development_samples.csv // 0 1 2 3 4 // Polygenic Score (PGS) ID Stage of PGS Development Number of Individuals Number of Cases Number of Controls // 5 6 7 8 @@ -279,30 +317,32 @@ private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws continue; } - PerformanceMetrics metrics = new PerformanceMetrics(); - metrics.setId(strings.get(0)); + Map values = new HashMap<>(); + if (StringUtils.isNotEmpty(strings.get(2))) { + values.put(SAMPLE_SET_KEY, strings.get(2)); + } if (StringUtils.isNotEmpty(strings.get(9))) { - metrics.setHazardRatio(strings.get(9)); + values.put(HAZARD_RATIO_KEY, strings.get(9)); } if (StringUtils.isNotEmpty(strings.get(10))) { - metrics.setOddsRatio(strings.get(10)); + values.put(ODDS_RATIO_KEY, strings.get(10)); } if (StringUtils.isNotEmpty(strings.get(11))) { - metrics.setBeta(strings.get(11)); + values.put(BETA_KEY, strings.get(11)); } if (StringUtils.isNotEmpty(strings.get(12))) { - metrics.setAuroc(strings.get(12)); + values.put(AUROC_KEY, strings.get(12)); } if (StringUtils.isNotEmpty(strings.get(13))) { - metrics.setcIndex(strings.get(13)); + values.put(CINDEX_KEY, strings.get(13)); } if (StringUtils.isNotEmpty(strings.get(14))) { - metrics.setOtherMetrics(strings.get(14)); + values.put(OTHER_KEY, strings.get(14)); } - pgs.getPerformanceMetrics().add(metrics); + pgs.getValues().add(values); } - // PGSxxxxx_metadata_evaluation_sample_sets.csv + // TODO: PGSxxxxx_metadata_evaluation_sample_sets.csv // 0 1 2 3 4 // PGS Sample Set (PSS) Polygenic Score (PGS) ID Number of Individuals 
Number of Cases Number of Controls // 5 6 7 @@ -322,10 +362,9 @@ private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws StringReader stringReader = new StringReader(line); CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); CSVRecord strings = csvParser.getRecords().get(0); - pgs.getCohorts().add(new Cohort(strings.get(0), strings.get(1))); + pgs.getCohorts().add(new PgsCohort(strings.get(0), strings.get(1), strings.get(2))); } - // Create PGS object, with the common fields bw.write(jsonObjectWriter.writeValueAsString(pgs)); bw.write("\n"); @@ -336,7 +375,7 @@ private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws } } - private void saveVariantPolygenicScore(String line, Map labelPos, String pgsId) + private void saveVariantPolygenicScore(String line, Map columnPos, String pgsId) throws RocksDBException, IOException { String chrom; int position; @@ -345,48 +384,83 @@ private void saveVariantPolygenicScore(String line, Map labelPo String[] field = line.split("\t", -1); - if (labelPos.containsKey(HM_CHR)) { - chrom = field[labelPos.get(HM_CHR)]; + if (columnPos.containsKey(HM_CHR_COL)) { + chrom = field[columnPos.get(HM_CHR_COL)]; } else { - logger.warn("Missing field '{}', skipping line: {}", HM_CHR, line); + logger.warn("Missing field '{}', skipping line: {}", HM_CHR_COL, line); return; } - if (labelPos.containsKey(HM_POS)) { - position = Integer.parseInt(field[labelPos.get(HM_POS)]); + if (columnPos.containsKey(HM_POS_COL)) { + position = Integer.parseInt(field[columnPos.get(HM_POS_COL)]); } else { - logger.warn("Missing field '{}', skipping line: {}", HM_POS, line); + logger.warn("Missing field '{}', skipping line: {}", HM_POS_COL, line); return; } - if (labelPos.containsKey(EFFECT_ALLELE)) { - effectAllele = field[labelPos.get(EFFECT_ALLELE)]; + if (columnPos.containsKey(EFFECT_ALLELE_COL)) { + effectAllele = field[columnPos.get(EFFECT_ALLELE_COL)]; } else { - logger.warn("Missing field '{}', skipping 
line: {}", EFFECT_ALLELE, line); + logger.warn("Missing field '{}', skipping line: {}", EFFECT_ALLELE_COL, line); return; } - if (labelPos.containsKey(HM_INFEROTHERALLELE) && StringUtils.isNotEmpty(field[labelPos.get(HM_INFEROTHERALLELE)])) { - otherAllele = field[labelPos.get(HM_INFEROTHERALLELE)]; - } else if (labelPos.containsKey(OTHER_ALLELE)) { - otherAllele = field[labelPos.get(OTHER_ALLELE)]; + if (columnPos.containsKey(HM_INFEROTHERALLELE_COL) && StringUtils.isNotEmpty(field[columnPos.get(HM_INFEROTHERALLELE_COL)])) { + otherAllele = field[columnPos.get(HM_INFEROTHERALLELE_COL)]; + } else if (columnPos.containsKey(OTHER_ALLELE_COL)) { + otherAllele = field[columnPos.get(OTHER_ALLELE_COL)]; } else { - logger.warn("Missing fields '{}' and '{}' (at least one is mandatory), skipping line: {}", HM_INFEROTHERALLELE, OTHER_ALLELE, - line); + logger.warn("Missing fields '{}' and '{}' (at least one is mandatory), skipping line: {}", HM_INFEROTHERALLELE_COL, + OTHER_ALLELE_COL, line); return; } // Create polygenic score - PolygenicScore pgs = new PolygenicScore(); - pgs.setId(pgsId); - if (labelPos.containsKey(EFFECT_WEIGHT)) { - pgs.setEffectWeight(Double.parseDouble(field[labelPos.get(EFFECT_WEIGHT)])); + Map values = new HashMap<>(); + if (columnPos.containsKey(EFFECT_WEIGHT_COL)) { + values.put(EFFECT_WEIGHT_KEY, Double.parseDouble(field[columnPos.get(EFFECT_WEIGHT_COL)])); + } + if (columnPos.containsKey(ALLELEFREQUENCY_EFFECT_COL)) { + values.put(ALLELE_FREQUENCY_EFFECT_KEY, Double.parseDouble(field[columnPos.get(ALLELEFREQUENCY_EFFECT_COL)])); + } + if (columnPos.containsKey(ODDS_RATIO_COL)) { + values.put(ODDS_RATIO_KEY, Double.parseDouble(field[columnPos.get(ODDS_RATIO_COL)])); + } + if (columnPos.containsKey(HAZARD_RATIO_COL)) { + values.put(HAZARD_RATIO_KEY, Double.parseDouble(field[columnPos.get(HAZARD_RATIO_COL)])); + } + if (columnPos.containsKey(LOCUS_NAME_COL)) { + values.put(LOCUS_NAME_KEY, field[columnPos.get(LOCUS_NAME_COL)]); + } + if 
(columnPos.containsKey(IS_HAPLOTYPE_COL)) { + values.put(IS_HAPLOTYPE_KEY, field[columnPos.get(IS_HAPLOTYPE_COL)]); + } + if (columnPos.containsKey(IS_DIPLOTYPE_COL)) { + values.put(IS_DIPLOTYPE_KEY, field[columnPos.get(IS_DIPLOTYPE_COL)]); + } + if (columnPos.containsKey(IMPUTATION_METHOD_COL)) { + values.put(IMPUTATION_METHOD_KEY, field[columnPos.get(IMPUTATION_METHOD_COL)]); + } + if (columnPos.containsKey(VARIANT_DESCRIPTION_COL)) { + values.put(VARIANT_DESCRIPTION_KEY, field[columnPos.get(VARIANT_DESCRIPTION_COL)]); + } + if (columnPos.containsKey(INCLUSION_CRITERIA_COL)) { + values.put(INCLUSION_CRITERIA_KEY, field[columnPos.get(INCLUSION_CRITERIA_COL)]); + } + if (columnPos.containsKey(IS_INTERACTION_COL)) { + values.put(IS_INTERACTION_KEY, field[columnPos.get(IS_INTERACTION_COL)]); + } + if (columnPos.containsKey(IS_DOMINANT_COL)) { + values.put(IS_DOMINANT_KEY, field[columnPos.get(IS_DOMINANT_COL)]); + } + if (columnPos.containsKey(IS_RECESSIVE_COL)) { + values.put(IS_RECESSIVE_KEY, field[columnPos.get(IS_RECESSIVE_COL)]); } - if (labelPos.containsKey(ALLELEFREQUENCY_EFFECT)) { - pgs.setAlleleFrequencyEffect(Double.parseDouble(field[labelPos.get(ALLELEFREQUENCY_EFFECT)])); + if (columnPos.containsKey(DOSAGE_0_WEIGHT_COL)) { + values.put(DOSAGE_0_WEIGHT_KEY, field[columnPos.get(DOSAGE_0_WEIGHT_COL)]); } - if (labelPos.containsKey(OR)) { - pgs.setOr(Double.parseDouble(field[labelPos.get(OR)])); + if (columnPos.containsKey(DOSAGE_1_WEIGHT_COL)) { + values.put(DOSAGE_1_WEIGHT_KEY, field[columnPos.get(DOSAGE_1_WEIGHT_COL)]); } - if (labelPos.containsKey(LOCUS_NAME)) { - pgs.setLocusName(field[labelPos.get(LOCUS_NAME)]); + if (columnPos.containsKey(DOSAGE_2_WEIGHT_COL)) { + values.put(DOSAGE_2_WEIGHT_KEY, field[columnPos.get(DOSAGE_2_WEIGHT_COL)]); } // Creating and/or updating variant polygenic score @@ -395,10 +469,10 @@ private void saveVariantPolygenicScore(String line, Map labelPo byte[] dbContent = rdb.get(key.getBytes()); if (dbContent == null) { varPgs = 
new VariantPolygenicScore(chrom, position, otherAllele, effectAllele, - Collections.singletonList(pgs)); + Collections.singletonList(new PolygenicScore(pgsId, values))); } else { varPgs = varPgsReader.readValue(dbContent); - varPgs.getPolygenicScores().add(pgs); + varPgs.getPolygenicScores().add(new PolygenicScore(pgsId, values)); } rdb.put(key.getBytes(), jsonObjectWriter.writeValueAsBytes(varPgs)); } @@ -411,9 +485,7 @@ private void serializeRDB(RocksDB rdb) throws IOException { logger.info("Reading from RocksDB index and serializing to {}.json.gz", serializer.getOutdir().resolve(serializer.getFileName())); int counter = 0; for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { - logger.info("variant = {}", new String(rocksIterator.key())); VariantPolygenicScore varPgs = varPgsReader.readValue(rocksIterator.value()); - logger.info("variant PGS: {}", varPgs.toString()); serializer.serialize(varPgs); counter++; if (counter % 10000 == 0) { From 4f44661baf1e7077f05150b0aabd13226b34485d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 22 Dec 2023 10:29:39 +0100 Subject: [PATCH 008/107] lib: update PGS builder according to Pubmed data models changes, #TASK-5407, #TASK-5387 --- .../cellbase/lib/builders/PolygenicScoreBuilder.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java index a534244c3b..43d2d98eb5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -25,8 +25,11 @@ import org.apache.commons.csv.CSVRecord; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.models.core.OntologyTermAnnotation; -import 
org.opencb.biodata.models.core.PubmedReference; -import org.opencb.biodata.models.core.pgs.*; +import org.opencb.biodata.models.core.pgs.CommonPolygenicScore; +import org.opencb.biodata.models.core.pgs.PgsCohort; +import org.opencb.biodata.models.core.pgs.PolygenicScore; +import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; +import org.opencb.biodata.models.variant.avro.PubmedReference; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.commons.utils.FileUtils; From 436f534f141161d96d5f1b1cd0690cd7d5f9648c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 22 Dec 2023 14:29:37 +0100 Subject: [PATCH 009/107] lib: add PGS manager and adaptor, #TASK-5411, #TASK-5387 --- .../core/api/PolygenicScoreQuery.java | 98 +++++++++++++ .../lib/impl/core/MongoDBAdaptorFactory.java | 4 + .../core/PolygenicScoreMongoDBAdaptor.java | 130 ++++++++++++++++++ .../lib/managers/PolygenicScoreManager.java | 60 ++++++++ 4 files changed, 292 insertions(+) create mode 100644 cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java new file mode 100644 index 0000000000..106b01e1fe --- /dev/null +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java @@ -0,0 +1,98 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.core.api; + +import org.opencb.cellbase.core.api.query.AbstractQuery; +import org.opencb.cellbase.core.api.query.QueryException; +import org.opencb.cellbase.core.api.query.QueryParameter; + +import java.util.List; +import java.util.Map; + +public class PolygenicScoreQuery extends AbstractQuery { + + @QueryParameter(id = "id") + private List ids; + + @QueryParameter(id = "name") + private List names; + + @QueryParameter(id = "source", allowedValues = {"PGS Catalog"}) + private List sources; + + public PolygenicScoreQuery() { + } + + public PolygenicScoreQuery(Map params) throws QueryException { + super(params); + + objectMapper.readerForUpdating(this); + objectMapper.readerFor(PolygenicScoreQuery.class); + objectWriter = objectMapper.writerFor(PolygenicScoreQuery.class); + } + + @Override + protected void validateQuery() throws QueryException { + // Nothing to do + return; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("PolygenicScoreQuery{"); + sb.append("ids=").append(ids); + sb.append(", names=").append(names); + sb.append(", sources=").append(sources); + sb.append(", limit=").append(limit); + sb.append(", skip=").append(skip); + sb.append(", count=").append(count); + sb.append(", sort='").append(sort).append('\''); + sb.append(", order=").append(order); + sb.append(", facet='").append(facet).append('\''); + sb.append(", includes=").append(includes); + sb.append(", excludes=").append(excludes); + sb.append('}'); + return sb.toString(); + } + + public List getIds() { 
+ return ids; + } + + public PolygenicScoreQuery setIds(List ids) { + this.ids = ids; + return this; + } + + public List getNames() { + return names; + } + + public PolygenicScoreQuery setNames(List names) { + this.names = names; + return this; + } + + public List getSources() { + return sources; + } + + public PolygenicScoreQuery setSources(List sources) { + this.sources = sources; + return this; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java index e120e0ae51..05fe4d85a8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java @@ -95,6 +95,10 @@ public PharmacogenomicsMongoDBAdaptor getPharmacogenomicsMongoDBAdaptor() { return new PharmacogenomicsMongoDBAdaptor(mongoDatastore); } + public PolygenicScoreMongoDBAdaptor getPolygenicScoreMongoDBAdaptor() { + return new PolygenicScoreMongoDBAdaptor(mongoDatastore); + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("MongoDBAdaptorFactory{"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java new file mode 100644 index 0000000000..697c7d47b6 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java @@ -0,0 +1,130 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.impl.core; + +import com.mongodb.client.model.Filters; +import org.apache.commons.lang3.StringUtils; +import org.bson.conversions.Bson; +import org.opencb.biodata.models.core.SpliceScore; +import org.opencb.biodata.models.core.SpliceScoreAlternate; +import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; +import org.opencb.biodata.models.pharma.PharmaChemical; +import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.cellbase.core.api.PolygenicScoreQuery; +import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.cellbase.lib.iterator.CellBaseIterator; +import org.opencb.commons.datastore.core.DataResult; +import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.commons.datastore.mongodb.MongoDBCollection; +import org.opencb.commons.datastore.mongodb.MongoDataStore; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class PolygenicScoreMongoDBAdaptor extends CellBaseDBAdaptor + implements CellBaseCoreDBAdaptor { + + public PolygenicScoreMongoDBAdaptor(MongoDataStore mongoDataStore) { + super(mongoDataStore); + + init(); + } + + private void init() { + logger.debug("SpliceScoreMongoDBAdaptor: in 'constructor'"); + + mongoDBCollectionByRelease = buildCollectionByReleaseMap(EtlCommons.SPLICE_SCORE_DATA); + } + + public 
CellBaseDataResult getScores(String chromosome, int position, String reference, String alternate) + throws CellBaseException { + return getScores(chromosome, position, reference, alternate, 0); + } + + public CellBaseDataResult getScores(String chromosome, int position, String reference, String alternate, int dataRelease) + throws CellBaseException { + long dbTimeStart = System.currentTimeMillis(); + +// String ref = StringUtils.isEmpty(reference) ? "-" : reference; +// String alt = StringUtils.isEmpty(alternate) ? "-" : alternate; +// List andBsonList = new ArrayList<>(); +// andBsonList.add(Filters.eq("chromosome", chromosome)); +// andBsonList.add(Filters.eq("position", position)); +// andBsonList.add(Filters.eq("refAllele", ref)); +// Bson query = Filters.and(andBsonList); +//// System.out.println("\t\tgetScores >>>>>>> " + query); +// +// final String id = chromosome + ":" + position + ":" + ref + ":" + alt; +// +// MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease); +// DataResult spliceScoreDataResult = mongoDBCollection.find(query, null, SpliceScore.class, new QueryOptions()); +// +// List results = new ArrayList<>(); +// +// // Search for the right splice score +// if (spliceScoreDataResult.getNumResults() > 0) { +//// System.out.println("\t\tgetScores >>>>>>> num. 
results = " + spliceScoreDataResult.getNumResults()); +// for (SpliceScore score : spliceScoreDataResult.getResults()) { +// for (SpliceScoreAlternate scoreAlternate : score.getAlternates()) { +// if (alt.equals(scoreAlternate.getAltAllele())) { +// score.setAlternates(Collections.singletonList(scoreAlternate)); +//// System.out.println("\t\t\t\tgetScores, MATCH (" + score.getSource() + "): " + alt + " vs " +//// + scoreAlternate.getAltAllele()); +// results.add(score); +// } +// } +// } +// } +// int dbTime = Long.valueOf(System.currentTimeMillis() - dbTimeStart).intValue(); +// return new CellBaseDataResult<>(id, dbTime, new ArrayList<>(), results.size(), results, results.size()); + return null; + } + + @Override + public CellBaseIterator iterator(PolygenicScoreQuery query) throws CellBaseException { + logger.error("Not implemented yet"); + return null; + } + + @Override + public CellBaseDataResult aggregationStats(PolygenicScoreQuery query) { + logger.error("Not implemented yet"); + return null; + } + + @Override + public CellBaseDataResult groupBy(PolygenicScoreQuery query) throws CellBaseException { + logger.error("Not implemented yet"); + return null; + } + + @Override + public CellBaseDataResult distinct(PolygenicScoreQuery query) throws CellBaseException { + logger.error("Not implemented yet"); + return null; + } + + @Override + public List> info(List ids, ProjectionQueryOptions queryOptions, int dataRelease, String apiKey) throws CellBaseException { + logger.error("Not implemented yet"); + return null; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java new file mode 100644 index 0000000000..a398c0c1be --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java @@ -0,0 +1,60 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.managers; + +import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; +import org.opencb.biodata.models.pharma.PharmaChemical; +import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.cellbase.core.api.PolygenicScoreQuery; +import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.impl.core.CellBaseCoreDBAdaptor; +import org.opencb.cellbase.lib.impl.core.PharmacogenomicsMongoDBAdaptor; +import org.opencb.cellbase.lib.impl.core.PolygenicScoreMongoDBAdaptor; + +import java.util.List; + +public class PolygenicScoreManager extends AbstractManager implements AggregationApi { + + private PolygenicScoreMongoDBAdaptor pgsDBAdaptor; + + public PolygenicScoreManager(String species, CellBaseConfiguration configuration) throws CellBaseException { + this(species, null, configuration); + } + + public PolygenicScoreManager(String species, String assembly, CellBaseConfiguration configuration) throws CellBaseException { + super(species, assembly, configuration); + + this.init(); + } + + private void init() { + pgsDBAdaptor = dbAdaptorFactory.getPolygenicScoreMongoDBAdaptor(); + } + + @Override + public CellBaseCoreDBAdaptor getDBAdaptor() { + return pgsDBAdaptor; + } + + public List> info(List 
ids, ProjectionQueryOptions query, int dataRelease, + String apiKey) throws CellBaseException { + return pgsDBAdaptor.info(ids, query, dataRelease, apiKey); + } +} From 314786d2e749528872c508b3bfa2b21ecd6958b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 2 Jan 2024 15:36:24 +0100 Subject: [PATCH 010/107] lib: add support for annotating using PGS, #TASK-5411, #TASK-5387 --- .../admin/executors/LoadCommandExecutor.java | 12 +- .../org/opencb/cellbase/lib/EtlCommons.java | 3 + .../core/PolygenicScoreMongoDBAdaptor.java | 163 +++++++++++------- .../lib/managers/PolygenicScoreManager.java | 10 +- .../VariantAnnotationCalculator.java | 9 + .../FuturePolygenicScoreAnnotator.java | 94 ++++++++++ 6 files changed, 224 insertions(+), 67 deletions(-) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePolygenicScoreAnnotator.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index bc6f722dc0..20b7fe8488 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -608,7 +608,7 @@ private void loadPolygenicScores() throws NoSuchMethodException, InterruptedExce Path pathToLoad = pgsPath.resolve(PolygenicScoreBuilder.COMMON_POLYGENIC_SCORE_FILENAME); logger.info("Loading file '{}'", pathToLoad.toFile().getName()); try { - loadRunner.load(pathToLoad, "common_polygenic_score", dataRelease); + loadRunner.load(pathToLoad, EtlCommons.PGS_COMMON_COLLECTION, dataRelease); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException | IllegalAccessException | ExecutionException | IOException | InterruptedException | 
CellBaseException | LoaderException e) { @@ -619,7 +619,7 @@ private void loadPolygenicScores() throws NoSuchMethodException, InterruptedExce pathToLoad = pgsPath.resolve(PolygenicScoreBuilder.VARIANT_POLYGENIC_SCORE_FILENAME); logger.info("Loading file '{}'", pathToLoad.toFile().getName()); try { - loadRunner.load(pathToLoad, "variant_polygenic_score", dataRelease); + loadRunner.load(pathToLoad, EtlCommons.PGS_VARIANT_COLLECTION, dataRelease); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException | LoaderException e) { @@ -627,15 +627,15 @@ private void loadPolygenicScores() throws NoSuchMethodException, InterruptedExce } // Create index - createIndex("variant_polygenic_score"); - createIndex("common_polygenic_score"); + createIndex(EtlCommons.PGS_COMMON_COLLECTION); + createIndex(EtlCommons.PGS_VARIANT_COLLECTION); // Update release (collection and sources) List sources = new ArrayList<>(Arrays.asList( input.resolve(EtlCommons.PGS_DATA + "/" + EtlCommons.PGS_VERSION_FILENAME) )); - dataReleaseManager.update(dataRelease, "variant_polygenic_score", EtlCommons.PGS_DATA, sources); - dataReleaseManager.update(dataRelease, "common_polygenic_score", null, null); + dataReleaseManager.update(dataRelease, EtlCommons.PGS_VARIANT_COLLECTION, EtlCommons.PGS_DATA, sources); + dataReleaseManager.update(dataRelease, EtlCommons.PGS_COMMON_COLLECTION, null, null); } private void createIndex(String collection) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index d172cf3438..0e6d1538ca 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -47,7 +47,10 @@ public class EtlCommons { public static final String CONSERVATION_DATA = 
"conservation"; public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; + public static final String PGS_DATA = "polygenic_score"; + public static final String PGS_COMMON_COLLECTION = "common_polygenic_scores"; + public static final String PGS_VARIANT_COLLECTION = "variant_polygenic_scores"; public static final String PGS_VERSION_FILENAME = "pgsVersion.json"; public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java index 697c7d47b6..225fc26608 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java @@ -17,30 +17,43 @@ package org.opencb.cellbase.lib.impl.core; import com.mongodb.client.model.Filters; -import org.apache.commons.lang3.StringUtils; +import org.apache.commons.collections4.CollectionUtils; +import org.bson.Document; import org.bson.conversions.Bson; -import org.opencb.biodata.models.core.SpliceScore; -import org.opencb.biodata.models.core.SpliceScoreAlternate; +import org.opencb.biodata.models.core.pgs.CommonPolygenicScore; +import org.opencb.biodata.models.core.pgs.PolygenicScore; import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; -import org.opencb.biodata.models.pharma.PharmaChemical; -import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.biodata.models.variant.avro.PolygenicScoreAnnotation; import org.opencb.cellbase.core.api.PolygenicScoreQuery; import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.EtlCommons; 
import org.opencb.cellbase.lib.iterator.CellBaseIterator; +import org.opencb.cellbase.lib.iterator.CellBaseMongoDBIterator; import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.commons.datastore.core.QueryParam; +import org.opencb.commons.datastore.mongodb.GenericDocumentComplexConverter; import org.opencb.commons.datastore.mongodb.MongoDBCollection; +import org.opencb.commons.datastore.mongodb.MongoDBIterator; import org.opencb.commons.datastore.mongodb.MongoDataStore; import java.util.ArrayList; -import java.util.Collections; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; public class PolygenicScoreMongoDBAdaptor extends CellBaseDBAdaptor - implements CellBaseCoreDBAdaptor { + implements CellBaseCoreDBAdaptor { + + protected Map pgsVariantMongoDBCollectionByRelease; + + private static final GenericDocumentComplexConverter CONVERTER; + + static { + CONVERTER = new GenericDocumentComplexConverter<>(CommonPolygenicScore.class); + } public PolygenicScoreMongoDBAdaptor(MongoDataStore mongoDataStore) { super(mongoDataStore); @@ -49,82 +62,116 @@ public PolygenicScoreMongoDBAdaptor(MongoDataStore mongoDataStore) { } private void init() { - logger.debug("SpliceScoreMongoDBAdaptor: in 'constructor'"); + logger.debug("PolygenicScoreMongoDBAdaptor: in 'constructor'"); - mongoDBCollectionByRelease = buildCollectionByReleaseMap(EtlCommons.SPLICE_SCORE_DATA); + mongoDBCollectionByRelease = buildCollectionByReleaseMap(EtlCommons.PGS_COMMON_COLLECTION); + pgsVariantMongoDBCollectionByRelease = buildCollectionByReleaseMap(EtlCommons.PGS_VARIANT_COLLECTION); } - public CellBaseDataResult getScores(String chromosome, int position, String reference, String alternate) - throws CellBaseException { - return getScores(chromosome, position, reference, alternate, 0); - } - - public CellBaseDataResult getScores(String chromosome, int position, String reference, String alternate, 
int dataRelease) + public CellBaseDataResult getPolygenicScoreAnnotation(String chromosome, int position, String reference, + String alternate, int dataRelease) throws CellBaseException { long dbTimeStart = System.currentTimeMillis(); -// String ref = StringUtils.isEmpty(reference) ? "-" : reference; -// String alt = StringUtils.isEmpty(alternate) ? "-" : alternate; -// List andBsonList = new ArrayList<>(); -// andBsonList.add(Filters.eq("chromosome", chromosome)); -// andBsonList.add(Filters.eq("position", position)); -// andBsonList.add(Filters.eq("refAllele", ref)); -// Bson query = Filters.and(andBsonList); -//// System.out.println("\t\tgetScores >>>>>>> " + query); -// -// final String id = chromosome + ":" + position + ":" + ref + ":" + alt; -// -// MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease); -// DataResult spliceScoreDataResult = mongoDBCollection.find(query, null, SpliceScore.class, new QueryOptions()); -// -// List results = new ArrayList<>(); -// -// // Search for the right splice score -// if (spliceScoreDataResult.getNumResults() > 0) { -//// System.out.println("\t\tgetScores >>>>>>> num. 
results = " + spliceScoreDataResult.getNumResults()); -// for (SpliceScore score : spliceScoreDataResult.getResults()) { -// for (SpliceScoreAlternate scoreAlternate : score.getAlternates()) { -// if (alt.equals(scoreAlternate.getAltAllele())) { -// score.setAlternates(Collections.singletonList(scoreAlternate)); -//// System.out.println("\t\t\t\tgetScores, MATCH (" + score.getSource() + "): " + alt + " vs " -//// + scoreAlternate.getAltAllele()); -// results.add(score); -// } -// } -// } -// } -// int dbTime = Long.valueOf(System.currentTimeMillis() - dbTimeStart).intValue(); -// return new CellBaseDataResult<>(id, dbTime, new ArrayList<>(), results.size(), results, results.size()); - return null; + List andBsonList = new ArrayList<>(); + andBsonList.add(Filters.eq("chromosome", chromosome)); + andBsonList.add(Filters.eq("position", position)); + Bson query = Filters.and(andBsonList); + + MongoDBCollection mongoDBCollection = getCollectionByRelease(pgsVariantMongoDBCollectionByRelease, dataRelease); + DataResult pgsVariantDataResult = mongoDBCollection.find(query, null, VariantPolygenicScore.class, new QueryOptions()); + + List results = new ArrayList<>(); + + // Search for the right polygenic score, i.e., checking reference and alternate with PGS effectAllele and otherAllele + if (pgsVariantDataResult.getNumResults() > 0) { + for (VariantPolygenicScore score : pgsVariantDataResult.getResults()) { + if ((score.getEffectAllele().equals(reference) && score.getOtherAllele().equals(alternate)) + || (score.getEffectAllele().equals(alternate) && score.getOtherAllele().equals(reference))) { + PolygenicScoreAnnotation pgsAnnotation = new PolygenicScoreAnnotation(); + List pgsIds = score.getPolygenicScores().stream().map(PolygenicScore::getId).collect(Collectors.toList()); +// pgsAnnotation.setId(score.get); + pgsAnnotation.getVariants().add(new org.opencb.biodata.models.variant.avro.VariantPolygenicScore( + score.getEffectAllele(), score.getOtherAllele(), 
score.getPolygenicScores()); + results.add(score); + } + } + } + int dbTime = Long.valueOf(System.currentTimeMillis() - dbTimeStart).intValue(); + final String id = chromosome + ":" + position + ":" + reference + ":" + alternate; + return new CellBaseDataResult<>(id, dbTime, new ArrayList<>(), results.size(), results, results.size()); } @Override - public CellBaseIterator iterator(PolygenicScoreQuery query) throws CellBaseException { - logger.error("Not implemented yet"); - return null; + public CellBaseIterator iterator(PolygenicScoreQuery query) throws CellBaseException { + Bson bson = parseQuery(query); + QueryOptions queryOptions = query.toQueryOptions(); + Bson projection = getProjection(query); + MongoDBIterator iterator; + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, query.getDataRelease()); + iterator = mongoDBCollection.iterator(null, bson, projection, CONVERTER, queryOptions); + return new CellBaseMongoDBIterator<>(iterator); } @Override - public CellBaseDataResult aggregationStats(PolygenicScoreQuery query) { + public CellBaseDataResult aggregationStats(PolygenicScoreQuery query) { logger.error("Not implemented yet"); return null; } @Override - public CellBaseDataResult groupBy(PolygenicScoreQuery query) throws CellBaseException { + public CellBaseDataResult groupBy(PolygenicScoreQuery query) throws CellBaseException { logger.error("Not implemented yet"); return null; } @Override public CellBaseDataResult distinct(PolygenicScoreQuery query) throws CellBaseException { - logger.error("Not implemented yet"); - return null; + Bson bsonDocument = parseQuery(query); + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, query.getDataRelease()); + return new CellBaseDataResult<>(mongoDBCollection.distinct(query.getFacet(), bsonDocument, String.class)); } @Override - public List> info(List ids, ProjectionQueryOptions queryOptions, int dataRelease, String apiKey) throws 
CellBaseException { - logger.error("Not implemented yet"); - return null; + public List> info(List ids, ProjectionQueryOptions queryOptions, int dataRelease, String apiKey) throws CellBaseException { + List> results = new ArrayList<>(); + Bson projection = getProjection(queryOptions); + for (String id : ids) { + List orBsonList = new ArrayList<>(ids.size()); + orBsonList.add(Filters.eq("id", id)); + orBsonList.add(Filters.eq("name", id)); + Bson query = Filters.or(orBsonList); + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease); + results.add(new CellBaseDataResult<>(mongoDBCollection.find(query, projection, CONVERTER, new QueryOptions()))); + } + return results; + } + + public Bson parseQuery(PolygenicScoreQuery pharmaQuery) { + List andBsonList = new ArrayList<>(); + try { + for (Map.Entry entry : pharmaQuery.toObjectMap().entrySet()) { + String dotNotationName = entry.getKey(); + Object value = entry.getValue(); + switch (dotNotationName) { + case "token": + case "apiKey": + case "dataRelease": + // do nothing + break; + default: + createAndOrQuery(value, dotNotationName, QueryParam.Type.STRING, andBsonList); + break; + } + } + } catch (IllegalAccessException e) { + e.printStackTrace(); + } + logger.debug("PolygenicScoreQuery parsed query: {}", andBsonList); + if (CollectionUtils.isNotEmpty(andBsonList)) { + return Filters.and(andBsonList); + } else { + return new Document(); + } } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java index a398c0c1be..c2d6bf7af4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java @@ -17,15 +17,13 @@ package org.opencb.cellbase.lib.managers; import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; 
-import org.opencb.biodata.models.pharma.PharmaChemical; -import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.biodata.models.variant.avro.PolygenicScoreAnnotation; import org.opencb.cellbase.core.api.PolygenicScoreQuery; import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.impl.core.CellBaseCoreDBAdaptor; -import org.opencb.cellbase.lib.impl.core.PharmacogenomicsMongoDBAdaptor; import org.opencb.cellbase.lib.impl.core.PolygenicScoreMongoDBAdaptor; import java.util.List; @@ -57,4 +55,10 @@ public List> info(List ids, Pr String apiKey) throws CellBaseException { return pgsDBAdaptor.info(ids, query, dataRelease, apiKey); } + + public CellBaseDataResult getPolygenicScoreAnnotation(String chromosome, Integer start, String reference, + String alternate, int dataRelease) + throws CellBaseException { + return pgsDBAdaptor.getPolygenicScoreAnnotation(chromosome, start, reference, alternate, dataRelease); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java index d54ceb6623..36564c158d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java @@ -35,6 +35,7 @@ import org.opencb.cellbase.core.api.query.QueryException; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.managers.*; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; import 
org.opencb.cellbase.lib.variant.annotation.futures.FuturePharmacogenomicsAnnotator; @@ -518,6 +519,14 @@ private List runAnnotationProcess(List normalizedVar pharmacogenomicsFuture = CACHED_THREAD_POOL.submit(futurePharmacogenomicsAnnotator); } + FuturePolygenicScoreAnnotator futurePolygenicScoreAnnotator = null; + Future>> polygenicScoreFuture = null; + if (annotatorSet.contains(EtlCommons.PGS_DATA)) { + futurePolygenicScoreAnnotator = new FuturePolygenicScoreAnnotator(normalizedVariantList, QueryOptions.empty(), dataRelease, + polygenicScoreManager, logger); + polygenicScoreFuture = CACHED_THREAD_POOL.submit(futurePolygenicScoreAnnotator); + } + // We iterate over all variants to get the rest of the annotations and to create the VariantAnnotation objects Queue variantBuffer = new LinkedList<>(); long startTime = System.currentTimeMillis(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePolygenicScoreAnnotator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePolygenicScoreAnnotator.java new file mode 100644 index 0000000000..9b3cf6654b --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePolygenicScoreAnnotator.java @@ -0,0 +1,94 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.variant.annotation.futures; + +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.pharma.*; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.avro.*; +import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.cellbase.core.api.PolygenicScoreQuery; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.managers.PharmacogenomicsManager; +import org.opencb.cellbase.lib.managers.PolygenicScoreManager; +import org.opencb.commons.datastore.core.QueryOptions; +import org.slf4j.Logger; + +import java.util.*; +import java.util.concurrent.*; +import java.util.stream.Collectors; + +public class FuturePolygenicScoreAnnotator implements Callable>> { + private PolygenicScoreManager polygenicScoreManager; + + private List variantList; + private QueryOptions queryOptions; + private int dataRelease; + + private Logger logger; + + public FuturePolygenicScoreAnnotator(List variantList, QueryOptions queryOptions, int dataRelease, + PolygenicScoreManager polygenicScoreManager, Logger logger) { + this.polygenicScoreManager = polygenicScoreManager; + + this.variantList = variantList; + this.queryOptions = queryOptions; + this.dataRelease = dataRelease; + + this.logger = logger; + } + + @Override + public List> call() throws Exception { + long startTime = System.currentTimeMillis(); + + List> cellBaseDataResultList = new ArrayList<>(variantList.size()); + + logger.debug("PolygenicScore queries..."); + // Want to return only one CellBaseDataResult object per Variant + for (Variant variant : variantList) { + cellBaseDataResultList.add(polygenicScoreManager.getPolygenicScoreAnnotation(variant.getChromosome(), variant.getStart(), + variant.getReference(), variant.getAlternate(), dataRelease)); + } + logger.info("Pharmacogenomics queries performance in {} ms for {} 
variants", System.currentTimeMillis() - startTime, + variantList.size()); + return cellBaseDataResultList; + } + + public void processResults(Future>> pgsFuture, + List variantAnnotationList) + throws InterruptedException, ExecutionException { + List> pgsCellBaseDataResults; + try { + pgsCellBaseDataResults = pgsFuture.get(30, TimeUnit.SECONDS); + } catch (TimeoutException e) { + pgsFuture.cancel(true); + throw new ExecutionException("Unable to finish polygenic scores query on time", e); + } + + if (CollectionUtils.isNotEmpty(pgsCellBaseDataResults)) { + for (int i = 0; i < variantAnnotationList.size(); i++) { + CellBaseDataResult pgsResult = pgsCellBaseDataResults.get(i); + if (pgsResult != null && CollectionUtils.isNotEmpty(pgsResult.getResults())) { + // Set the polygenic scores in the variant annotation + variantAnnotationList.get(i).setPolygenicScores(pgsResult.getResults()); + } + } + } + } +} From b0aeca4ab180a38c98b55a1e7f33073a146b982d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 3 Jan 2024 16:24:25 +0100 Subject: [PATCH 011/107] lib: add PGS data for variant annotation, #TASK-5411, #TASK-5387 --- .../lib/builders/PolygenicScoreBuilder.java | 14 +++---- .../core/PolygenicScoreMongoDBAdaptor.java | 42 ++++++++++++++----- .../lib/managers/CellBaseManagerFactory.java | 13 ++++++ .../lib/managers/PolygenicScoreManager.java | 8 ++-- .../VariantAnnotationCalculator.java | 11 +++-- .../FuturePolygenicScoreAnnotator.java | 14 +++---- 6 files changed, 69 insertions(+), 33 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java index 43d2d98eb5..2869e1f00b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -24,11 +24,11 @@ import 
org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.models.core.OntologyTermAnnotation; import org.opencb.biodata.models.core.pgs.CommonPolygenicScore; import org.opencb.biodata.models.core.pgs.PgsCohort; import org.opencb.biodata.models.core.pgs.PolygenicScore; import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; +import org.opencb.biodata.models.variant.avro.OntologyTermAnnotation; import org.opencb.biodata.models.variant.avro.PubmedReference; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; @@ -320,7 +320,7 @@ private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws continue; } - Map values = new HashMap<>(); + Map values = new HashMap<>(); if (StringUtils.isNotEmpty(strings.get(2))) { values.put(SAMPLE_SET_KEY, strings.get(2)); } @@ -416,18 +416,18 @@ private void saveVariantPolygenicScore(String line, Map columnP } // Create polygenic score - Map values = new HashMap<>(); + Map values = new HashMap<>(); if (columnPos.containsKey(EFFECT_WEIGHT_COL)) { - values.put(EFFECT_WEIGHT_KEY, Double.parseDouble(field[columnPos.get(EFFECT_WEIGHT_COL)])); + values.put(EFFECT_WEIGHT_KEY, field[columnPos.get(EFFECT_WEIGHT_COL)]); } if (columnPos.containsKey(ALLELEFREQUENCY_EFFECT_COL)) { - values.put(ALLELE_FREQUENCY_EFFECT_KEY, Double.parseDouble(field[columnPos.get(ALLELEFREQUENCY_EFFECT_COL)])); + values.put(ALLELE_FREQUENCY_EFFECT_KEY, field[columnPos.get(ALLELEFREQUENCY_EFFECT_COL)]); } if (columnPos.containsKey(ODDS_RATIO_COL)) { - values.put(ODDS_RATIO_KEY, Double.parseDouble(field[columnPos.get(ODDS_RATIO_COL)])); + values.put(ODDS_RATIO_KEY, field[columnPos.get(ODDS_RATIO_COL)]); } if (columnPos.containsKey(HAZARD_RATIO_COL)) { - values.put(HAZARD_RATIO_KEY, Double.parseDouble(field[columnPos.get(HAZARD_RATIO_COL)])); + values.put(HAZARD_RATIO_KEY, 
field[columnPos.get(HAZARD_RATIO_COL)]); } if (columnPos.containsKey(LOCUS_NAME_COL)) { values.put(LOCUS_NAME_KEY, field[columnPos.get(LOCUS_NAME_COL)]); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java index 225fc26608..66d10a2bb4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PolygenicScoreMongoDBAdaptor.java @@ -24,6 +24,7 @@ import org.opencb.biodata.models.core.pgs.PolygenicScore; import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; import org.opencb.biodata.models.variant.avro.PolygenicScoreAnnotation; +import org.opencb.biodata.models.variant.avro.PolygenicScoreVariant; import org.opencb.cellbase.core.api.PolygenicScoreQuery; import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; import org.opencb.cellbase.core.exception.CellBaseException; @@ -39,9 +40,7 @@ import org.opencb.commons.datastore.mongodb.MongoDBIterator; import org.opencb.commons.datastore.mongodb.MongoDataStore; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; public class PolygenicScoreMongoDBAdaptor extends CellBaseDBAdaptor @@ -79,7 +78,8 @@ public CellBaseDataResult getPolygenicScoreAnnotation( Bson query = Filters.and(andBsonList); MongoDBCollection mongoDBCollection = getCollectionByRelease(pgsVariantMongoDBCollectionByRelease, dataRelease); - DataResult pgsVariantDataResult = mongoDBCollection.find(query, null, VariantPolygenicScore.class, new QueryOptions()); + DataResult pgsVariantDataResult = mongoDBCollection.find(query, null, VariantPolygenicScore.class, + new QueryOptions()); List results = new ArrayList<>(); @@ -88,12 +88,33 @@ public CellBaseDataResult getPolygenicScoreAnnotation( for 
(VariantPolygenicScore score : pgsVariantDataResult.getResults()) { if ((score.getEffectAllele().equals(reference) && score.getOtherAllele().equals(alternate)) || (score.getEffectAllele().equals(alternate) && score.getOtherAllele().equals(reference))) { - PolygenicScoreAnnotation pgsAnnotation = new PolygenicScoreAnnotation(); List pgsIds = score.getPolygenicScores().stream().map(PolygenicScore::getId).collect(Collectors.toList()); -// pgsAnnotation.setId(score.get); - pgsAnnotation.getVariants().add(new org.opencb.biodata.models.variant.avro.VariantPolygenicScore( - score.getEffectAllele(), score.getOtherAllele(), score.getPolygenicScores()); - results.add(score); + List> infoResults = info(pgsIds, null, dataRelease, null); + for (CellBaseDataResult infoResult : infoResults) { + CommonPolygenicScore pgs = infoResult.first(); + + // Init PGS + PolygenicScoreAnnotation pgsAnnotation = new PolygenicScoreAnnotation(pgs.getId(), pgs.getName(), pgs.getSource(), + pgs.getVersion(), pgs.getTraits(), pgs.getPubmedRefs(), pgs.getValues(), new ArrayList()); + + // Add PGS variant scores to that PGS + PolygenicScoreVariant pgsVariant = new PolygenicScoreVariant(score.getEffectAllele(), score.getOtherAllele(), + new HashMap<>()); + for (PolygenicScore polygenicScore : score.getPolygenicScores()) { + // Search the matched PGS + System.out.println(">>> polygenic score ID = " + polygenicScore.getId() + ", " + pgs.getId()); + System.out.println(">>> polygenic score variant scores size = " + polygenicScore.getValues().size()); + if (pgs.getId().equals(polygenicScore.getId())) { + System.out.println("FOUND !!!!!!"); + pgsVariant.setValues(polygenicScore.getValues()); + break; + } + } + pgsAnnotation.setVariants(Collections.singletonList(pgsVariant)); + + // Add annotation to the output list + results.add(pgsAnnotation); + } } } } @@ -133,7 +154,8 @@ public CellBaseDataResult distinct(PolygenicScoreQuery query) throws Cel } @Override - public List> info(List ids, 
ProjectionQueryOptions queryOptions, int dataRelease, String apiKey) throws CellBaseException { + public List> info(List ids, ProjectionQueryOptions queryOptions, int dataRelease, + String apiKey) throws CellBaseException { List> results = new ArrayList<>(); Bson projection = getProjection(queryOptions); for (String id : ids) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/CellBaseManagerFactory.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/CellBaseManagerFactory.java index ba6e90e150..9e610b8ae9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/CellBaseManagerFactory.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/CellBaseManagerFactory.java @@ -45,6 +45,7 @@ public class CellBaseManagerFactory { private FileManager fileManager; private PublicationManager publicationManager; private Map pharmacogenomicsManagers; + private Map polygenicScoreManagers; private Map dataReleaseManagers; @@ -67,6 +68,7 @@ public CellBaseManagerFactory(CellBaseConfiguration configuration) { ontologyManagers = new HashMap<>(); dataReleaseManagers = new HashMap<>(); pharmacogenomicsManagers = new HashMap<>(); + polygenicScoreManagers = new HashMap<>(); } private String getMultiKey(String species, String assembly) { @@ -374,4 +376,15 @@ public PharmacogenomicsManager getPharmacogenomicsManager(String species, String } return pharmacogenomicsManagers.get(multiKey); } + + public PolygenicScoreManager getPolygenicScoreManager(String species, String assembly) throws CellBaseException { + String multiKey = getMultiKey(species, assembly); + if (!polygenicScoreManagers.containsKey(multiKey)) { + if (!validateSpeciesAssembly(species, assembly)) { + throw new CellBaseException("Invalid species " + species + " or assembly " + assembly); + } + polygenicScoreManagers.put(multiKey, new PolygenicScoreManager(species, assembly, configuration)); + } + return polygenicScoreManagers.get(multiKey); + } } diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java index c2d6bf7af4..4c0630569e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PolygenicScoreManager.java @@ -16,7 +16,7 @@ package org.opencb.cellbase.lib.managers; -import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; +import org.opencb.biodata.models.core.pgs.CommonPolygenicScore; import org.opencb.biodata.models.variant.avro.PolygenicScoreAnnotation; import org.opencb.cellbase.core.api.PolygenicScoreQuery; import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; @@ -28,7 +28,7 @@ import java.util.List; -public class PolygenicScoreManager extends AbstractManager implements AggregationApi { +public class PolygenicScoreManager extends AbstractManager implements AggregationApi { private PolygenicScoreMongoDBAdaptor pgsDBAdaptor; @@ -47,11 +47,11 @@ private void init() { } @Override - public CellBaseCoreDBAdaptor getDBAdaptor() { + public CellBaseCoreDBAdaptor getDBAdaptor() { return pgsDBAdaptor; } - public List> info(List ids, ProjectionQueryOptions query, int dataRelease, + public List> info(List ids, ProjectionQueryOptions query, int dataRelease, String apiKey) throws CellBaseException { return pgsDBAdaptor.info(ids, query, dataRelease, apiKey); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java index 36564c158d..55db8face9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java @@ -39,6 +39,7 @@ import 
org.opencb.cellbase.lib.managers.*; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; import org.opencb.cellbase.lib.variant.annotation.futures.FuturePharmacogenomicsAnnotator; +import org.opencb.cellbase.lib.variant.annotation.futures.FuturePolygenicScoreAnnotator; import org.opencb.cellbase.lib.variant.hgvs.HgvsCalculator; import org.opencb.commons.datastore.core.QueryOptions; import org.slf4j.Logger; @@ -73,6 +74,7 @@ public class VariantAnnotationCalculator { private RepeatsManager repeatsManager; private ProteinManager proteinManager; private PharmacogenomicsManager pharmacogenomicsManager; + private PolygenicScoreManager polygenicScoreManager; private int dataRelease; private String apiKey; private Set annotatorSet; @@ -107,6 +109,7 @@ public VariantAnnotationCalculator(String species, String assembly, int dataRele this.clinicalManager = cellbaseManagerFactory.getClinicalManager(species, assembly); this.repeatsManager = cellbaseManagerFactory.getRepeatsManager(species, assembly); this.pharmacogenomicsManager = cellbaseManagerFactory.getPharmacogenomicsManager(species, assembly); + this.polygenicScoreManager = cellbaseManagerFactory.getPolygenicScoreManager(species, assembly); // Check data release this.dataRelease = cellbaseManagerFactory.getDataReleaseManager(species, assembly).checkDataRelease(dataRelease); @@ -668,6 +671,9 @@ private List runAnnotationProcess(List normalizedVar if (futurePharmacogenomicsAnnotator != null) { futurePharmacogenomicsAnnotator.processResults(pharmacogenomicsFuture, variantAnnotationList); } + if (futurePolygenicScoreAnnotator != null) { + futurePolygenicScoreAnnotator.processResults(polygenicScoreFuture, variantAnnotationList); + } // Not needed with newCachedThreadPool // fixedThreadPool.shutdown(); @@ -1175,7 +1181,8 @@ private Set getAnnotatorSet(QueryOptions queryOptions) { // 'expression' removed in CB 5.0 annotatorSet = new HashSet<>(Arrays.asList("variation", "traitAssociation", "conservation", 
"functionalScore", "consequenceType", "geneDisease", "drugInteraction", "geneConstraints", "mirnaTargets", "pharmacogenomics", - "cancerGeneAssociation", "cancerHotspots", "populationFrequencies", "repeats", "cytoband", "hgvs")); + "cancerGeneAssociation", "cancerHotspots", "populationFrequencies", "repeats", "cytoband", "hgvs", + EtlCommons.PGS_DATA)); List excludeList = queryOptions.getAsStringList("exclude"); excludeList.forEach(annotatorSet::remove); } @@ -1423,8 +1430,6 @@ private List getConsequenceTypeList(Variant variant, List } private List variantListToRegionList(List variantList) { -// return variantList.stream().map((variant) -> variantToRegion(variant)).collect(Collectors.toList()); - // In great majority of cases returned region list size will equal variant list; this will happen except when // there's a breakend within the variantList List regionList = new ArrayList<>(variantList.size()); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePolygenicScoreAnnotator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePolygenicScoreAnnotator.java index 9b3cf6654b..76f8ed1e85 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePolygenicScoreAnnotator.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePolygenicScoreAnnotator.java @@ -17,21 +17,17 @@ package org.opencb.cellbase.lib.variant.annotation.futures; import org.apache.commons.collections4.CollectionUtils; -import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.models.pharma.*; import org.opencb.biodata.models.variant.Variant; -import org.opencb.biodata.models.variant.avro.*; -import org.opencb.cellbase.core.api.PharmaChemicalQuery; -import org.opencb.cellbase.core.api.PolygenicScoreQuery; +import org.opencb.biodata.models.variant.avro.PolygenicScoreAnnotation; +import 
org.opencb.biodata.models.variant.avro.VariantAnnotation; import org.opencb.cellbase.core.result.CellBaseDataResult; -import org.opencb.cellbase.lib.managers.PharmacogenomicsManager; import org.opencb.cellbase.lib.managers.PolygenicScoreManager; import org.opencb.commons.datastore.core.QueryOptions; import org.slf4j.Logger; -import java.util.*; +import java.util.ArrayList; +import java.util.List; import java.util.concurrent.*; -import java.util.stream.Collectors; public class FuturePolygenicScoreAnnotator implements Callable>> { private PolygenicScoreManager polygenicScoreManager; @@ -65,7 +61,7 @@ public List> call() throws Exceptio cellBaseDataResultList.add(polygenicScoreManager.getPolygenicScoreAnnotation(variant.getChromosome(), variant.getStart(), variant.getReference(), variant.getAlternate(), dataRelease)); } - logger.info("Pharmacogenomics queries performance in {} ms for {} variants", System.currentTimeMillis() - startTime, + logger.info("PolygenicScore queries performance in {} ms for {} variants", System.currentTimeMillis() - startTime, variantList.size()); return cellBaseDataResultList; } From c6c628b2fe855e7508f6b276eff441aa669acd57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 3 Jan 2024 16:46:06 +0100 Subject: [PATCH 012/107] lib: add MongoDB indexes for PGS collections, #TASK-5410, #TASK-5387 --- cellbase-lib/src/main/resources/mongodb-indexes.json | 10 ++++++---- .../src/test/resources/index/mongodb-indexes.json | 7 +++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json b/cellbase-lib/src/main/resources/mongodb-indexes.json index 160427bcdd..c4f7d71476 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -146,7 +146,9 @@ {"collection": "pharmacogenomics", "fields": {"variants.confidence": 1}, "options": {"background": true}} {"collection": 
"pharmacogenomics", "fields": {"variants.evidences.pubmed": 1}, "options": {"background": true}} -{"collection": "common_polygenic_score", "fields": {"id": 1}, "options": {"background": true}} -{"collection": "variant_polygenic_score", "fields": {"_chunkIds": 1}, "options": {"background": true}} -{"collection": "variant_polygenic_score", "fields": {"chromosome": 1, "position": 1}, "options": {"background": true}} -{"collection": "variant_polygenic_score", "fields": {"polygenicScores.id": 1}, "options": {"background": true}} +{"collection": "common_polygenic_scores", "fields": {"id": 1}, "options": {"background": true}} +{"collection": "common_polygenic_scores", "fields": {"name": 1}, "options": {"background": true}} +{"collection": "common_polygenic_scores", "fields": {"source": 1}, "options": {"background": true}} +{"collection": "variant_polygenic_scores", "fields": {"_chunkIds": 1}, "options": {"background": true}} +{"collection": "variant_polygenic_scores", "fields": {"chromosome": 1, "position": 1}, "options": {"background": true}} +{"collection": "variant_polygenic_scores", "fields": {"polygenicScores.id": 1}, "options": {"background": true}} diff --git a/cellbase-lib/src/test/resources/index/mongodb-indexes.json b/cellbase-lib/src/test/resources/index/mongodb-indexes.json index 7c264a469a..a77b79f49f 100644 --- a/cellbase-lib/src/test/resources/index/mongodb-indexes.json +++ b/cellbase-lib/src/test/resources/index/mongodb-indexes.json @@ -127,3 +127,10 @@ {"collection": "splice_score", "fields": {"_chunkIds": 1}, "options": {"background": true}} {"collection": "splice_score", "fields": {"chromosome": 1, "position": 1}, "options": {"background": true}} + +{"collection": "common_polygenic_scores", "fields": {"id": 1}, "options": {"background": true}} +{"collection": "common_polygenic_scores", "fields": {"name": 1}, "options": {"background": true}} +{"collection": "common_polygenic_scores", "fields": {"source": 1}, "options": {"background": true}} 
+{"collection": "variant_polygenic_scores", "fields": {"_chunkIds": 1}, "options": {"background": true}} +{"collection": "variant_polygenic_scores", "fields": {"chromosome": 1, "position": 1}, "options": {"background": true}} +{"collection": "variant_polygenic_scores", "fields": {"polygenicScores.id": 1}, "options": {"background": true}} From e079931db17ef8326f6aed2b291091fd2de91395 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 5 Jan 2024 08:55:04 +0100 Subject: [PATCH 013/107] lib: update AlphaMissenseBuilder according to biodata changes, #TASK-5419, #TASK-5388 --- .../lib/builders/AlphaMissenseBuilder.java | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java index c4144df58d..5e1e0f839f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java @@ -22,7 +22,7 @@ import com.fasterxml.jackson.databind.ObjectWriter; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; -import org.opencb.biodata.models.core.ProteinSubstitutionScore; +import org.opencb.biodata.models.core.ProteinSubstitutionPredictionScore; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.lib.builders.utils.RocksDBUtils; import org.opencb.commons.utils.FileUtils; @@ -75,7 +75,7 @@ public void parse() throws Exception { // Sanity check FileUtils.checkFile(alphaMissenseFile.toPath()); - Object[] dbConnection = RocksDBUtils.getDBConnection(serializer.getOutdir().resolve("rdb.idx").toString(), true); + Object[] dbConnection = 
RocksDBUtils.getDBConnection(serializer.getOutdir().resolve("alphamissense-rdb.idx").toString(), true); rdb = (RocksDB) dbConnection[0]; Options dbOption = (Options) dbConnection[1]; String dbLocation = (String) dbConnection[2]; @@ -90,12 +90,34 @@ public void parse() throws Exception { // CHROM POS REF ALT genome uniprot_id transcript_id protein_variant am_pathogenicity am_class String[] split = line.split("\t", -1); + String chrom = null; + int position; + String reference; + String alternate = null; String transcriptId; String uniprotId; - int position; + int aaPosition; String aaReference; String aaAlternate; + if (StringUtils.isNotEmpty(split[0])) { + chrom = split[0]; + } + if (StringUtils.isNotEmpty(split[1])) { + position = Integer.parseInt(split[1]); + } else { + logger.warn("Missing field 'position', skipping line: {}", line); + return; + } + if (StringUtils.isNotEmpty(split[2])) { + reference = split[2]; + } else { + logger.warn("Missing field 'reference', skipping line: {}", line); + return; + } + if (StringUtils.isNotEmpty(split[3])) { + alternate = split[3]; + } if (StringUtils.isNotEmpty(split[6])) { transcriptId = split[6]; } else { @@ -112,7 +134,7 @@ public void parse() throws Exception { Matcher matcher = aaChangePattern.matcher(split[7]); if (matcher.matches()) { aaReference = matcher.group(1); - position = Integer.parseInt(matcher.group(2)); + aaPosition = Integer.parseInt(matcher.group(2)); aaAlternate = matcher.group(3); } else { logger.warn("Error parsing field 'protein_variant' = {}, skipping line: {}", split[7], line); @@ -124,7 +146,8 @@ public void parse() throws Exception { } // Create protein substitution score - ProteinSubstitutionScore score = new ProteinSubstitutionScore(); + ProteinSubstitutionPredictionScore score = new ProteinSubstitutionPredictionScore(); + score.setAlternate(alternate); score.setAaAlternate(aaAlternate); if (StringUtils.isNotEmpty(split[8])) { score.setScore(Double.parseDouble(split[8])); @@ -135,11 +158,11 @@ 
public void parse() throws Exception { // Creating and/or updating protein substitution prediction ProteinSubstitutionPrediction prediction; - String key = transcriptId + "_" + uniprotId + "_" + position + "_" + aaReference; + String key = transcriptId + "_" + uniprotId + "_" + position + "_" + reference + "_" + aaPosition + "_" + aaReference; byte[] dbContent = rdb.get(key.getBytes()); if (dbContent == null) { - prediction = new ProteinSubstitutionPrediction(transcriptId, uniprotId, position, aaReference, "AlphaMissense", - Collections.singletonList(score)); + prediction = new ProteinSubstitutionPrediction(chrom, position, reference, transcriptId, uniprotId, aaPosition, + aaReference, "AlphaMissense", null, Collections.singletonList(score)); } else { prediction = predictionReader.readValue(dbContent); prediction.getScores().add(score); @@ -170,7 +193,7 @@ private void serializeRDB(RocksDB rdb) throws IOException { logger.info("Reading from RocksDB index and serializing to {}.json.gz", serializer.getOutdir().resolve(serializer.getFileName())); int counter = 0; for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { - logger.info("variant = {}", new String(rocksIterator.key())); +// logger.info("variant = {}", new String(rocksIterator.key())); ProteinSubstitutionPrediction prediction = predictionReader.readValue(rocksIterator.value()); serializer.serialize(prediction); counter++; From 0aa0349086061b4480191c05dd5b5185f5e9a306 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 5 Jan 2024 09:14:32 +0100 Subject: [PATCH 014/107] lib: improve AlphaMissenseBuilder by skipping incomplete lines from downloaded file, #TASK-5419, #TASK-5388 --- .../lib/builders/AlphaMissenseBuilder.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java index 5e1e0f839f..c65cefb245 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java @@ -52,6 +52,8 @@ public class AlphaMissenseBuilder extends CellBaseBuilder { private static ObjectReader predictionReader; private static ObjectWriter jsonObjectWriter; + private static final String SOURCE = "AlphaMissense"; + static { mapper = new ObjectMapper(); mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); @@ -107,13 +109,13 @@ public void parse() throws Exception { position = Integer.parseInt(split[1]); } else { logger.warn("Missing field 'position', skipping line: {}", line); - return; + continue; } if (StringUtils.isNotEmpty(split[2])) { reference = split[2]; } else { logger.warn("Missing field 'reference', skipping line: {}", line); - return; + continue; } if (StringUtils.isNotEmpty(split[3])) { alternate = split[3]; @@ -122,13 +124,13 @@ public void parse() throws Exception { transcriptId = split[6]; } else { logger.warn("Missing field 'transcript_id', skipping line: {}", line); - return; + continue; } if (StringUtils.isNotEmpty(split[5])) { uniprotId = split[5]; } else { logger.warn("Missing field 'uniprot_id', skipping line: {}", line); - return; + continue; } if (StringUtils.isNotEmpty(split[7])) { Matcher matcher = aaChangePattern.matcher(split[7]); @@ -138,11 +140,11 @@ public void parse() throws Exception { aaAlternate = matcher.group(3); } else { logger.warn("Error parsing field 'protein_variant' = {}, skipping line: {}", split[7], line); - return; + continue; } } else { logger.warn("Missing field 'protein_variant', skipping line: {}", line); - return; + continue; } // Create protein substitution score @@ -162,7 +164,7 @@ public void parse() throws Exception { byte[] dbContent = rdb.get(key.getBytes()); if (dbContent == null) 
{ prediction = new ProteinSubstitutionPrediction(chrom, position, reference, transcriptId, uniprotId, aaPosition, - aaReference, "AlphaMissense", null, Collections.singletonList(score)); + aaReference, SOURCE, null, Collections.singletonList(score)); } else { prediction = predictionReader.readValue(dbContent); prediction.getScores().add(score); From b06cc8a05d6d27fecba739cc41509cb383864edc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 5 Jan 2024 10:55:11 +0100 Subject: [PATCH 015/107] lib: update RevelScoreBuilder according to biodata changes, #TASK-5441, #TASK-5388 --- .../lib/builders/RevelScoreBuilder.java | 67 ++++++++++++------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 2ccf0cb2a1..76876bd2d2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -17,8 +17,9 @@ package org.opencb.cellbase.lib.builders; -import org.opencb.biodata.models.core.MissenseVariantFunctionalScore; -import org.opencb.biodata.models.core.TranscriptMissenseVariantFunctionalScore; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; +import org.opencb.biodata.models.core.ProteinSubstitutionPredictionScore; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.slf4j.LoggerFactory; @@ -37,34 +38,44 @@ public class RevelScoreBuilder extends CellBaseBuilder { public RevelScoreBuilder(Path revelDirectoryPath, CellBaseSerializer serializer) { super(serializer); - this.revelFilePath = revelDirectoryPath.resolve("revel-v1.3_all_chromosomes.zip"); + this.revelFilePath = revelDirectoryPath.resolve("revel_grch38_all_chromosomes.csv.zip"); logger = 
LoggerFactory.getLogger(ConservationBuilder.class); } @Override public void parse() throws IOException { - logger.error("processing Revel file at " + revelFilePath.toAbsolutePath()); - ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelFilePath))); + logger.error("Processing Revel file at " + revelFilePath.toAbsolutePath()); + ZipInputStream zis = new ZipInputStream(new FileInputStream(revelFilePath.toFile())); ZipEntry zipEntry = zis.getNextEntry(); - ZipFile zipFile = new ZipFile(String.valueOf(revelFilePath)); + ZipFile zipFile = new ZipFile(revelFilePath.toFile()); InputStream inputStream = zipFile.getInputStream(zipEntry); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); - // skip header + // Skip header line String line = bufferedReader.readLine(); - String[] fields = null; + String[] fields; String lastEntry = null; - String currentEntry = null; - List scores = new ArrayList<>(); - MissenseVariantFunctionalScore predictions = null; + String currentEntry; + List scores = new ArrayList<>(); + ProteinSubstitutionPrediction prediction = null; + + // Main loop, read line by line while ((line = bufferedReader.readLine()) != null) { fields = line.split(","); + // 0 1 2 3 4 5 6 7 8 + // chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid + // 1,35142,35142,G,A,T,M,0.027,ENST00000417324 + + if (StringUtils.isEmpty(fields[0])) { + logger.warn("Missing field 'chr', skipping line: {}", line); + continue; + } String chromosome = fields[0]; - if (".".equalsIgnoreCase(fields[2])) { - // 1,12855835,.,C,A,A,D,0.175 - // skip if invalid position + if (".".equalsIgnoreCase(fields[2]) || StringUtils.isEmpty(fields[2])) { + // Skip line if invalid position + logger.warn("Missing field 'grch38_pos', skipping line: {}", line); continue; } int position = Integer.parseInt(fields[2]); @@ -72,30 +83,36 @@ public void parse() throws IOException { String alternate = fields[4]; String aaReference = 
fields[5]; String aaAlternate = fields[6]; + if (StringUtils.isEmpty(fields[7])) { + logger.warn("Missing field 'REVEL' (i.e., score value), skipping line: {}", line); + continue; + } double score = Double.parseDouble(fields[7]); + String transcriptId = fields[8]; - currentEntry = chromosome + position; + currentEntry = chromosome + ":" + position; - // new chromosome + position, store previous entry + // New chromosome + position, store previous entry if (lastEntry != null && !currentEntry.equals(lastEntry)) { - serializer.serialize(predictions); + serializer.serialize(prediction); scores = new ArrayList<>(); - predictions = null; + prediction = null; } - if (predictions == null) { - predictions = new MissenseVariantFunctionalScore(chromosome, position, reference, SOURCE, scores); + if (prediction == null) { + prediction = new ProteinSubstitutionPrediction(chromosome, position, reference, transcriptId, null, 0, aaReference, + SOURCE, null, scores); } - TranscriptMissenseVariantFunctionalScore predictedScore = new TranscriptMissenseVariantFunctionalScore("", - alternate, aaReference, aaAlternate, score); + ProteinSubstitutionPredictionScore predictedScore = new ProteinSubstitutionPredictionScore(alternate, aaAlternate, score, null); scores.add(predictedScore); - lastEntry = chromosome + position; + lastEntry = chromosome + ":" + position; } - // serialise last entry - serializer.serialize(predictions); + // Serialize last entry + serializer.serialize(prediction); + // Close zis.close(); zipFile.close(); inputStream.close(); From c43436a98e6923dba2156e976234a61c30b10b93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 9 Jan 2024 09:32:19 +0100 Subject: [PATCH 016/107] lib: fix chromosome name and trancript ID in AlphaMissense builder, #TASK-5419, #TASK-5388 --- .../opencb/cellbase/lib/builders/AlphaMissenseBuilder.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java index c65cefb245..475a91d315 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java @@ -52,7 +52,7 @@ public class AlphaMissenseBuilder extends CellBaseBuilder { private static ObjectReader predictionReader; private static ObjectWriter jsonObjectWriter; - private static final String SOURCE = "AlphaMissense"; + private static final String SOURCE = "alphamissense"; static { mapper = new ObjectMapper(); @@ -103,7 +103,7 @@ public void parse() throws Exception { String aaAlternate; if (StringUtils.isNotEmpty(split[0])) { - chrom = split[0]; + chrom = split[0].replace("chr", ""); } if (StringUtils.isNotEmpty(split[1])) { position = Integer.parseInt(split[1]); @@ -121,7 +121,7 @@ public void parse() throws Exception { alternate = split[3]; } if (StringUtils.isNotEmpty(split[6])) { - transcriptId = split[6]; + transcriptId = split[6].split("\\.")[0]; } else { logger.warn("Missing field 'transcript_id', skipping line: {}", line); continue; From fa99029ef688bc75a88e1fcb69a379d39da73fa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 9 Jan 2024 17:27:56 +0100 Subject: [PATCH 017/107] app: minor improvements in AlphaMissense downloader, #TASK-5419, #TASK-5388 --- .../admin/executors/BuildCommandExecutor.java | 20 ++++++------- .../org/opencb/cellbase/lib/EtlCommons.java | 13 +++++--- .../AlphaMissenseDownloadManager.java | 30 ++++++++++++++----- 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 
bde6c86c2a..0d0f1f8bb9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -442,21 +442,21 @@ private CellBaseBuilder buildPharmacogenomics() throws IOException { } private CellBaseBuilder buildAlphaMissense() throws IOException { - Path pubmedInputFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); - Path pubmedOutputFolder = buildFolder.resolve(EtlCommons.PUBMED_DATA); - if (!pubmedOutputFolder.toFile().exists()) { - pubmedOutputFolder.toFile().mkdirs(); + Path inputFolder = downloadFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + Path outputFolder = buildFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + if (!outputFolder.toFile().exists()) { + outputFolder.toFile().mkdirs(); } logger.info("Copying AlphaMissense version file..."); - if (downloadFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME).toFile().exists()) { - Files.copy(downloadFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME), - buildFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME), StandardCopyOption.REPLACE_EXISTING); + if (inputFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME).toFile().exists()) { + Files.copy(inputFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME), + outputFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME), StandardCopyOption.REPLACE_EXISTING); } - String alphaMissenseFilename = new File(configuration.getDownload().getAlphaMissense().getFiles().get(0)).getName(); - File alphaMissenseFile = downloadFolder.resolve(alphaMissenseFilename).toFile(); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.ALPHAMISSENSE_DATA); + File alphaMissenseFile = inputFolder.resolve(EtlCommons.ALPHAMISSENSE_RAW_FILENAME).toFile(); + String basename = EtlCommons.ALPHAMISSENSE_JSON_FILENAME.replace("\\.json\\.gz", ""); + 
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outputFolder, basename); return new AlphaMissenseBuilder(alphaMissenseFile, serializer); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index fa3105b29a..349c70358b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -41,7 +41,6 @@ public class EtlCommons { public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association"; public static final String VARIATION_DATA = "variation"; public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score"; - public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; public static final String REGULATION_DATA = "regulation"; public static final String PROTEIN_DATA = "protein"; public static final String CONSERVATION_DATA = "conservation"; @@ -92,12 +91,18 @@ public class EtlCommons { public static final String PUBMED_DATA = "pubmed"; + // Protein substitution predictions consist of sift, polyphen, revel and alphamissense + public static final String PROTEIN_SUBSTITUTION_PREDICTION_DATA = "protein_substitution_predictions"; + // Sift and polyphen + public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; + // Revel + public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; + // AlphaMissense public static final String ALPHAMISSENSE_DATA = "alphamissense"; + public static final String ALPHAMISSENSE_RAW_FILENAME = "AlphaMissense_hg38.tsv.gz"; + public static final String ALPHAMISSENSE_JSON_FILENAME = "alphamissense_hg38.json.gz"; public static final String ALPHAMISSENSE_VERSION_FILENAME = "alphamissenseVersion.json"; - // Load specific data options - public static final String 
PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; - // Path and file names public static final String GERP_SUBDIRECTORY = "gerp"; public static final String MMSPLICE_SUBDIRECTORY = "mmsplice"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java index 9f4b43fbfb..6b575e8ab7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java @@ -28,6 +28,7 @@ import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.ALPHAMISSENSE_VERSION_FILENAME; +import static org.opencb.cellbase.lib.EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA; public class AlphaMissenseDownloadManager extends AbstractDownloadManager { @@ -37,22 +38,35 @@ public AlphaMissenseDownloadManager(String species, String assembly, Path target } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { logger.info("Downloading AlphaMissense file..."); // Downloads AlphaMissense file DownloadProperties.URLProperties alphaMissenseUrlProps = configuration.getDownload().getAlphaMissense(); - List list = new ArrayList<>(); - for (String file : alphaMissenseUrlProps.getFiles()) { - String filename = new File(file).getName(); - logger.info("\tDownloading file " + filename); - list.add(downloadFile(file, downloadFolder.resolve(filename).toAbsolutePath().toString())); + // Sanity check + if (alphaMissenseUrlProps.getFiles().size() != 1) { + throw new CellBaseException("AlphaMissense configuration mismatch: that downloader only supports to download one single" + + " AlphaMissense file"); + } + + Path alphamissenseFolder = downloadFolder.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA); + if 
(!alphamissenseFolder.toFile().exists()) { + if (!alphamissenseFolder.toFile().mkdirs()) { + throw new IOException("Error creating folder: " + alphamissenseFolder.toAbsolutePath()); + } } + // Download the AlphaMissense file + List list = new ArrayList<>(); + String file = alphaMissenseUrlProps.getFiles().get(0); + String filename = new File(file).getName(); + logger.info("\tDownloading file " + filename); + list.add(downloadFile(file, alphamissenseFolder.resolve(EtlCommons.ALPHAMISSENSE_RAW_FILENAME).toAbsolutePath().toString())); + // Save version - saveVersionData(EtlCommons.ALPHAMISSENSE_DATA, EtlCommons.ALPHAMISSENSE_DATA, alphaMissenseUrlProps.getVersion(), getTimeStamp(), - alphaMissenseUrlProps.getFiles(), downloadFolder.resolve(ALPHAMISSENSE_VERSION_FILENAME)); + saveVersionData(PROTEIN_SUBSTITUTION_PREDICTION_DATA, EtlCommons.ALPHAMISSENSE_DATA, alphaMissenseUrlProps.getVersion(), + getTimeStamp(), alphaMissenseUrlProps.getFiles(), alphamissenseFolder.resolve(ALPHAMISSENSE_VERSION_FILENAME)); logger.info("Downloaded AlphaMissense file. 
Done!"); From 351efe0f1658f02f4f46fecb1e14ab1c92a515ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 9 Jan 2024 17:29:13 +0100 Subject: [PATCH 018/107] lib: add exception to AbstractDownloadManager, #TASK-5419, #TASK-5388 --- .../opencb/cellbase/lib/download/AbstractDownloadManager.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index a4ade6603e..e910409b0f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -119,7 +119,8 @@ private void init() throws CellBaseException, IOException { logger.info("Processing species " + speciesConfiguration.getScientificName()); } - public List download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { + public List download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException, + CellBaseException { return null; } From ecf1d59929cce5bacc067011d50d8d8bba234829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 9 Jan 2024 17:32:06 +0100 Subject: [PATCH 019/107] lib: add indexes to the collection protein_substitution_predictions, and add suppor for loading AlphaMissense in that collection, #TASK-5420, #TASK-5388 --- .../admin/executors/LoadCommandExecutor.java | 24 +++++++++++++++++++ .../src/main/resources/mongodb-indexes.json | 5 ++++ 2 files changed, 29 insertions(+) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 5a8fd9417b..bc2523df66 100644 --- 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -210,6 +210,11 @@ public void execute() throws CellBaseException { EtlCommons.MISSENSE_VARIATION_SCORE_DATA, sources); break; } + case EtlCommons.ALPHAMISSENSE_DATA: { + // Load data, create index and update release + loadAlphaMissense(); + break; + } case EtlCommons.CONSERVATION_DATA: { // Load data, create index and update release loadConservation(); @@ -442,6 +447,25 @@ private void loadProteinFunctionalPrediction() throws NoSuchMethodException, Int dataReleaseManager.update(dataRelease, "protein_functional_prediction", null, null); } + private void loadAlphaMissense() throws NoSuchMethodException, InterruptedException, ExecutionException, + InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, + IOException, CellBaseException, LoaderException { + Path proteinSubstitutionPath = input.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + + // Load data + Path alphamissensePath = proteinSubstitutionPath.resolve(EtlCommons.ALPHAMISSENSE_JSON_FILENAME); + logger.info("Loading file '{}'", alphamissensePath); + loadRunner.load(alphamissensePath, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, dataRelease); + + // Create index + createIndex(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + + // Update release (collection and sources) + List sources = Collections.singletonList(input.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME)); + dataReleaseManager.update(dataRelease, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, + EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, sources); + } + private void loadClinical() throws FileNotFoundException { Path path = input.resolve(EtlCommons.CLINICAL_VARIANTS_ANNOTATED_JSON_FILE); if (Files.exists(path)) { diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json 
b/cellbase-lib/src/main/resources/mongodb-indexes.json index de81c7b83b..5bbff15aa5 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -145,3 +145,8 @@ {"collection": "pharmacogenomics", "fields": {"variants.phenotypeType": 1}, "options": {"background": true}} {"collection": "pharmacogenomics", "fields": {"variants.confidence": 1}, "options": {"background": true}} {"collection": "pharmacogenomics", "fields": {"variants.evidences.pubmed": 1}, "options": {"background": true}} + +{"collection": "protein_substitution_predictions", "fields": {"checksum": 1}, "options": {"background": true}} +{"collection": "protein_substitution_predictions", "fields": {"uniprotId": 1}, "options": {"background": true}} +{"collection": "protein_substitution_predictions", "fields": {"transcriptId": 1}, "options": {"background": true}} +{"collection": "protein_substitution_predictions", "fields": {"aaPosition": 1}, "options": {"background": true}} From 53c433b86ccc6e31ba96996544f7c517ec03c3f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 10 Jan 2024 08:59:42 +0100 Subject: [PATCH 020/107] lib: add the alphamissenseVersion.json content into the data release collection, #TASK-5420, #TASK-5388 --- .../cellbase/app/cli/admin/executors/LoadCommandExecutor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index bc2523df66..58c1234c6b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -461,7 +461,7 @@ private void loadAlphaMissense() throws NoSuchMethodException, InterruptedExcept 
createIndex(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); // Update release (collection and sources) - List sources = Collections.singletonList(input.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME)); + List sources = Collections.singletonList(proteinSubstitutionPath.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME)); dataReleaseManager.update(dataRelease, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, sources); } From ea02c6f69841a892b59dfc02a975d44220c39a8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 10 Jan 2024 09:12:07 +0100 Subject: [PATCH 021/107] app: update Perl script to generate sift/polyphen data according to the new data model, #TASK-5442, #TASK-5388 --- .../protein_function_prediction_matrices.pl | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl index de55722396..0614366a7d 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl @@ -6,6 +6,10 @@ use Digest::MD5 qw(md5 md5_hex md5_base64); use JSON; +#use lib "~/appl/cellbase/build/scripts/ensembl-scripts/"; +#use lib "~/soft/ensembl-variation/modules/"; +#use lib "~/soft/ensembl/modules/"; + use DB_CONFIG; my $species = 'Homo sapiens'; @@ -126,42 +130,61 @@ ## HASH ## my $effect = {}; + $effect->{"chromosome"} = $trans->seq_region_name; $effect->{"transcriptId"} = $trans->stable_id; - $effect->{"checksum"} = $md5seq; - $effect->{"size"} = length($seq); +# $effect->{"checksum"} = $md5seq; +# $effect->{"size"} = length($seq); foreach my $u (@{ $trans->get_all_xrefs('Uniprot/SWISSPROT') }){ $effect->{"uniprotId"} = $u->display_id(); } + $effect->{"source"} = "polyphen"; my 
$polyphen2 = $prot_function_adaptor->fetch_polyphen_predictions_by_translation_md5($md5seq); - for(my $i=1; $i<=length($seq); $i++) { - foreach (my $j=0; $j < @aa_code; $j++) { - if(defined $polyphen2) { + if(defined $polyphen2) { + for(my $i=1; $i<=length($seq); $i++) { + $effect->{"aaPosition"} = $i; + my @scores = (); + foreach (my $j=0; $j < @aa_code; $j++) { @preds = $polyphen2->get_prediction($i, $aa_code[$j]); - $effect->{"aaPositions"}->{$i}->{$aa_code[$j]}->{"pe"} = $effect_code{$preds[0]}; - $effect->{"aaPositions"}->{$i}->{$aa_code[$j]}->{"ps"} = $preds[1]; + if(defined $preds[0] || defined $preds[1]) { + push @scores, {"aaAlternate" => $aa_code[$j], "score" => $preds[0], "effect" => $preds[1]}; + $effect->{"scores"} = \@scores; +# print "-- polyphen = aa pos = " . $i . ", aa_code[" . $j . "] = " .$aa_code[$j] . " -> " . $preds[0] . ", " .$preds[1] . "\n"; + } + } + if(@scores) { + print FILE to_json($effect)."\n"; } } } - my $sift = $prot_function_adaptor->fetch_sift_predictions_by_translation_md5($md5seq); - for(my $i=1; $i<=length($seq); $i++) { - foreach (my $j=0; $j < @aa_code; $j++) { - if(defined $sift) { - @preds = $sift->get_prediction($i, $aa_code[$j]); - $effect->{"aaPositions"}->{$i}->{$aa_code[$j]}->{"se"} = $effect_code{$preds[0]}; - $effect->{"aaPositions"}->{$i}->{$aa_code[$j]}->{"ss"} = $preds[1]; - } - } - } - print FILE to_json($effect)."\n"; + $effect->{"source"} = "sift"; + my $sift = $prot_function_adaptor->fetch_sift_predictions_by_translation_md5($md5seq); + if(defined $sift) { + for(my $i=1; $i<=length($seq); $i++) { + $effect->{"aaPosition"} = $i; + my @scores = (); + foreach (my $j=0; $j < @aa_code; $j++) { + @preds = $sift->get_prediction($i, $aa_code[$j]); + if(defined $preds[0] || defined $preds[1]) { + push @scores, {"aaAlternate" => $aa_code[$j], "score" => $preds[0], "effect" => $preds[1]}; + $effect->{"scores"} = \@scores; +# print "-- sift = aa pos = " . $i . ", aa_code[" . $j . "] = " .$aa_code[$j] . " -> " . 
$preds[0] . ", " .$preds[1] . "\n"; + } + } + if(@scores) { + print FILE to_json($effect)."\n"; + } + } + } +# last; } } close(FILE); ## GZip output to save space in Amazon AWS -# exec("gzip prot_func_pred_chr_".$chrom->seq_region_name); + exec("gzip " . $outdir . "/prot_func_pred_chr_" . $chr->seq_region_name . ".json"); } sub print_parameters { From 92295763ad58e0ad5b5ba099f6cb74046a135fdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 10 Jan 2024 16:14:58 +0100 Subject: [PATCH 022/107] app: update Perl script to generate JSON files for the sift and polyphen versions, #TASK-5442, #TASK-5388 --- .../protein_function_prediction_matrices.pl | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl index 0614366a7d..162341e7ad 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl @@ -91,6 +91,37 @@ #} #print join("=", $polyphen2->get_prediction(1, 'G'))."\n"; +################################################################## + +# Get the current time +my ($sec, $min, $hour, $mday, $mon, $year) = localtime(); +# Adjust the year and month values (year is years since 1900, and month is 0-based) + +$year += 1900; +$mon += 1; + +# Format the date and time +my $formatted_date = sprintf("%04d%02d%02d_%02d%02d%02d", $year, $mon, $mday, $hour, $min, $sec); + +my $jsonVersion = {}; +$jsonVersion->{"date"} = $formatted_date; +$jsonVersion->{"data"} = "protein_substitution_predictions"; +$jsonVersion->{"version"} = "Ensembl 104"; +my @urls = (); +push @urls, "ensembldb.ensembl.org:3306"; +$jsonVersion->{"url"} = \@urls; + +print "Generating the JSON file for the Sift version.\n"; +$jsonVersion->{"name"} = "sift"; +open(FILE, 
">".$outdir."/siftVersion.json") || die "error opening file\n"; +print FILE to_json($jsonVersion) . "\n"; +close(FILE); + +print "Generating the JSON file for the PolyPhen version\n"; +$jsonVersion->{"name"} = "polyphen"; +open(FILE, ">".$outdir."/polyphenVersion.json") || die "error opening file\n"; +print FILE to_json($jsonVersion) . "\n"; +close(FILE); my ($translation, $seq, $md5seq, @preds, @all_predictions); #my @transcripts = @{$transcript_adaptor->fetch_all_by_biotype('protein_coding')}; From 2e764a42cf7e1204ad510bed89d68e4e41ce9e3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 11 Jan 2024 10:43:01 +0100 Subject: [PATCH 023/107] lib: update loader for Revel data, #TASK-5441, #TASK-5388 --- .../app/cli/admin/AdminCliOptionsParser.java | 4 ++-- .../cli/admin/executors/LoadCommandExecutor.java | 13 +++++++------ .../java/org/opencb/cellbase/lib/EtlCommons.java | 2 ++ .../cellbase/lib/builders/RevelScoreBuilder.java | 2 +- .../cellbase/lib/loader/MongoDBCellBaseLoader.java | 3 +-- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index d3ab054ddb..aa9dd2cd1b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -190,8 +190,8 @@ public class LoadCommandOptions { public CommonCommandOptions commonOptions = commonCommandOptions; @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation," - + " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics." 
- + " 'all' loads everything", required = true, arity = 1) + + " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics," + + " missense_variation_functional_score, alphamissense; and 'all' loads everything", required = true, arity = 1) public String data; @Parameter(names = {"-i", "--input"}, required = true, arity = 1, diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 58c1234c6b..019ed822d1 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -24,6 +24,7 @@ import org.opencb.cellbase.core.models.DataRelease; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.cellbase.lib.builders.RevelScoreBuilder; import org.opencb.cellbase.lib.impl.core.CellBaseDBAdaptor; import org.opencb.cellbase.lib.indexer.IndexManager; import org.opencb.cellbase.lib.loader.LoadRunner; @@ -198,16 +199,16 @@ public void execute() throws CellBaseException { } case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: { // Load data - loadIfExists(input.resolve("missense_variation_functional_score.json.gz"), - "missense_variation_functional_score"); + Path path = input.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + loadIfExists(path.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_JSON_FILENAME), + EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); // Create index - createIndex("missense_variation_functional_score"); + createIndex(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); // Update release (collection and sources) - List sources = new ArrayList<>(Collections.singletonList(input.resolve("revelVersion.json"))); - 
dataReleaseManager.update(dataRelease, "missense_variation_functional_score", - EtlCommons.MISSENSE_VARIATION_SCORE_DATA, sources); + dataReleaseManager.update(dataRelease, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, + RevelScoreBuilder.SOURCE, Collections.singletonList(path.resolve(EtlCommons.REVEL_VERSION_FILENAME))); break; } case EtlCommons.ALPHAMISSENSE_DATA: { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 349c70358b..a4c9cb13b0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -97,6 +97,8 @@ public class EtlCommons { public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; // Revel public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; + public static final String MISSENSE_VARIATION_SCORE_JSON_FILENAME = "missense_variation_functional_score.json.gz"; + public static final String REVEL_VERSION_FILENAME = "revelVersion.json"; // AlphaMissense public static final String ALPHAMISSENSE_DATA = "alphamissense"; public static final String ALPHAMISSENSE_RAW_FILENAME = "AlphaMissense_hg38.tsv.gz"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 76876bd2d2..d21e661215 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -34,7 +34,7 @@ public class RevelScoreBuilder extends CellBaseBuilder { private Path revelFilePath = null; - private static final String SOURCE = "revel"; + public static final String SOURCE = "revel"; public RevelScoreBuilder(Path revelDirectoryPath, CellBaseSerializer serializer) { 
super(serializer); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java index 8ab745feab..4da90ee1a5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java @@ -151,8 +151,7 @@ private String getCollectionName() throws LoaderException { if (dr.getCollections().containsKey(data)) { String collectionName = CellBaseDBAdaptor.buildCollectionName(data, dataRelease); if (dr.getCollections().get(data).equals(collectionName)) { - throw new LoaderException("Impossible load data " + data + " with release " + dataRelease + " since it" - + " has already been done."); + logger.warn("Loading new data " + data + " with release " + dataRelease + " (already populated previously)"); } } } From cacfd7883a8ec122aff7c2b9b774bf97270cd30b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 11 Jan 2024 16:31:42 +0100 Subject: [PATCH 024/107] lib: update loader for sift/polyphen data, #TASK-5442, #TASK-5388 --- .../app/cli/admin/AdminCliOptionsParser.java | 3 +- .../admin/executors/LoadCommandExecutor.java | 32 +++++++++++++------ .../org/opencb/cellbase/lib/EtlCommons.java | 4 +++ 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index aa9dd2cd1b..dc29ccb8b9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -191,7 +191,8 @@ public class LoadCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, 
variation," + " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics," - + " missense_variation_functional_score, alphamissense; and 'all' loads everything", required = true, arity = 1) + + " protein_functional_prediction, missense_variation_functional_score, alphamissense; and 'all' loads everything", + required = true, arity = 1) public String data; @Parameter(names = {"-i", "--input"}, required = true, arity = 1, diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 019ed822d1..3cffb61fb9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -197,6 +197,11 @@ public void execute() throws CellBaseException { EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, sources); break; } + case EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA: { + // Load data, create index and update release + loadProteinFunctionalPrediction(); + break; + } case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: { // Load data Path path = input.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); @@ -256,11 +261,6 @@ public void execute() throws CellBaseException { // loadIfExists(input.resolve("intactVersion.json"), METADATA); // createIndex("protein_protein_interaction"); // break; - case EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA: { - // Load data, create index and update release - loadProteinFunctionalPrediction(); - break; - } case EtlCommons.CLINICAL_VARIANTS_DATA: { // Load data, create index and update release loadClinical(); @@ -433,19 +433,33 @@ private void loadProteinFunctionalPrediction() throws NoSuchMethodException, Int InstantiationException, IllegalAccessException, InvocationTargetException, 
ClassNotFoundException, IOException, CellBaseException, LoaderException { // Load data - DirectoryStream stream = Files.newDirectoryStream(input, + Path path = input.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + DirectoryStream stream = Files.newDirectoryStream(path, entry -> entry.getFileName().toString().startsWith("prot_func_pred_")); for (Path entry : stream) { logger.info("Loading file '{}'", entry); - loadRunner.load(input.resolve(entry.getFileName()), "protein_functional_prediction", dataRelease); + loadRunner.load(path.resolve(entry.getFileName()), EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, dataRelease); } // Create index - createIndex("protein_functional_prediction"); + createIndex(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); // Update release (collection and sources) - dataReleaseManager.update(dataRelease, "protein_functional_prediction", null, null); + String sourceName = null; + List sourceUrls = new ArrayList<>(); + if (path.resolve(EtlCommons.SIFT_VERSION_FILENAME).toFile().exists()) { + sourceUrls.add(path.resolve(EtlCommons.SIFT_VERSION_FILENAME)); + sourceName = EtlCommons.SIFT_SOURCE_NAME; + } + dataReleaseManager.update(dataRelease, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, sourceName, sourceUrls); + + sourceUrls = new ArrayList<>(); + if (path.resolve(EtlCommons.POLYPHEN_VERSION_FILENAME).toFile().exists()) { + sourceUrls.add(path.resolve(EtlCommons.POLYPHEN_VERSION_FILENAME)); + sourceName = EtlCommons.POLYPHEN_SOURCE_NAME; + } + dataReleaseManager.update(dataRelease, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, sourceName, sourceUrls); } private void loadAlphaMissense() throws NoSuchMethodException, InterruptedException, ExecutionException, diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index a4c9cb13b0..b793c09cdc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -95,6 +95,10 @@ public class EtlCommons { public static final String PROTEIN_SUBSTITUTION_PREDICTION_DATA = "protein_substitution_predictions"; // Sift and polyphen public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; + public static final String SIFT_SOURCE_NAME = "Sift"; + public static final String POLYPHEN_SOURCE_NAME = "PolyPhen"; + public static final String SIFT_VERSION_FILENAME = "siftVersion.json"; + public static final String POLYPHEN_VERSION_FILENAME = "polyphenVersion.json"; // Revel public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; public static final String MISSENSE_VARIATION_SCORE_JSON_FILENAME = "missense_variation_functional_score.json.gz"; From 41672826cd494d41f4717e4765b33d5ca7eb6dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 12 Jan 2024 10:43:16 +0100 Subject: [PATCH 025/107] lib: fix Perl script to download sift/polyphen data, #TASK-5442, #TASK-5388 --- .../protein_function_prediction_matrices.pl | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl index 162341e7ad..b1f4004f2c 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl @@ -163,8 +163,6 @@ my $effect = {}; $effect->{"chromosome"} = $trans->seq_region_name; $effect->{"transcriptId"} = $trans->stable_id; -# $effect->{"checksum"} = $md5seq; -# $effect->{"size"} = length($seq); foreach my $u (@{ $trans->get_all_xrefs('Uniprot/SWISSPROT') }){ $effect->{"uniprotId"} = $u->display_id(); @@ -179,9 +177,8 @@ foreach (my $j=0; $j < @aa_code; $j++) { @preds = 
$polyphen2->get_prediction($i, $aa_code[$j]); if(defined $preds[0] || defined $preds[1]) { - push @scores, {"aaAlternate" => $aa_code[$j], "score" => $preds[0], "effect" => $preds[1]}; + push @scores, {"aaAlternate" => $aa_code[$j], "score" => $preds[1], "effect" => $preds[0]}; $effect->{"scores"} = \@scores; -# print "-- polyphen = aa pos = " . $i . ", aa_code[" . $j . "] = " .$aa_code[$j] . " -> " . $preds[0] . ", " .$preds[1] . "\n"; } } if(@scores) { @@ -199,9 +196,8 @@ foreach (my $j=0; $j < @aa_code; $j++) { @preds = $sift->get_prediction($i, $aa_code[$j]); if(defined $preds[0] || defined $preds[1]) { - push @scores, {"aaAlternate" => $aa_code[$j], "score" => $preds[0], "effect" => $preds[1]}; + push @scores, {"aaAlternate" => $aa_code[$j], "score" => $preds[1], "effect" => $preds[0]}; $effect->{"scores"} = \@scores; -# print "-- sift = aa pos = " . $i . ", aa_code[" . $j . "] = " .$aa_code[$j] . " -> " . $preds[0] . ", " .$preds[1] . "\n"; } } if(@scores) { @@ -209,7 +205,6 @@ } } } -# last; } } close(FILE); From bbabc8d42f137bcc2cf69db8ebd5a75ea270a302 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 12 Jan 2024 11:22:36 +0100 Subject: [PATCH 026/107] app: update exporter for protein substitution predictions (sift, polyphen, revel and alphamissense), #TASK-5464, #TASK-5388 --- .../app/cli/admin/AdminCliOptionsParser.java | 5 ++- .../executors/ExportCommandExecutor.java | 40 ++----------------- 2 files changed, 7 insertions(+), 38 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index dc29ccb8b9..afefeef950 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -19,6 +19,7 @@ import com.beust.jcommander.*; import 
org.opencb.cellbase.app.cli.CliOptionsParser; import org.opencb.cellbase.core.api.key.ApiKeyQuota; +import org.opencb.cellbase.lib.EtlCommons; import java.util.HashMap; import java.util.List; @@ -238,8 +239,8 @@ public class ExportCommandOptions { public CommonCommandOptions commonOptions = commonCommandOptions; @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation, " - + "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed. 'all' " - + " loads everything", required = true, arity = 1) + + EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA + ", conservation, regulation, protein, clinical_variants, repeats," + + " regulatory_pfm, splice_score, pubmed. 'all' export everything", required = true, arity = 1) public String data; @Parameter(names = {"--db", "--database"}, description = "Database name, e.g., cellbase_hsapiens_grch38_v5", required = true, diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 72f992f344..bf887d3dbb 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -84,9 +84,9 @@ public ExportCommandExecutor(AdminCliOptionsParser.ExportCommandOptions exportCo if (exportCommandOptions.data.equals("all")) { this.dataToExport = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, - EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, + PROTEIN_SUBSTITUTION_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - 
OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; + OBO_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { this.dataToExport = exportCommandOptions.data.split(","); } @@ -200,38 +200,6 @@ public void execute() throws CellBaseException { counterMsg = counter + " CADD items"; break; } - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: { - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output); - ProteinManager proteinManager = managerFactory.getProteinManager(species, assembly); - Map> positionMap = new HashMap<>(); - for (Variant variant : variants) { - if (!positionMap.containsKey(variant.getChromosome())) { - positionMap.put(variant.getChromosome(), new ArrayList<>()); - } - positionMap.get(variant.getChromosome()).add(variant.getStart()); - if (positionMap.get(variant.getChromosome()).size() >= 200) { - CellBaseDataResult results = proteinManager - .getMissenseVariantFunctionalScores(variant.getChromosome(), - positionMap.get(variant.getChromosome()), null, dataRelease); - counter += writeExportedData(results.getResults(), "missense_variation_functional_score", serializer); - positionMap.put(variant.getChromosome(), new ArrayList<>()); - } - } - - // Process map - for (Map.Entry> entry : positionMap.entrySet()) { - if (CollectionUtils.isEmpty(entry.getValue())) { - continue; - } - CellBaseDataResult results = proteinManager - .getMissenseVariantFunctionalScores(entry.getKey(), entry.getValue(), null, dataRelease); - counter += writeExportedData(results.getResults(), "missense_variation_functional_score", serializer); - } - serializer.close(); - - counterMsg = counter + " missense variation functional scores"; - break; - } case EtlCommons.CONSERVATION_DATA: { // Export data CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output); @@ -271,7 +239,7 @@ public void execute() throws CellBaseException { counterMsg = counter + " proteins"; 
break; } - case EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA: { + case EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA: { ProteinManager proteinManager = managerFactory.getProteinManager(species, assembly); Map> transcriptsMap = new HashMap<>(); for (Gene gene : genes) { @@ -290,7 +258,7 @@ public void execute() throws CellBaseException { } serializer.close(); - counterMsg = counter + " protein functional predictions"; + counterMsg = counter + " protein substitution predictions"; break; } case EtlCommons.CLINICAL_VARIANTS_DATA: { From a4bbb37f5a658200373ac49bc077a45ff334ab8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 12 Jan 2024 11:24:08 +0100 Subject: [PATCH 027/107] app: update protein manager and DB adaptor to retrieve protein substitution predictions (sift, polyphen, revel and alphamissense), #TASK-5421, #TASK-5388 --- .../lib/impl/core/ProteinMongoDBAdaptor.java | 98 ++++++++----------- .../cellbase/lib/managers/ProteinManager.java | 19 ---- 2 files changed, 43 insertions(+), 74 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java index 353b4042c4..3a30b50f93 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java @@ -18,11 +18,13 @@ import com.mongodb.BasicDBList; import com.mongodb.client.model.Filters; -import com.mongodb.client.model.Projections; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.bson.Document; import org.bson.conversions.Bson; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.Entry; +import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; +import
org.opencb.biodata.models.core.ProteinSubstitutionPredictionScore; import org.opencb.biodata.models.variant.avro.ProteinFeature; import org.opencb.biodata.models.variant.avro.ProteinVariantAnnotation; import org.opencb.biodata.models.variant.avro.Score; @@ -34,7 +36,7 @@ import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.iterator.CellBaseIterator; import org.opencb.cellbase.lib.iterator.CellBaseMongoDBIterator; -import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; +import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.datastore.core.QueryParam; @@ -90,69 +92,55 @@ private void init() { logger.debug("ProteinMongoDBAdaptor: in 'constructor'"); mongoDBCollectionByRelease = buildCollectionByReleaseMap("protein"); - proteinSubstitutionMongoDBCollectionByRelease = buildCollectionByReleaseMap("protein_functional_prediction"); + proteinSubstitutionMongoDBCollectionByRelease = buildCollectionByReleaseMap("protein_substitution_predictions"); } - public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, Integer position, String aa) throws CellBaseException { - CellBaseDataResult result = null; + public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, Integer aaPosition, String aa) throws CellBaseException { + long dbTimeStart = System.currentTimeMillis(); + Map scoreSet = new HashMap<>(); + + // transcriptId, aaPosition, aaAlternate are needed for this collection + if (query.getTranscriptsId() != null && query.getTranscriptsId().get(0) != null && aaPosition != null + && StringUtils.isNotEmpty(aa)) { - // Ensembl transcript id is needed for this collection - if (query.getTranscriptsId() != null && query.getTranscriptsId().get(0) != null) { - String transcriptId = query.getTranscriptsId().get(0).split("\\.")[0]; - Bson transcript = Filters.eq("transcriptId", transcriptId); 
MongoDBCollection mongoDBCollection = getCollectionByRelease(proteinSubstitutionMongoDBCollectionByRelease, query.getDataRelease()); - String aaShortName = null; - // If position and aa change are provided we create a 'projection' to return only the required data from the database - if (position != null) { - String projectionString = "aaPositions." + position; - - // If aa change is provided we only return that information - if (StringUtils.isNotEmpty(aa)) { - aaShortName = aaShortNameMap.get(aa.toUpperCase()); - projectionString += "." + aaShortName; - } - - // Projection is used to minimize the returned data - Bson positionProjection = Projections.include(projectionString); - result = new CellBaseDataResult<>(mongoDBCollection.find(transcript, positionProjection, query.toQueryOptions())); - } else { - // Return the whole transcript data - result = new CellBaseDataResult<>(mongoDBCollection.find(transcript, query.toQueryOptions())); - } - - if (result != null && !result.getResults().isEmpty()) { - Document document = (Document) result.getResults().get(0); - Document aaPositionsDocument = (Document) document.get("aaPositions"); - - // Position or aa change were not provided, returning whole transcript data - if (position == null || position == -1 || aaShortName == null) { - // Return only the inner Document, not the whole document projected - result.setResults(Collections.singletonList(aaPositionsDocument)); - // Position and aa were provided, return only corresponding Score objects - } else { - List scoreList = null; - if (result.getNumResults() == 1 && aaPositionsDocument != null) { - scoreList = new ArrayList<>(NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS); - Document positionDocument = (Document) aaPositionsDocument.get(Integer.toString(position)); - Document aaDocument = (Document) positionDocument.get(aaShortName); - if (aaDocument.get("ss") != null) { - scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ss")), - "sift", 
VariantAnnotationUtils.SIFT_DESCRIPTIONS.get(aaDocument.get("se")))); - } - if (aaDocument.get("ps") != null) { - scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ps")), - "polyphen", VariantAnnotationUtils.POLYPHEN_DESCRIPTIONS.get(aaDocument.get("pe")))); + List andBsonList = new ArrayList<>(); + // Sanity check, protein substitution predictions do not contain the transcript ID version + String transcriptId = query.getTranscriptsId().get(0).split("\\.")[0]; + andBsonList.add(Filters.eq("transcriptId", transcriptId)); + andBsonList.add(Filters.eq("aaPosition", aaPosition)); + String aaAlternate = aaShortNameMap.get(aa.toUpperCase()); + andBsonList.add(Filters.eq("scores.aaAlternate", aaAlternate)); + Bson bson = Filters.and(andBsonList); + + System.out.println("transcriptId = " + transcriptId + ", aaPosition = " + aaPosition + ", aa = " + aa + ", aaAlternate = " + + aaAlternate); + + DataResult predictions = mongoDBCollection.find(bson, null, ProteinSubstitutionPrediction.class, + new QueryOptions()); + + if (predictions != null && CollectionUtils.isNotEmpty(predictions.getResults())) { + for (ProteinSubstitutionPrediction prediction : predictions.getResults()) { + for (ProteinSubstitutionPredictionScore predictionScore : prediction.getScores()) { + System.out.println("predictionScore = " + predictionScore.toString()); + if (StringUtils.isNotEmpty(predictionScore.getAaAlternate()) && StringUtils.isNotEmpty(aaAlternate) + && predictionScore.getAaAlternate().equals(aaAlternate)) { + String key = prediction.getSource() + ":" + predictionScore.getScore() + ":" + predictionScore.getEffect(); + if (!scoreSet.containsKey(key)) { + Score score = new Score(predictionScore.getScore(), prediction.getSource(), predictionScore.getEffect()); + scoreSet.put(key, score); + } } } - result.setResults(scoreList); } } } - // Return null if no transcript id is provided - return result; + int dbTime = Long.valueOf(System.currentTimeMillis() - dbTimeStart).intValue(); + 
return new CellBaseDataResult<>("getSubstitutionScores", dbTime, new ArrayList<>(), scoreSet.size(), + new ArrayList<>(scoreSet.values()), scoreSet.size()); } // public CellBaseDataResult getSubstitutionScores(Query query, QueryOptions options) { @@ -231,12 +219,12 @@ public CellBaseDataResult getVariantAnnotation(String // Stop_gain/lost variants do not have SIFT/POLYPHEN scores // System.out.println("aaReference = " + aaReference); // System.out.println("aaAlternate = " + aaAlternate); - if (!aaAlternate.equals("STOP") && !aaReference.equals("STOP")) { +// if (!aaAlternate.equals("STOP") && !aaReference.equals("STOP")) { TranscriptQuery query = new TranscriptQuery(); query.setTranscriptsId(Collections.singletonList(ensemblTranscriptId)); query.setDataRelease(dataRelease); proteinVariantAnnotation.setSubstitutionScores(getSubstitutionScores(query, position, aaAlternate).getResults()); - } +// } CellBaseDataResult proteinVariantData; String shortAlternativeAa = aaShortNameMap.get(aaAlternate); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java index 0505c80ad9..e1a0681476 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java @@ -18,9 +18,7 @@ import com.fasterxml.jackson.core.JsonProcessingException; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.Entry; -import org.opencb.biodata.models.core.MissenseVariantFunctionalScore; import org.opencb.biodata.models.core.Transcript; -import org.opencb.biodata.models.core.TranscriptMissenseVariantFunctionalScore; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.ProteinVariantAnnotation; import org.opencb.biodata.models.variant.avro.Score; @@ -105,17 +103,6 @@ public CellBaseDataResult getVariantAnnotation(Variant int 
dataRelease) throws CellBaseException { CellBaseDataResult proteinVariantAnnotation = proteinDBAdaptor.getVariantAnnotation(ensemblTranscriptId, aaPosition, aaReference, aaAlternate, options, dataRelease); - CellBaseDataResult revelResults = - missenseVariationFunctionalScoreMongoDBAdaptor.getScores( - variant.getChromosome(), variant.getStart(), variant.getReference(), variant.getAlternate(), - aaReference, aaAlternate, dataRelease); - if (proteinVariantAnnotation.getResults() != null && revelResults.getResults() != null) { - if (proteinVariantAnnotation.getResults().get(0).getSubstitutionScores() == null) { - proteinVariantAnnotation.getResults().get(0).setSubstitutionScores(new ArrayList<>()); - } - proteinVariantAnnotation.getResults().get(0).getSubstitutionScores().add( - new Score(revelResults.first().getScore(), "revel", "")); - } return proteinVariantAnnotation; } @@ -123,12 +110,6 @@ public CellBaseDataResult getProteinSubstitutionRawData(List tra int dataRelease) throws CellBaseException { return proteinDBAdaptor.getProteinSubstitutionRawData(transcriptIds, options, dataRelease); } - - public CellBaseDataResult getMissenseVariantFunctionalScores(String chromosome, List positions, - CellBaseQueryOptions options, - int dataRelease) throws CellBaseException { - return missenseVariationFunctionalScoreMongoDBAdaptor.getScores(chromosome, positions, options, dataRelease); - } } From e3e9e21915efe98604d139d81a3063172931035e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 17 Jan 2024 11:31:16 +0100 Subject: [PATCH 028/107] lib: update Revel downloader, #TASK-5441, #TASK-5388 --- .../app/cli/admin/AdminCliOptionsParser.java | 2 +- .../admin/executors/BuildCommandExecutor.java | 10 +++---- .../executors/DownloadCommandExecutor.java | 4 +-- .../src/main/resources/configuration.yml | 1 + .../org/opencb/cellbase/lib/EtlCommons.java | 5 ++++ .../lib/builders/RevelScoreBuilder.java | 5 ++-- 
.../cellbase/lib/download/Downloader.java | 4 +-- ...r.java => RevelScoresDownloadManager.java} | 27 +++++++++++-------- 8 files changed, 35 insertions(+), 23 deletions(-) rename cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/{MissenseScoresDownloadManager.java => RevelScoresDownloadManager.java} (58%) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index afefeef950..4013594a66 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -90,7 +90,7 @@ public class DownloadCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, " + "variation, variation_functional_score, regulation, protein, conservation, " - + "clinical_variants, repeats, svs, pubmed, alphamissense; and 'all' to download everything", required = true, arity = 1) + + "clinical_variants, repeats, svs, pubmed, revel, alphamissense; and 'all' to download everything", required = true, arity = 1) public String data; @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 0d0f1f8bb9..9a5d1c066a 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -135,7 +135,7 @@ public void execute() { case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: parser = buildCadd(); break; - case 
EtlCommons.MISSENSE_VARIATION_SCORE_DATA: + case EtlCommons.REVEL_DATA: parser = buildRevel(); break; case EtlCommons.REGULATION_DATA: @@ -287,10 +287,10 @@ private CellBaseBuilder buildCadd() { } private CellBaseBuilder buildRevel() { - Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - copyVersionFiles(Arrays.asList(missensePredictionScorePath.resolve("revelVersion.json"))); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - return new RevelScoreBuilder(missensePredictionScorePath, serializer); + Path path = downloadFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + copyVersionFiles(Arrays.asList(path.resolve(EtlCommons.REVEL_VERSION_FILENAME))); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.REVEL_DATA); + return new RevelScoreBuilder(path, serializer); } private CellBaseBuilder buildRegulation() { diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index d905f83f98..07454f8955 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -75,8 +75,8 @@ public void execute() { case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: downloadFiles.addAll(downloader.downloadCaddScores()); break; - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: - downloadFiles.addAll(downloader.downloadPredictionScores()); + case EtlCommons.REVEL_DATA: + downloadFiles.addAll(downloader.downloadRevelScores()); break; case EtlCommons.REGULATION_DATA: downloadFiles.addAll(downloader.downloadRegulation()); diff --git a/cellbase-core/src/main/resources/configuration.yml 
b/cellbase-core/src/main/resources/configuration.yml index cef9cf79a0..ca68a3a6ce 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -159,6 +159,7 @@ download: goAnnotation: host: http://geneontology.org/gene-associations/goa_human.gaf.gz revel: + version: v1.3 host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip pubmed: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index b793c09cdc..5a388ca0ff 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -100,8 +100,13 @@ public class EtlCommons { public static final String SIFT_VERSION_FILENAME = "siftVersion.json"; public static final String POLYPHEN_VERSION_FILENAME = "polyphenVersion.json"; // Revel + @Deprecated public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; + @Deprecated public static final String MISSENSE_VARIATION_SCORE_JSON_FILENAME = "missense_variation_functional_score.json.gz"; + public static final String REVEL_DATA = "revel"; + public static final String REVEL_RAW_FILENAME = "revel-v1.3_all_chromosomes.zip"; + public static final String REVEL_JSON_FILENAME = "revel-v1.3_all_chromosomes.json.gz"; public static final String REVEL_VERSION_FILENAME = "revelVersion.json"; // AlphaMissense public static final String ALPHAMISSENSE_DATA = "alphamissense"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index d21e661215..89645d3c31 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -21,6 +21,7 @@ import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; import org.opencb.biodata.models.core.ProteinSubstitutionPredictionScore; import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.slf4j.LoggerFactory; import java.io.*; @@ -33,12 +34,12 @@ public class RevelScoreBuilder extends CellBaseBuilder { - private Path revelFilePath = null; + private Path revelFilePath; public static final String SOURCE = "revel"; public RevelScoreBuilder(Path revelDirectoryPath, CellBaseSerializer serializer) { super(serializer); - this.revelFilePath = revelDirectoryPath.resolve("revel_grch38_all_chromosomes.csv.zip"); + this.revelFilePath = revelDirectoryPath.resolve(EtlCommons.REVEL_RAW_FILENAME); logger = LoggerFactory.getLogger(ConservationBuilder.class); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java index 65f91a06d6..0e7399e90c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java @@ -84,8 +84,8 @@ public List downloadCaddScores() throws IOException, CellBaseExcep return manager.download(); } - public List downloadPredictionScores() throws IOException, CellBaseException, InterruptedException { - MissenseScoresDownloadManager manager = new MissenseScoresDownloadManager(species, assembly, outputDirectory, configuration); + public List downloadRevelScores() throws IOException, CellBaseException, InterruptedException { + RevelScoresDownloadManager manager = new RevelScoresDownloadManager(species, assembly, outputDirectory, configuration); return manager.download(); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RevelScoresDownloadManager.java similarity index 58% rename from cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java rename to cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RevelScoresDownloadManager.java index 1ae2514e49..7063425a47 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RevelScoresDownloadManager.java @@ -19,38 +19,43 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.cellbase.lib.builders.RevelScoreBuilder; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Collections; import java.util.List; -public class MissenseScoresDownloadManager extends AbstractDownloadManager { +public class RevelScoresDownloadManager extends AbstractDownloadManager { - public MissenseScoresDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) + public RevelScoresDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); } @Override public List download() throws IOException, InterruptedException { - return Collections.singletonList(downloadRevel()); - } + List list = new ArrayList<>(); - public DownloadFile downloadRevel() throws IOException, InterruptedException { if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { logger.info("Downloading Revel data ..."); - Path missensePredictionScore = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - Files.createDirectories(missensePredictionScore); + 
Path scorePath = downloadFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + Files.createDirectories(scorePath); String url = configuration.getDownload().getRevel().getHost(); - saveVersionData(EtlCommons.MISSENSE_VARIATION_SCORE_DATA, "Revel", null, getTimeStamp(), - Collections.singletonList(url), missensePredictionScore.resolve("revelVersion.json")); - return downloadFile(url, missensePredictionScore.resolve("revel_grch38_all_chromosomes.csv.zip").toString()); + list.add(downloadFile(url, scorePath.resolve(EtlCommons.REVEL_RAW_FILENAME).toString())); + + saveVersionData(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, RevelScoreBuilder.SOURCE, + configuration.getDownload().getRevel().getVersion(), getTimeStamp(), Collections.singletonList(url), + scorePath.resolve(EtlCommons.REVEL_VERSION_FILENAME)); + + logger.info("Downloaded Revel file. Done!"); } - return null; + + return list; } } From a0900537e9422b1d443c6a490626100fc87b4761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 17 Jan 2024 16:09:29 +0100 Subject: [PATCH 029/107] lib: update Revel builder, #TASK-5441, #TASK-5388 --- .../app/cli/admin/AdminCliOptionsParser.java | 2 +- .../admin/executors/BuildCommandExecutor.java | 26 ++++++++++++++----- .../lib/builders/RevelScoreBuilder.java | 9 +++---- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 4013594a66..18cb5a9518 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -105,7 +105,7 @@ public class BuildCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, " + "gene, variation, 
variation_functional_score, regulation, protein, ppi, conservation, drug, " - + "clinical_variants, repeats, svs, splice_score, pubmed and alphamissense; and 'all' builds everything.", required = true, arity = 1) + + "clinical_variants, repeats, svs, splice_score, pubmed, revel and alphamissense; and 'all' builds everything.", required = true, arity = 1) public String data; @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 9a5d1c066a..48b2a92e9d 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -82,7 +82,7 @@ public void execute() { if (speciesConfiguration == null) { throw new CellBaseException("Invalid species: '" + buildCommandOptions.species + "'"); } - SpeciesConfiguration.Assembly assembly = null; + SpeciesConfiguration.Assembly assembly; if (!StringUtils.isEmpty(buildCommandOptions.assembly)) { assembly = SpeciesUtils.getAssembly(speciesConfiguration, buildCommandOptions.assembly); if (assembly == null) { @@ -286,11 +286,23 @@ private CellBaseBuilder buildCadd() { return new CaddScoreBuilder(caddFilePath, serializer); } - private CellBaseBuilder buildRevel() { - Path path = downloadFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); - copyVersionFiles(Arrays.asList(path.resolve(EtlCommons.REVEL_VERSION_FILENAME))); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.REVEL_DATA); - return new RevelScoreBuilder(path, serializer); + private CellBaseBuilder buildRevel() throws IOException { + Path inputFolder = 
downloadFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + Path outputFolder = buildFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + if (!outputFolder.toFile().exists()) { + outputFolder.toFile().mkdirs(); + } + + logger.info("Copying Revel version file..."); + if (inputFolder.resolve(EtlCommons.REVEL_VERSION_FILENAME).toFile().exists()) { + Files.copy(inputFolder.resolve(EtlCommons.REVEL_VERSION_FILENAME), + outputFolder.resolve(EtlCommons.REVEL_VERSION_FILENAME), StandardCopyOption.REPLACE_EXISTING); + } + + Path revelFilePath = inputFolder.resolve(EtlCommons.REVEL_RAW_FILENAME); + String basename = EtlCommons.REVEL_JSON_FILENAME.replace(".json.gz", ""); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outputFolder, basename); + return new RevelScoreBuilder(revelFilePath, serializer); } private CellBaseBuilder buildRegulation() { @@ -455,7 +467,7 @@ private CellBaseBuilder buildAlphaMissense() throws IOException { } File alphaMissenseFile = inputFolder.resolve(EtlCommons.ALPHAMISSENSE_RAW_FILENAME).toFile(); - String basename = EtlCommons.ALPHAMISSENSE_JSON_FILENAME.replace("\\.json\\.gz", ""); + String basename = EtlCommons.ALPHAMISSENSE_JSON_FILENAME.replace(".json.gz", ""); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outputFolder, basename); return new AlphaMissenseBuilder(alphaMissenseFile, serializer); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 89645d3c31..87019b4c93 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -21,7 +21,6 @@ import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; import org.opencb.biodata.models.core.ProteinSubstitutionPredictionScore; import 
org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.slf4j.LoggerFactory; import java.io.*; @@ -37,16 +36,16 @@ public class RevelScoreBuilder extends CellBaseBuilder { private Path revelFilePath; public static final String SOURCE = "revel"; - public RevelScoreBuilder(Path revelDirectoryPath, CellBaseSerializer serializer) { + public RevelScoreBuilder(Path revelFilePath, CellBaseSerializer serializer) { super(serializer); - this.revelFilePath = revelDirectoryPath.resolve(EtlCommons.REVEL_RAW_FILENAME); - logger = LoggerFactory.getLogger(ConservationBuilder.class); + this.revelFilePath = revelFilePath; + logger = LoggerFactory.getLogger(RevelScoreBuilder.class); } @Override public void parse() throws IOException { - logger.error("Processing Revel file at " + revelFilePath.toAbsolutePath()); + logger.info("Processing Revel file at " + revelFilePath.toAbsolutePath()); ZipInputStream zis = new ZipInputStream(new FileInputStream(revelFilePath.toFile())); ZipEntry zipEntry = zis.getNextEntry(); From 6346c97e10b2f568d1d5ab27fd1cb826a6df8711 Mon Sep 17 00:00:00 2001 From: imedina Date: Sun, 21 Jan 2024 02:10:16 +0000 Subject: [PATCH 030/107] download: gwas catalog fixes --- .../src/main/resources/configuration.yml | 45 ++++++++++--------- .../org/opencb/cellbase/lib/EtlCommons.java | 2 +- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index f24827532c..21a559eb76 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -45,6 +45,7 @@ server: port: "${CELLBASE.SERVER.REST.PORT}" defaultOutdir: "/tmp" download: + ## Genomic and Gene information ensembl: database: host: ensembldb.ensembl.org:3306 @@ -64,9 +65,6 @@ download: hgnc: host: 
https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt version: 2023-11-01 - cancerHotspot: - host: https://www.cancerhotspots.org/files/hotspots_v2.xls - version: "v2" refSeq: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz refSeqFasta: @@ -76,8 +74,6 @@ download: refSeqCdna: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz maneSelect: -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_0.93/MANE.GRCh38.v0.93.summary.txt.gz -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz version: "1.1" lrg: @@ -88,6 +84,8 @@ download: version: "2023-11-08" geneExpressionAtlas: host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + goAnnotation: + host: http://geneontology.org/gene-associations/goa_human.gaf.gz mirbase: host: ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz mirbaseReadme: @@ -121,15 +119,13 @@ download: gerp: host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw version: "2023-05-17" + + ## Clinical Variant clinvar: -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz version: "2023-12-01" clinvarVariation: -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz -# host: 
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz clinvarSummary: @@ -140,6 +136,9 @@ download: version: "2023-12-01" clinvarEfoTerms: host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv + cancerHotspot: + host: https://www.cancerhotspots.org/files/hotspots_v2.xls + version: "v2" iarctp53: host: http://p53.iarc.fr/ajax/Zipper.ashx docm: @@ -154,10 +153,19 @@ download: host: http://hgdownload.cse.ucsc.edu/goldenPath genomicSuperDups: host: http://hgdownload.cse.ucsc.edu/goldenPath + + ## Variant Pathogenic Prediction + revel: + host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip + version: "1.3" + cadd: +# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz + ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! 
+ host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz + version: "1.7-pre" gwasCatalog: -# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv -# version: "1.0.2 associations_e106_r2022-05-17" - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv + ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e110_r2023-12-20' + host: https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations_ontology-annotated.tsv version: "23-12-21" hpo: ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations @@ -170,16 +178,13 @@ download: dgidb: host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv version: "2022-02-01" - cadd: - ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! -# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz - host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz - version: "1.7-pre" reactome: host: http://www.reactome.org/download/current/biopax.zip gnomadConstraints: host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz version: "2.1.1" + + ## OBO Ontologies hpoObo: host: http://purl.obolibrary.org/obo/hp.obo version: "2023-12-01" @@ -192,10 +197,8 @@ download: mondoObo: host: http://purl.obolibrary.org/obo/mondo.obo version: "2023-12-01" - goAnnotation: - host: http://geneontology.org/gene-associations/goa_human.gaf.gz - revel: - host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip + + ## Others pubmed: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ files: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 124ac6e6fc..f8ee4938e5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -61,7 +61,7 @@ public class EtlCommons { public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz"; public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz"; public static final String IARCTP53_FILE = "IARC-TP53.zip"; - public static final String GWAS_FILE = "gwas_catalog.tsv"; + public static final String GWAS_FILE = "gwas-catalog-associations_ontology-annotated.tsv"; public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz"; public static final String DBSNP_FILE = "All.vcf.gz"; From d3433153f5bbb850602efd31446fb53c14a97df9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Jan 2024 09:23:06 +0100 Subject: [PATCH 031/107] lib: take into account the NumberFormatException, #TASK-5407, #TASK-5387 --- .../cellbase/lib/builders/PolygenicScoreBuilder.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java index 2869e1f00b..82f70ba756 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -394,7 +394,12 @@ private void saveVariantPolygenicScore(String line, Map columnP return; } if (columnPos.containsKey(HM_POS_COL)) { - position = Integer.parseInt(field[columnPos.get(HM_POS_COL)]); + try { + position = Integer.parseInt(field[columnPos.get(HM_POS_COL)]); + } catch (NumberFormatException e) { + logger.warn("Invalid field '{}' (value = {}), skipping line: {}", HM_POS_COL, 
field[columnPos.get(HM_POS_COL)], line); + return; + } } else { logger.warn("Missing field '{}', skipping line: {}", HM_POS_COL, line); return; From 268f591e6fb74bbe8743b26224c698a82862010b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 2 Feb 2024 17:21:17 +0100 Subject: [PATCH 032/107] lib: improve PGS builder, #TASK-5407, #TASK-5387 --- .../lib/builders/PolygenicScoreBuilder.java | 59 ++++++++++++------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java index 82f70ba756..7c0e7474f5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -55,7 +55,7 @@ public class PolygenicScoreBuilder extends CellBaseBuilder { private Path pgsDir; private CellBaseFileSerializer fileSerializer; - protected RocksDB rdb; + protected Map rdbConnectionPerChrom = new HashMap<>(); protected static ObjectMapper mapper; protected static ObjectReader varPgsReader; @@ -138,11 +138,6 @@ public void parse() throws Exception { logger.info("Parsing polygenic score (PGS) files..."); - Object[] dbConnection = getDBConnection(pgsDir.resolve("rdb.idx").toString(), true); - rdb = (RocksDB) dbConnection[0]; - Options dbOption = (Options) dbConnection[1]; - String dbLocation = (String) dbConnection[2]; - BufferedWriter bw = FileUtils.newBufferedWriter(serializer.getOutdir().resolve(COMMON_POLYGENIC_SCORE_FILENAME)); for (File file : pgsDir.toFile().listFiles()) { @@ -185,8 +180,7 @@ public void parse() throws Exception { } // Serialize/write the saved variant polygenic scores in the RocksDB - serializeRDB(rdb); - closeIndex(rdb, dbOption, dbLocation); + serializeRDB(); serializer.close(); // Close PGS file (with common 
attributes) @@ -473,6 +467,7 @@ private void saveVariantPolygenicScore(String line, Map columnP // Creating and/or updating variant polygenic score VariantPolygenicScore varPgs; + RocksDB rdb = getRocksDB(chrom); String key = chrom + ":" + position + ":" + otherAllele + ":" + effectAllele; byte[] dbContent = rdb.get(key.getBytes()); if (dbContent == null) { @@ -485,21 +480,30 @@ private void saveVariantPolygenicScore(String line, Map columnP rdb.put(key.getBytes(), jsonObjectWriter.writeValueAsBytes(varPgs)); } - private void serializeRDB(RocksDB rdb) throws IOException { - // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's - // named "iterator" - RocksIterator rocksIterator = rdb.newIterator(); - - logger.info("Reading from RocksDB index and serializing to {}.json.gz", serializer.getOutdir().resolve(serializer.getFileName())); - int counter = 0; - for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { - VariantPolygenicScore varPgs = varPgsReader.readValue(rocksIterator.value()); - serializer.serialize(varPgs); - counter++; - if (counter % 10000 == 0) { - logger.info("{} written", counter); + private void serializeRDB() throws IOException { + for (Map.Entry entry : rdbConnectionPerChrom.entrySet()) { + RocksDB rdb = (RocksDB) entry.getValue()[0]; + Options dbOption = (Options) entry.getValue()[1]; + String dbLocation = (String) entry.getValue()[2]; + + // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's + // named "iterator" + RocksIterator rocksIterator = rdb.newIterator(); + + logger.info("Reading from RocksDB index (chrom. 
{}) and serializing to {}.json.gz", entry.getKey(), + serializer.getOutdir().resolve(serializer.getFileName())); + int counter = 0; + for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { + VariantPolygenicScore varPgs = varPgsReader.readValue(rocksIterator.value()); + serializer.serialize(varPgs); + counter++; + if (counter % 10000 == 0) { + logger.info("{} written", counter); + } } + closeIndex(rdb, dbOption, dbLocation); } + serializer.close(); logger.info("Done."); } @@ -517,6 +521,7 @@ private void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws } private Object[] getDBConnection(String dbLocation, boolean forceCreate) { + System.out.println("db location = " + Paths.get(dbLocation).toAbsolutePath()); boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation)); // a static method that loads the RocksDB C++ library. RocksDB.loadLibrary(); @@ -549,4 +554,16 @@ private Object[] getDBConnection(String dbLocation, boolean forceCreate) { return new Object[]{db, options, dbLocation, indexingNeeded}; } + + private Object[] getRocksDBConnection(String chrom) { + if (!rdbConnectionPerChrom.containsKey(chrom)) { + Object[] dbConnection = getDBConnection(pgsDir.resolve("rdb-" + chrom + ".idx").toString(), true); + rdbConnectionPerChrom.put(chrom, dbConnection); + } + return rdbConnectionPerChrom.get(chrom); + } + + private RocksDB getRocksDB(String chrom) { + return (RocksDB) getRocksDBConnection(chrom)[0]; + } } From 5972c15fe2b18e3ad5fc793ee1db9db9d25e2a57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 2 Feb 2024 17:51:00 +0100 Subject: [PATCH 033/107] lib: remove System.out in PGS builder, #TASK-5407, #TASK-5387 --- .../org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java | 1 - 1 file changed, 1 deletion(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java index 7c0e7474f5..d70fb55e98 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -521,7 +521,6 @@ private void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws } private Object[] getDBConnection(String dbLocation, boolean forceCreate) { - System.out.println("db location = " + Paths.get(dbLocation).toAbsolutePath()); boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation)); // a static method that loads the RocksDB C++ library. RocksDB.loadLibrary(); From d9ad898a3afe68e865bf19a8bddf951bd5121bba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 7 Feb 2024 08:50:03 +0100 Subject: [PATCH 034/107] lib: filter contigs (only chromosomes) when processing PGS scores, #TASK-5407, #TASK-5387 --- .../cellbase/lib/builders/PolygenicScoreBuilder.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java index d70fb55e98..37cf2f6077 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -43,9 +43,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; +import java.util.*; public class PolygenicScoreBuilder extends CellBaseBuilder { @@ -111,6 +109,8 @@ public class PolygenicScoreBuilder extends CellBaseBuilder { private static final String DOSAGE_1_WEIGHT_KEY = "Effect weight with 1 copy of the effect allele"; private 
static final String DOSAGE_2_WEIGHT_KEY = "Effect weight with 1 copy of the effect allele"; + private static final Set VALID_CHROMOSOMES = new HashSet<>(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", + "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT", "M")); static { mapper = new ObjectMapper(); @@ -383,6 +383,10 @@ private void saveVariantPolygenicScore(String line, Map columnP if (columnPos.containsKey(HM_CHR_COL)) { chrom = field[columnPos.get(HM_CHR_COL)]; + if (!VALID_CHROMOSOMES.contains(chrom)) { + // Only chromosomes are processed; no contigs, e.g.: 8_KI270821v1_alt, 11_KI270927v1_alt, 12_GL877875v1_alt,... + return; + } } else { logger.warn("Missing field '{}', skipping line: {}", HM_CHR_COL, line); return; From d3ad8c967634ca6274c5b63116eeeaa2e069d377 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 7 Feb 2024 11:41:16 +0100 Subject: [PATCH 035/107] lib: minor changes, #TASK-5407, #TASK-5387 --- .../cellbase/lib/builders/PolygenicScoreBuilder.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java index 37cf2f6077..9e326013fc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -494,7 +494,7 @@ private void serializeRDB() throws IOException { // named "iterator" RocksIterator rocksIterator = rdb.newIterator(); - logger.info("Reading from RocksDB index (chrom. 
{}) and serializing to {}.json.gz", entry.getKey(), + logger.info("Reading from RocksDB index ({}) and serializing to {}.json.gz", dbLocation, serializer.getOutdir().resolve(serializer.getFileName())); int counter = 0; for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { @@ -507,9 +507,6 @@ private void serializeRDB() throws IOException { } closeIndex(rdb, dbOption, dbLocation); } - - serializer.close(); - logger.info("Done."); } private void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws IOException { @@ -559,7 +556,7 @@ private Object[] getDBConnection(String dbLocation, boolean forceCreate) { } private Object[] getRocksDBConnection(String chrom) { - if (!rdbConnectionPerChrom.containsKey(chrom)) { + if (!rdbConnectionPerChrom.containsKey(chrom) || rdbConnectionPerChrom.get(chrom) == null) { Object[] dbConnection = getDBConnection(pgsDir.resolve("rdb-" + chrom + ".idx").toString(), true); rdbConnectionPerChrom.put(chrom, dbConnection); } From 89264c22d04e94dd48665ddeeefb22ce9b0cd13a Mon Sep 17 00:00:00 2001 From: imedina Date: Fri, 1 Mar 2024 09:33:22 +0000 Subject: [PATCH 036/107] Update configuration.yml --- .../src/main/resources/configuration.yml | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 21a559eb76..2204acf270 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -64,15 +64,19 @@ download: host: ftp://ftp.ensemblgenomes.org/pub hgnc: host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt - version: 2023-11-01 + version: "2023-11-01" refSeq: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz + version: "2023-10-11" refSeqFasta: host: 
https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz + version: "2023-10-11" refSeqProteinFasta: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz + version: "2023-10-11" refSeqCdna: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + version: "2023-10-11" maneSelect: host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz version: "1.1" @@ -84,6 +88,7 @@ download: version: "2023-11-08" geneExpressionAtlas: host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + version: "2.0.14" goAnnotation: host: http://geneontology.org/gene-associations/goa_human.gaf.gz mirbase: @@ -99,18 +104,19 @@ download: ## Protein Data uniprot: host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - version: "2023-11-08" + version: "2024-01-24" uniprotRelNotes: host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt - version: "2023-11-08" + version: "2024-01-24" interpro: host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz - version: "2023-11-08" + version: "2024-01-24" interproRelNotes: host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt + version: "2024-01-24" intact: host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt - version: "2023-10-07" + version: "2024-02-16" ## Conservation Scores conservation: @@ -123,17 +129,17 @@ download: ## Clinical Variant clinvar: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-02.xml.gz 
version: "2023-12-01" clinvarVariation: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/VCV_xml_old_format/ClinVarVariationRelease_2024-02.xml.gz clinvarSummary: host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - version: "2023-12-01" + version: "2024-03-01" clinvarVariationAllele: host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz - version: "2023-12-01" + version: "2024-03-01" clinvarEfoTerms: host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv cancerHotspot: @@ -159,17 +165,16 @@ download: host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip version: "1.3" cadd: -# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz - ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! 
- host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz - version: "1.7-pre" + host: https://krishna.gs.washington.edu/download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz + version: "1.7" gwasCatalog: ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e110_r2023-12-20' - host: https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations_ontology-annotated.tsv - version: "23-12-21" + host: https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/02/12/gwas-catalog-associations_ontology-annotated.tsv + version: "2024-02-12" hpo: - ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations + ## NOTE: Download manually from here now: https://hpo.jax.org/app/data/annotations host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt + version: "2024-03-01" disgenet: host: https://www.disgenet.org/static/disgenet_ap1/files/downloads files: @@ -187,16 +192,16 @@ download: ## OBO Ontologies hpoObo: host: http://purl.obolibrary.org/obo/hp.obo - version: "2023-12-01" + version: "2024-03-01" goObo: host: http://purl.obolibrary.org/obo/go/go-basic.obo - version: "2023-12-01" + version: "2024-03-01" doidObo: host: http://purl.obolibrary.org/obo/doid.obo - version: "2023-12-01" + version: "2024-03-01" mondoObo: host: http://purl.obolibrary.org/obo/mondo.obo - version: "2023-12-01" + version: "2024-03-01" ## Others pubmed: From 2be5f214f8843b6c6c0a2c493f79b3fe94b053d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 10:33:58 +0100 Subject: [PATCH 037/107] core: update pubmed URLs in the configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellbase-core/src/main/resources/configuration.yml 
b/cellbase-core/src/main/resources/configuration.yml index 2204acf270..3c2b6ee443 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -207,7 +207,7 @@ download: pubmed: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ files: - - pubmed22n[1..1114..4].xml.gz + - pubmed24n[1..1219..4].xml.gz pharmGKB: host: https://www.pharmgkb.org/downloads version: v1 From fe05795eeef4dddb402832ee863277904517160a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 11:10:13 +0100 Subject: [PATCH 038/107] core: update pubmed version in the configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 3c2b6ee443..b1d74cfd85 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -206,6 +206,7 @@ download: ## Others pubmed: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ + version: 2024 files: - pubmed24n[1..1219..4].xml.gz pharmGKB: From 50f7008d523d53188ab929fba113488ceb5ba56d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 11:24:57 +0100 Subject: [PATCH 039/107] core: improve Ontology downloader, #TASK-5775, #TASK-5564 --- .../java/org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/download/OntologyDownloadManager.java | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index f8ee4938e5..10c45ae64e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -127,6 +127,7 @@ 
public class EtlCommons { public static final String HPO_VERSION_FILE = "hpoVersion.json"; public static final String GO_VERSION_FILE = "goVersion.json"; public static final String DO_VERSION_FILE = "doVersion.json"; + public static final String MONDO_VERSION_FILE = "mondoVersion.json"; public static final String HGMD_FILE = "hgmd.vcf"; public static final String PUBMED_VERSION_FILENAME = "pubmedVersion.json"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 522be7b27d..e7e510fb91 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -43,24 +43,28 @@ public List download() throws IOException, InterruptedException { Files.createDirectories(oboFolder); String url = configuration.getDownload().getHpoObo().getHost(); + logger.info("Downloading {} ...", url); downloadFiles.add(downloadFile(url, oboFolder.resolve("hp.obo").toString())); saveVersionData(EtlCommons.OBO_DATA, "HPO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.HPO_VERSION_FILE)); + Collections.singletonList(url), oboFolder.resolve(EtlCommons.HPO_VERSION_FILE)); url = configuration.getDownload().getGoObo().getHost(); + logger.info("Downloading {} ...", url); downloadFiles.add(downloadFile(url, oboFolder.resolve("go-basic.obo").toString())); saveVersionData(EtlCommons.OBO_DATA, "GO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.GO_VERSION_FILE)); + Collections.singletonList(url), oboFolder.resolve(EtlCommons.GO_VERSION_FILE)); url = configuration.getDownload().getDoidObo().getHost(); + logger.info("Downloading {} ...", url); downloadFiles.add(downloadFile(url, oboFolder.resolve("doid.obo").toString())); 
saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE)); + Collections.singletonList(url), oboFolder.resolve(EtlCommons.DO_VERSION_FILE)); url = configuration.getDownload().getMondoObo().getHost(); + logger.info("Downloading {} ...", url); downloadFiles.add(downloadFile(url, oboFolder.resolve("mondo.obo").toString())); saveVersionData(EtlCommons.OBO_DATA, "MONDO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE)); + Collections.singletonList(url), oboFolder.resolve(EtlCommons.MONDO_VERSION_FILE)); return downloadFiles; } From a8a9328f16bd29fff27d569be6857cf97e9effd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 16:35:13 +0100 Subject: [PATCH 040/107] lib: take into account PubMed version from config file, and fix sonnar issues, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/download/PubMedDownloadManager.java | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 10c45ae64e..b06364b6e5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -92,6 +92,7 @@ public class EtlCommons { public static final String HGMD_DATA = "hgmd"; public static final String PUBMED_DATA = "pubmed"; + public static final String PUBMED_VERSION_FILE = PUBMED_DATA + "Version.json"; // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index b5edf0220b..e913539d5b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -27,9 +27,11 @@ import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.PUBMED_VERSION_FILE; + public class PubMedDownloadManager extends AbstractDownloadManager { - private static final String PUBMED_NAME = "PUBMED"; + private static final String PUBMED_NAME = "PubMed"; public PubMedDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); @@ -39,7 +41,7 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto public List download() throws IOException, InterruptedException { logger.info("Downloading PubMed XML files..."); - Path pubmedFolder = downloadFolder.resolve("pubmed"); + Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); Files.createDirectories(pubmedFolder); // Downloads PubMed XML files @@ -47,17 +49,17 @@ public List download() throws IOException, InterruptedException { String regexp = configuration.getDownload().getPubmed().getFiles().get(0); String[] name = regexp.split("[\\[\\]]"); String[] split = name[1].split("\\.\\."); - int start = Integer.valueOf(split[0]); - int end = Integer.valueOf(split[1]); - int padding = Integer.valueOf(split[2]); + int start = Integer.parseInt(split[0]); + int end = Integer.parseInt(split[1]); + int padding = Integer.parseInt(split[2]); - saveVersionData(EtlCommons.PUBMED_DATA, PUBMED_NAME, null, getTimeStamp(), Collections.singletonList(url), - pubmedFolder.resolve("pubmedVersion.json")); + saveVersionData(EtlCommons.PUBMED_DATA, PUBMED_NAME, 
configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), + Collections.singletonList(url), pubmedFolder.resolve(PUBMED_VERSION_FILE)); List list = new ArrayList<>(); for (int i = start; i <= end; i++) { String filename = name[0] + String.format("%0" + padding + "d", i) + name[2]; - logger.info("\tDownloading file " + filename); + logger.info("\tDownloading file {}", filename); list.add(downloadFile(url + "/" + filename, pubmedFolder.resolve(filename).toString())); } return list; From f84734e9c0d238e16e13e0c576262c593ee56a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 7 Mar 2024 17:25:30 +0100 Subject: [PATCH 041/107] lib: improve clinvar and gwas downloader by removing hardcode filenames and taking into account the version from config file, and fix sonnar issues, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 9 + .../lib/download/ClinicalDownloadManager.java | 180 +++++++++++------- 2 files changed, 117 insertions(+), 72 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index b06364b6e5..4d516ec273 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -54,14 +54,23 @@ public class EtlCommons { public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json"; public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant"; + @Deprecated public static final String CLINVAR_VERSION = "2022.11"; + @Deprecated public static final String CLINVAR_DATE = "2022-11"; + @Deprecated public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz"; + @Deprecated public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv"; + @Deprecated public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz"; + @Deprecated 
public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz"; + public static final String CLINVAR_VERSION_FILENAME = "clinvarVersion.json"; public static final String IARCTP53_FILE = "IARC-TP53.zip"; + @Deprecated public static final String GWAS_FILE = "gwas-catalog-associations_ontology-annotated.tsv"; + public static final String GWAS_VERSION_FILENAME = "gwasVersion.json"; public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz"; public static final String DBSNP_FILE = "All.vcf.gz"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index 580a855a19..1918f82be6 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -29,15 +29,23 @@ import java.net.URI; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION_FILENAME; +import static org.opencb.cellbase.lib.EtlCommons.GWAS_VERSION_FILENAME; + public class ClinicalDownloadManager extends AbstractDownloadManager { private static final String CLINVAR_NAME = "ClinVar"; private static final String GWAS_NAME = "GWAS catalog"; + /** + * @deprecated + */ + @Deprecated private static final String IARCTP53_NAME = "IARC TP53 Database"; @@ -63,39 +71,50 @@ public List downloadClinical() throws IOException, InterruptedExce logger.info("Downloading clinical information ..."); String url; + String filename; List downloadFiles = new ArrayList<>(); - Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER); + Path clinicalFolder = 
downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER).toAbsolutePath(); Files.createDirectories(clinicalFolder); logger.info("\t\tDownloading ClinVar files ..."); List clinvarUrls = new ArrayList<>(3); url = configuration.getDownload().getClinvar().getHost(); - - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); clinvarUrls.add(url); url = configuration.getDownload().getClinvarEfoTerms().getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_EFO_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); clinvarUrls.add(url); url = configuration.getDownload().getClinvarSummary().getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_SUMMARY_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); clinvarUrls.add(url); url = configuration.getDownload().getClinvarVariationAllele().getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); clinvarUrls.add(url); - saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, getClinVarVersion(), getTimeStamp(), 
clinvarUrls, - clinicalFolder.resolve("clinvarVersion.json")); + + saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, configuration.getDownload().getClinvar().getVersion(), + getTimeStamp(), clinvarUrls, clinicalFolder.resolve(CLINVAR_VERSION_FILENAME)); // Gwas catalog logger.info("\t\tDownloading GWAS catalog file ..."); DownloadProperties.URLProperties gwasCatalog = configuration.getDownload().getGwasCatalog(); url = gwasCatalog.getHost(); - downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.GWAS_FILE).toString())); + filename = Paths.get(url).getFileName().toString(); + logger.info("\t\tDownloading {} to {} ...", url, clinicalFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, clinicalFolder.resolve(filename).toString())); saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, GWAS_NAME, gwasCatalog.getVersion(), getTimeStamp(), - Collections.singletonList(url), clinicalFolder.resolve("gwasVersion.json")); + Collections.singletonList(url), clinicalFolder.resolve(GWAS_VERSION_FILENAME)); // List hgvsList = getDocmHgvsList(); // if (!hgvsList.isEmpty()) { @@ -139,87 +158,110 @@ public List downloadClinical() throws IOException, InterruptedExce // Collections.singletonList(url), clinicalFolder.resolve("iarctp53Version.json")); // } - if (Files.notExists(clinicalFolder.resolve("clinvar_chunks"))) { - Files.createDirectories(clinicalFolder.resolve("clinvar_chunks")); - splitClinvar(clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE), clinicalFolder.resolve("clinvar_chunks")); + final String chunkDir = "clinvar_chunks"; + if (Files.notExists(clinicalFolder.resolve(chunkDir))) { + Files.createDirectories(clinicalFolder.resolve(chunkDir)); + filename = Paths.get(configuration.getDownload().getClinvar().getHost()).getFileName().toString(); + logger.info("\t\tSplitting {} int {} ...", clinicalFolder.resolve(filename), clinicalFolder.resolve(chunkDir)); + splitClinvar(clinicalFolder.resolve(filename), 
clinicalFolder.resolve(chunkDir)); } return downloadFiles; } - return null; + return Collections.emptyList(); } private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { - BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath); - PrintWriter pw = null; - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - pw.print(""); - pw.close(); - chunk++; + if (line.trim().startsWith("")) { + inEntry = false; + if (count % 10000 == 0) { + pw.print(""); + pw.close(); + chunk++; + } } } + pw.print(""); + pw.close(); } - pw.print(""); - pw.close(); - br.close(); } + /** + * @deprecated + * @param docmIndexHtml + * @return + */ + @Deprecated private String getDocmVersion(Path docmIndexHtml) { return getVersionFromVersionLine(docmIndexHtml, " hgvsList, Path path) throws IOException, InterruptedException { - try (BufferedWriter bufferedWriter = Files.newBufferedWriter(path)) { - Client client = ClientBuilder.newClient(); - WebTarget restUrlBase = client - .target(URI.create(configuration.getDownload().getDocm().getHost() + "v1/variants")); - - logger.info("Querying DOCM REST API to get detailed data for all their variants"); - int counter = 0; - for (String hgvs : hgvsList) { - WebTarget callUrl = restUrlBase.path(hgvs + ".json"); - String jsonString = callUrl.request().get(String.class); - bufferedWriter.write(jsonString + "\n"); - - if (counter % 10 == 0) { - logger.info("{} DOCM variants saved", counter); - } - // Wait 1/3 of a second to avoid saturating their REST server - also avoid getting banned - Thread.sleep(300); - - counter++; - } - logger.info("Finished. 
{} DOCM variants saved at {}", counter, path); - } - } - - /** - * @deprecated - * @return - * @throws IOException - */ - @Deprecated - private List getDocmHgvsList() throws IOException { - Client client = ClientBuilder.newClient(); - WebTarget restUrl = client - .target(URI.create(configuration.getDownload().getDocm().getHost() + "v1/variants.json")); - - String jsonString; - logger.info("Getting full list of DOCM hgvs from: {}", restUrl.getUri().toURL()); - jsonString = restUrl.request().get(String.class); - - List> responseMap = parseResult(jsonString); - List hgvsList = new ArrayList<>(responseMap.size()); - for (Map document : responseMap) { - if (document.containsKey("reference_version") - && document.get("reference_version").equalsIgnoreCase(assemblyConfiguration.getName())) { - hgvsList.add(document.get("hgvs")); - } - } - logger.info("{} hgvs found", hgvsList.size()); - - return hgvsList; - } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CoreDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CoreDownloadManager.java deleted file mode 100644 index aca27ff2e8..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CoreDownloadManager.java +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.opencb.cellbase.lib.download; - -import org.apache.commons.io.FilenameUtils; -import org.apache.commons.lang.StringUtils; -import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.SpeciesConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; - -import java.io.*; -import java.net.URI; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.*; - -@Deprecated -public class CoreDownloadManager extends DownloadManager { - - private static final String ENSEMBL_NAME = "ENSEMBL"; - private static final String UNIPROT_NAME = "UniProt"; - private static final String INTACT_NAME = "IntAct"; - private static final String INTERPRO_NAME = "InterPro"; - private static final String GERP_NAME = "GERP++"; - private static final String PHASTCONS_NAME = "PhastCons"; - private static final String PHYLOP_NAME = "PhyloP"; - private static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; - private static final String HPO_NAME = "HPO"; - private static final String DISGENET_NAME = "DisGeNET"; - private static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; - private static final String DGIDB_NAME = "DGIdb"; - private static final String GNOMAD_NAME = "gnomAD"; - - private static final HashMap GENE_UNIPROT_XREF_FILES = new HashMap() { - { - put("Homo sapiens", "HUMAN_9606_idmapping_selected.tab.gz"); - put("Mus musculus", "MOUSE_10090_idmapping_selected.tab.gz"); - put("Rattus norvegicus", "RAT_10116_idmapping_selected.tab.gz"); - put("Danio rerio", "DANRE_7955_idmapping_selected.tab.gz"); - put("Drosophila melanogaster", "DROME_7227_idmapping_selected.tab.gz"); - put("Saccharomyces cerevisiae", "YEAST_559292_idmapping_selected.tab.gz"); - } - }; - - public CoreDownloadManager(String species, String 
assembly, Path targetDirectory, CellBaseConfiguration configuration) - throws IOException, CellBaseException { - super(species, assembly, targetDirectory, configuration); - } - - public CoreDownloadManager(CellBaseConfiguration configuration, Path targetDirectory, SpeciesConfiguration speciesConfiguration, - SpeciesConfiguration.Assembly assembly) throws IOException, CellBaseException { - super(configuration, targetDirectory, speciesConfiguration, assembly); - } - - public void downloadReferenceGenome() throws IOException, InterruptedException { - logger.info("Downloading genome information ..."); - Path sequenceFolder = downloadFolder.resolve("genome"); - Files.createDirectories(sequenceFolder); - - // Reference genome sequences are downloaded from Ensembl - // New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead - String url = ensemblHostUrl + "/" + ensemblRelease; - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - url = url + "/fasta/" + speciesShortName + "/dna/*.dna.primary_assembly.fa.gz"; - } else { - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - url = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } - url = url + "/fasta/"; - if (configuration.getSpecies().getBacteria().contains(speciesConfiguration)) { - // WARN: assuming there's just one assembly - url = url + speciesConfiguration.getAssemblies().get(0).getEnsemblCollection() + "/"; - } - url = url + speciesShortName + "/dna/*.dna.toplevel.fa.gz"; - } - - String outputFileName = StringUtils.capitalize(speciesShortName) + "." 
+ assemblyConfiguration.getName() + ".fa.gz"; - Path outputPath = sequenceFolder.resolve(outputFileName); - downloadFile(url, outputPath.toString()); - logger.info("Saving reference genome version data at {}", sequenceFolder.resolve("genomeVersion.json")); - saveVersionData(EtlCommons.GENOME_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve("genomeVersion.json")); - } - - public void downloadEnsemblGene()throws IOException, InterruptedException { - logger.info("Downloading gene information ..."); - Path geneFolder = downloadFolder.resolve("gene"); - Files.createDirectories(geneFolder); - - downloadEnsemblData(geneFolder); - downloadDrugData(geneFolder); - downloadGeneUniprotXref(geneFolder); - downloadGeneExpressionAtlas(geneFolder); - downloadGeneDiseaseAnnotation(geneFolder); - downloadGnomadConstraints(geneFolder); - downloadGO(geneFolder); - // FIXME -// runGeneExtraInfo(geneFolder); - } - - private void downloadGO(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading go annotation..."); - String url = configuration.getDownload().getGoAnnotation().getHost(); - downloadFile(url, geneFolder.resolve("goa_human.gaf.gz").toString()); - saveVersionData(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, null, getTimeStamp(), Collections.singletonList(url), - buildFolder.resolve("goAnnotationVersion.json")); - } - } - - public void downloadObo() throws IOException, InterruptedException { - logger.info("Downloading obo files ..."); - - Path oboFolder = downloadFolder.resolve("obo"); - Files.createDirectories(oboFolder); - - String url = configuration.getDownload().getHpoObo().getHost(); - downloadFile(url, oboFolder.resolve("hp.obo").toString()); - - url = configuration.getDownload().getGoObo().getHost(); - downloadFile(url, oboFolder.resolve("go-basic.obo").toString()); - } - - private void 
downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading gnomAD constraints data..."); - String url = configuration.getDownload().getGnomadConstraints().getHost(); - downloadFile(url, geneFolder.resolve("gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz").toString()); - saveVersionData(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload(). - getGnomadConstraints().getVersion(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve("gnomadVersion.json")); - } - } - private void downloadDrugData(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading drug-gene data..."); - String url = configuration.getDownload().getDgidb().getHost(); - downloadFile(url, geneFolder.resolve("dgidb.tsv").toString()); - saveVersionData(EtlCommons.GENE_DATA, DGIDB_NAME, null, getTimeStamp(), Collections.singletonList(url), - buildFolder.resolve("dgidbVersion.json")); - } - } - - private void downloadEnsemblData(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene Ensembl data (gtf, pep, cdna, motifs) ..."); - List downloadedUrls = new ArrayList<>(4); - - String ensemblHost = ensemblHostUrl + "/" + ensemblRelease; - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - ensemblHost = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } - - String bacteriaCollectionPath = ""; - if (configuration.getSpecies().getBacteria().contains(speciesConfiguration)) { - // WARN: assuming there's just one assembly - bacteriaCollectionPath = speciesConfiguration.getAssemblies().get(0).getEnsemblCollection() + "/"; - } - - // Ensembl leaves now several GTF files in the FTP folder, we need to build a more accurate URL - // to download the correct GTF file. 
- String version = ensemblRelease.split("-")[1]; - String url = ensemblHost + "/gtf/" + bacteriaCollectionPath + speciesShortName + "/*" + version + ".gtf.gz"; - String fileName = geneFolder.resolve(speciesShortName + ".gtf.gz").toString(); - downloadFile(url, fileName); - downloadedUrls.add(url); - - url = ensemblHost + "/fasta/" + bacteriaCollectionPath + speciesShortName + "/pep/*.pep.all.fa.gz"; - fileName = geneFolder.resolve(speciesShortName + ".pep.all.fa.gz").toString(); - downloadFile(url, fileName); - downloadedUrls.add(url); - - url = ensemblHost + "/fasta/" + bacteriaCollectionPath + speciesShortName + "/cdna/*.cdna.all.fa.gz"; - fileName = geneFolder.resolve(speciesShortName + ".cdna.all.fa.gz").toString(); - downloadFile(url, fileName); - downloadedUrls.add(url); - - //ftp://ftp.ensembl.org/pub/release-99/regulation/homo_sapiens/MotifFeatures/Homo_sapiens.GRCh38.motif_features.gff.gz -// url = ensemblHost + "/regulation/" + speciesShortName + "/MotifFeatures/*.motif_features.gff.gz"; -// Path outputFile = geneFolder.resolve("motif_features.gff.gz"); -// downloadFile(url, outputFile.toString()); -// downloadedUrls.add(url); - - - saveVersionData(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), downloadedUrls, - buildFolder.resolve("ensemblCoreVersion.json")); - } - - private void downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading UniProt ID mapping ..."); - - if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { - String geneGtfUrl = configuration.getDownload().getGeneUniprotXref().getHost() + "/" - + GENE_UNIPROT_XREF_FILES.get(speciesConfiguration.getScientificName()); - downloadFile(geneGtfUrl, geneFolder.resolve("idmapping_selected.tab.gz").toString()); - downloadFile(getUniProtReleaseNotesUrl(), geneFolder.resolve("uniprotRelnotes.txt").toString()); - - saveVersionData(EtlCommons.GENE_DATA, UNIPROT_NAME, - 
getUniProtRelease(geneFolder.resolve("uniprotRelnotes.txt").toString()), getTimeStamp(), - Collections.singletonList(geneGtfUrl), buildFolder.resolve("uniprotXrefVersion.json")); - } - } - - private String getUniProtRelease(String relnotesFilename) { - Path path = Paths.get(relnotesFilename); - Files.exists(path); - try { - // The first line at the relnotes.txt file contains the UniProt release - BufferedReader reader = Files.newBufferedReader(path, Charset.defaultCharset()); - String release = reader.readLine().split(" ")[2]; - reader.close(); - return release; - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } - - private String getUniProtReleaseNotesUrl() { - return URI.create(configuration.getDownload().getGeneUniprotXref().getHost()).resolve("../../../").toString() - + "/relnotes.txt"; - } - - private void downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene expression atlas ..."); - - String geneGtfUrl = configuration.getDownload().getGeneExpressionAtlas().getHost(); - downloadFile(geneGtfUrl, geneFolder.resolve("allgenes_updown_in_organism_part.tab.gz").toString()); - - saveVersionData(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, getGeneExpressionAtlasVersion(), getTimeStamp(), - Collections.singletonList(geneGtfUrl), buildFolder.resolve("geneExpressionAtlasVersion.json")); - - } - - private String getGeneExpressionAtlasVersion() { - return FilenameUtils.getBaseName(configuration.getDownload().getGeneExpressionAtlas().getHost()) - .split("_")[5].replace(".tab", ""); - } - - private void downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene disease annotation ..."); - - String host = configuration.getDownload().getHpo().getHost(); - String fileName = StringUtils.substringAfterLast(host, "/"); - downloadFile(host, geneFolder.resolve(fileName).toString()); - saveVersionData(EtlCommons.GENE_DATA, 
HPO_NAME, null, getTimeStamp(), Collections.singletonList(host), - buildFolder.resolve("hpoVersion.json")); - - host = configuration.getDownload().getDisgenet().getHost(); - List files = configuration.getDownload().getDisgenet().getFiles(); - for (String file : files) { - String outputFile = file.equalsIgnoreCase("readme.txt") ? "disgenetReadme.txt" : file; - downloadFile(host + "/" + file, geneFolder.resolve(outputFile).toString()); - } - - saveVersionData(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, - getVersionFromVersionLine(geneFolder.resolve("disgenetReadme.txt"), "(version"), getTimeStamp(), - Collections.singletonList(host), buildFolder.resolve("disgenetVersion.json")); - } - - private void runGeneExtraInfo(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene extra info ..."); - - String geneExtraInfoLogFile = geneFolder.resolve("gene_extra_info.log").toString(); - List args = new ArrayList<>(); - args.addAll(Arrays.asList("--species", speciesConfiguration.getScientificName(), "--assembly", assemblyConfiguration.getName(), - "--outdir", geneFolder.toAbsolutePath().toString(), - "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs())); - - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration) - && !speciesConfiguration.getScientificName().equals("Drosophila melanogaster")) { - args.add("--phylo"); - args.add("no-vertebrate"); - } - - File ensemblScriptsFolder = new File(System.getProperty("basedir") + "/bin/ensembl-scripts/"); - - // run gene_extra_info.pl - boolean geneExtraInfoDownloaded = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, - "./gene_extra_info.pl", - args, - geneExtraInfoLogFile); - - // check output - if (geneExtraInfoDownloaded) { - logger.info("Gene extra files created OK"); - } else { - logger.error("Gene extra info for " + speciesConfiguration.getScientificName() + " cannot be downloaded"); - } - } - - /** - * This method downloads Gerp, 
PhastCons and PhyloP data from UCSC for Human and Mouse species. - - * @throws IOException if there is an error writing to a file - * @throws InterruptedException if there is an error downloading files - */ - public void downloadConservation() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "conservation")) { - return; - } - - logger.info("Downloading conservation information ..."); - Path conservationFolder = downloadFolder.resolve("conservation"); - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve("phastCons")); - Files.createDirectories(conservationFolder.resolve("phylop")); - Files.createDirectories(conservationFolder.resolve("gerp")); - - String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", - "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M", }; - - if (assemblyConfiguration.getName().equalsIgnoreCase("GRCh38")) { - logger.info("Downloading GERP++ ..."); - downloadFile(configuration.getDownload().getGerp().getHost(), - conservationFolder.resolve(EtlCommons.GERP_SUBDIRECTORY + "/" + EtlCommons.GERP_FILE).toAbsolutePath().toString()); - saveVersionData(EtlCommons.CONSERVATION_DATA, GERP_NAME, null, getTimeStamp(), - Collections.singletonList(configuration.getDownload().getGerp().getHost()), - buildFolder.resolve("gerpVersion.json")); - - logger.info("Downloading phastCons and PhyloP ..."); - String url = configuration.getDownload().getConservation().getHost() + "/hg38"; - List phastconsUrls = new ArrayList<>(chromosomes.length); - List phyloPUrls = new ArrayList<>(chromosomes.length); - for (String chromosome : chromosomes) { - String phastConsUrl = url + "/phastCons100way/hg38.100way.phastCons/chr" + chromosome + ".phastCons100way.wigFix.gz"; - downloadFile(phastConsUrl, conservationFolder.resolve("phastCons").resolve("chr" + chromosome 
- + ".phastCons100way.wigFix.gz").toString()); - phastconsUrls.add(phastConsUrl); - - String phyloPUrl = url + "/phyloP100way/hg38.100way.phyloP100way/chr" + chromosome + ".phyloP100way.wigFix.gz"; - downloadFile(phyloPUrl, conservationFolder.resolve("phylop").resolve("chr" + chromosome - + ".phyloP100way.wigFix.gz").toString()); - phyloPUrls.add(phyloPUrl); - } - saveVersionData(EtlCommons.CONSERVATION_DATA, PHASTCONS_NAME, null, getTimeStamp(), phastconsUrls, - buildFolder.resolve("phastConsVersion.json")); - saveVersionData(EtlCommons.CONSERVATION_DATA, PHYLOP_NAME, null, getTimeStamp(), phyloPUrls, - buildFolder.resolve("phyloPVersion.json")); - } - } - - if (speciesConfiguration.getScientificName().equals("Mus musculus")) { - Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve("phastCons")); - Files.createDirectories(conservationFolder.resolve("phylop")); - - String url = configuration.getDownload().getConservation().getHost() + "/mm10"; - String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", - "15", "16", "17", "18", "19", "X", "Y", "M", }; - List phastconsUrls = new ArrayList<>(chromosomes.length); - List phyloPUrls = new ArrayList<>(chromosomes.length); - for (String chromosome : chromosomes) { - String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; - downloadFile(phastConsUrl, conservationFolder.resolve("phastCons").resolve("chr" + chromosome - + ".phastCons60way.wigFix.gz").toString()); - phastconsUrls.add(phastConsUrl); - String phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; - downloadFile(phyloPUrl, conservationFolder.resolve("phylop").resolve("chr" + chromosome - + ".phyloP60way.wigFix.gz").toString()); - phyloPUrls.add(phyloPUrl); - } - saveVersionData(EtlCommons.CONSERVATION_DATA, PHASTCONS_NAME, null, getTimeStamp(), phastconsUrls, - 
buildFolder.resolve("phastConsVersion.json")); - saveVersionData(EtlCommons.CONSERVATION_DATA, PHYLOP_NAME, null, getTimeStamp(), phyloPUrls, - buildFolder.resolve("phastConsVersion.json")); - } - } - - - /** - * This method downloads UniProt, IntAct and Interpro data from EMBL-EBI. - * - * @throws IOException if there is an error writing to a file - * @throws InterruptedException if there is an error downloading files - */ - public void downloadProtein() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "protein")) { - return; - } - logger.info("Downloading protein information ..."); - Path proteinFolder = downloadFolder.resolve("protein"); - Files.createDirectories(proteinFolder); - - String url = configuration.getDownload().getUniprot().getHost(); - downloadFile(url, proteinFolder.resolve("uniprot_sprot.xml.gz").toString()); - String relNotesUrl = configuration.getDownload().getUniprotRelNotes().getHost(); - downloadFile(relNotesUrl, proteinFolder.resolve("uniprotRelnotes.txt").toString()); - Files.createDirectories(proteinFolder.resolve("uniprot_chunks")); - splitUniprot(proteinFolder.resolve("uniprot_sprot.xml.gz"), proteinFolder.resolve("uniprot_chunks")); - saveVersionData(EtlCommons.PROTEIN_DATA, UNIPROT_NAME, getLine(proteinFolder.resolve("uniprotRelnotes.txt"), 1), - getTimeStamp(), Collections.singletonList(url), buildFolder.resolve("uniprotVersion.json")); - -// url = configuration.getDownload().getIntact().getHost(); -// downloadFile(url, proteinFolder.resolve("intact.txt").toString()); -// saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, null, getTimeStamp(), Collections.singletonList(url), -// proteinFolder.resolve("intactVersion.json")); -// -// url = configuration.getDownload().getInterpro().getHost(); -// downloadFile(url, proteinFolder.resolve("protein2ipr.dat.gz").toString()); -// relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost(); -// downloadFile(relNotesUrl, 
proteinFolder.resolve("interproRelnotes.txt").toString()); -// saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5), -// getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("interproVersion.json")); - } - - private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException { - BufferedReader br = FileUtils.newBufferedReader(uniprotFilePath); - PrintWriter pw = null; - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - pw.print(""); - pw.close(); - chunk++; - } - } - } - pw.print(""); - pw.close(); - br.close(); - } - -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadManager.java deleted file mode 100644 index ab1d090294..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/DownloadManager.java +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.opencb.cellbase.lib.download; - -import com.beust.jcommander.ParameterException; -import com.fasterxml.jackson.core.util.DefaultPrettyPrinter; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectReader; -import com.fasterxml.jackson.databind.ObjectWriter; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; -import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.SpeciesConfiguration; -import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.cellbase.core.utils.SpeciesUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.sql.Timestamp; -import java.text.SimpleDateFormat; -import java.time.LocalDateTime; -import java.util.*; - -@Deprecated -public class DownloadManager { - - - private static final String CADD_NAME = "CADD"; - private static final String DGV_NAME = "DGV"; -// private static final String GWAS_NAME = "Gwas Catalog"; -// private static final String DBSNP_NAME = "dbSNP"; -// private static final String REACTOME_NAME = "Reactome"; - - private static final String GNOMAD_NAME = "gnomAD"; - - protected String species; - protected String assembly; - protected Path outdir; - protected CellBaseConfiguration configuration; - - protected SpeciesConfiguration speciesConfiguration; - protected String speciesShortName; - protected String ensemblHostUrl; - protected SpeciesConfiguration.Assembly assemblyConfiguration; - protected String ensemblVersion; - protected String ensemblRelease; - protected Path downloadFolder; - protected Path 
buildFolder; // /_/generated-json - protected Logger logger; - - public DownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) - throws IOException, CellBaseException { - this.species = species; - this.assembly = assembly; - this.outdir = outdir; - this.configuration = configuration; - - this.init(); - } - - @Deprecated - public DownloadManager(CellBaseConfiguration configuration, Path targetDirectory, SpeciesConfiguration speciesConfiguration, - SpeciesConfiguration.Assembly assembly) throws IOException { - logger = LoggerFactory.getLogger(this.getClass()); - - this.configuration = configuration; - this.speciesConfiguration = speciesConfiguration; -// assemblyName = assembly.getName(); - - // Output folder creation - speciesShortName = SpeciesUtils.getSpeciesShortname(speciesConfiguration); - // /_ - Path speciesFolder = targetDirectory.resolve(speciesShortName + "_" + assembly.getName().toLowerCase()); - // /_/download - downloadFolder = targetDirectory.resolve(speciesFolder + "/download"); - makeDir(downloadFolder); - - ensemblHostUrl = getEnsemblURL(speciesConfiguration); - ensemblVersion = assembly.getEnsemblVersion(); - ensemblRelease = "release-" + ensemblVersion.split("_")[0]; - } - - private void init() throws CellBaseException, IOException { - logger = LoggerFactory.getLogger(this.getClass()); - - // Check Species - this.speciesConfiguration = SpeciesUtils.getSpeciesConfiguration(configuration, species); - if (speciesConfiguration == null) { - throw new CellBaseException("Invalid species: '" + species + "'"); - } - this.speciesShortName = SpeciesUtils.getSpeciesShortname(speciesConfiguration); - this.ensemblHostUrl = getEnsemblURL(speciesConfiguration); - - // Check assembly and get Ensembl version - if (StringUtils.isEmpty(assembly)) { - this.assemblyConfiguration = SpeciesUtils.getDefaultAssembly(speciesConfiguration); - } else { - this.assemblyConfiguration = SpeciesUtils.getAssembly(speciesConfiguration, 
assembly); - } - if (assemblyConfiguration == null) { - throw new CellBaseException("Invalid assembly: '" + assembly + "'"); - } - this.ensemblVersion = assemblyConfiguration.getEnsemblVersion(); - this.ensemblRelease = "release-" + ensemblVersion.split("_")[0]; - - // Prepare outdir - Path speciesFolder = outdir.resolve(speciesShortName + "_" + assemblyConfiguration.getName().toLowerCase()); - downloadFolder = outdir.resolve(speciesFolder + "/download"); - Files.createDirectories(downloadFolder); - - // /_/generated_json - buildFolder = outdir.resolve(speciesFolder + "/generated_json"); - Files.createDirectories(buildFolder); - - logger.info("Processing species " + speciesConfiguration.getScientificName()); - } - - @Deprecated - public DownloadFile downloadStructuralVariants() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "svs")) { - return null; - } - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading DGV data ..."); - - Path structuralVariantsFolder = downloadFolder.resolve(EtlCommons.STRUCTURAL_VARIANTS_FOLDER); - Files.createDirectories(structuralVariantsFolder); - String sourceFilename = (assemblyConfiguration.getName().equalsIgnoreCase("grch37") ? 
"GRCh37_hg19" : "GRCh38_hg38") - + "_variants_2016-05-15.txt"; - String url = configuration.getDownload().getDgv().getHost() + "/" + sourceFilename; - saveVersionData(EtlCommons.STRUCTURAL_VARIANTS_DATA, DGV_NAME, getDGVVersion(sourceFilename), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.DGV_VERSION_FILE)); - return downloadFile(url, structuralVariantsFolder.resolve(EtlCommons.DGV_FILE).toString()); - } - return null; - } - - private String getDGVVersion(String sourceFilename) { - return sourceFilename.split("\\.")[0].split("_")[3]; - } - - protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) { - boolean hasInfo = true; - if (sp.getData() == null || !sp.getData().contains(info)) { - logger.warn("Species '{}' has no '{}' information available to download", sp.getScientificName(), info); - hasInfo = false; - } - return hasInfo; - } - - protected String getTimeStamp() { - return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); - } - - protected void saveVersionData(String data, String source, String version, String date, List url, Path outputFilePath) - throws IOException { - Map versionDataMap = new HashMap<>(); - versionDataMap.put("data", data); - versionDataMap.put("source", source); - versionDataMap.put("version", version); - versionDataMap.put("downloadDate", date); - versionDataMap.put("uRL", url); - - ObjectMapper jsonObjectMapper = new ObjectMapper(); - jsonObjectMapper.writeValue(outputFilePath.toFile(), versionDataMap); - } - - protected String getLine(Path readmePath, int lineNumber) { - Files.exists(readmePath); - try { - BufferedReader reader = Files.newBufferedReader(readmePath, Charset.defaultCharset()); - String line = null; - for (int i = 0; i < lineNumber; i++) { - line = reader.readLine(); - } - reader.close(); - return line; - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } - - protected List> parseResult(String json) throws 
IOException { - ObjectMapper jsonObjectMapper = new ObjectMapper(); - jsonObjectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - ObjectReader reader = jsonObjectMapper - .readerFor(jsonObjectMapper.getTypeFactory().constructCollectionType(List.class, Map.class)); - return reader.readValue(json); - } - - protected String getPhylo(SpeciesConfiguration sp) { - if (configuration.getSpecies().getVertebrates().contains(sp)) { - return "vertebrates"; - } else if (configuration.getSpecies().getMetazoa().contains(sp)) { - return "metazoa"; - } else if (configuration.getSpecies().getFungi().contains(sp)) { - return "fungi"; - } else if (configuration.getSpecies().getProtist().contains(sp)) { - return "protists"; - } else if (configuration.getSpecies().getPlants().contains(sp)) { - return "plants"; - } else if (configuration.getSpecies().getVirus().contains(sp)) { - return "virus"; - } else if (configuration.getSpecies().getBacteria().contains(sp)) { - return "bacteria"; - } else { - throw new ParameterException("Species " + sp.getScientificName() + " not associated to any phylo in the configuration file"); - } - } - - public DownloadFile downloadCaddScores() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "variation_functional_score")) { - return null; - } - if (speciesConfiguration.getScientificName().equals("Homo sapiens") && assemblyConfiguration.getName().equalsIgnoreCase("GRCh37")) { - logger.info("Downloading CADD scores information ..."); - - Path variationFunctionalScoreFolder = downloadFolder.resolve("variation_functional_score"); - Files.createDirectories(variationFunctionalScoreFolder); - - // Downloads CADD scores - String url = configuration.getDownload().getCadd().getHost(); - - saveVersionData(EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, CADD_NAME, url.split("/")[5], getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve("caddVersion.json")); - return downloadFile(url, 
variationFunctionalScoreFolder.resolve("whole_genome_SNVs.tsv.gz").toString()); - } - return null; - } - - protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException { - return downloadFile(url, outputFileName, null); - } - -// protected void downloadFiles(String host, List fileNames) throws IOException, InterruptedException { -// downloadFiles(host, fileNames, fileNames); -// } - -// protected void downloadFiles(String host, List fileNames, List ouputFileNames) -// throws IOException, InterruptedException { -// for (int i = 0; i < fileNames.size(); i++) { -// downloadFile(host + "/" + fileNames.get(i), ouputFileNames.get(i), null); -// } -// } - - protected DownloadFile downloadFile(String url, String outputFileName, List wgetAdditionalArgs) - throws IOException, InterruptedException { - DownloadFile downloadFileInfo = new DownloadFile(url, outputFileName, Timestamp.valueOf(LocalDateTime.now()).toString()); - Long startTime = System.currentTimeMillis(); - if (Paths.get(outputFileName).toFile().exists()) { - logger.warn("File '{}' is already downloaded", outputFileName); - setDownloadStatusAndMessage(outputFileName, downloadFileInfo, "File '" + outputFileName + "' is already downloaded", true); - } else { - final String outputLog = outputFileName + ".log"; - List wgetArgs = new ArrayList<>(Arrays.asList("--tries=10", url, "-O", outputFileName, "-o", outputLog)); - if (wgetAdditionalArgs != null && !wgetAdditionalArgs.isEmpty()) { - wgetArgs.addAll(wgetAdditionalArgs); - } - boolean downloaded = EtlCommons.runCommandLineProcess(null, "wget", wgetArgs, outputLog); - setDownloadStatusAndMessage(outputFileName, downloadFileInfo, outputLog, downloaded); - } - downloadFileInfo.setElapsedTime(startTime, System.currentTimeMillis()); - return downloadFileInfo; - } - - private void setDownloadStatusAndMessage(String outputFileName, DownloadFile downloadFile, String outputLog, boolean downloaded) { - if (downloaded) { - 
boolean validFileSize = validateDownloadFile(downloadFile, outputFileName, outputLog); - if (validFileSize) { - downloadFile.setStatus(DownloadFile.Status.OK); - downloadFile.setMessage("File downloaded successfully"); - } else { - downloadFile.setStatus(DownloadFile.Status.ERROR); - downloadFile.setMessage("Expected downloaded file size " + downloadFile.getExpectedFileSize() - + ", Actual file size " + downloadFile.getActualFileSize()); - } - } else { - downloadFile.setMessage("See full error message in " + outputLog); - downloadFile.setStatus(DownloadFile.Status.ERROR); - // because we use the -O flag, a file will be written, even on error. See #467 -// Files.deleteIfExists((new File(outputFileName)).toPath()); - } - } - - public void writeDownloadLogFile(List downloadFiles) throws IOException { - ObjectMapper mapper = new ObjectMapper(); - ObjectWriter writer = mapper.writer(new DefaultPrettyPrinter()); - writer.writeValue(new File(downloadFolder + "/download_log.json"), downloadFiles); - } - - private boolean validateDownloadFile(DownloadFile downloadFile, String outputFileName, String outputFileLog) { - long expectedFileSize = getExpectedFileSize(outputFileLog); - long actualFileSize = FileUtils.sizeOf(new File(outputFileName)); - downloadFile.setActualFileSize(actualFileSize); - downloadFile.setExpectedFileSize(expectedFileSize); - return expectedFileSize == actualFileSize; - } - - private int getExpectedFileSize(String outputFileLog) { - try (BufferedReader reader = new BufferedReader(new FileReader(outputFileLog))) { - String line = null; - while ((line = reader.readLine()) != null) { - // looking for: Length: 13846591 (13M) - if (line.startsWith("Length:")) { - String[] parts = line.split("\\s"); - return Integer.parseInt(parts[1]); - } - } - } catch (Exception e) { - System.err.println(e); - } - return 0; - } - - protected String getVersionFromVersionLine(Path path, String tag) { - Files.exists(path); - try { - BufferedReader reader = 
Files.newBufferedReader(path, Charset.defaultCharset()); - String line = reader.readLine(); - // There shall be a line at the README.txt containing the version. - // e.g. The files in the current directory contain the data corresponding to the latest release - // (version 4.0, April 2016). ... - while (line != null) { - // tag specifies a certain string that must be found within the line supposed to contain the version - // info - if (line.contains(tag)) { - String version = line.split("\\(")[1].split("\\)")[0]; - reader.close(); - return version; - } - line = reader.readLine(); - } - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } - - @Deprecated - private void makeDir(Path folderPath) throws IOException { - if (!Files.exists(folderPath)) { - Files.createDirectories(folderPath); - } - } - - @Deprecated - private String getEnsemblURL(SpeciesConfiguration sp) { - // We need to find which is the correct Ensembl host URL. - // This can different depending on if is a vertebrate species. 
- String ensemblHostUrl; - if (configuration.getSpecies().getVertebrates().contains(sp)) { - ensemblHostUrl = configuration.getDownload().getEnsembl().getUrl().getHost(); - } else { - ensemblHostUrl = configuration.getDownload().getEnsemblGenomes().getUrl().getHost(); - } - return ensemblHostUrl; - } -} - - diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 3f90493855..d66f149c04 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -127,7 +127,7 @@ private List downloadEnsemblData(Path geneFolder) throws IOExcepti downloadFiles.add(downloadFile(url, fileName)); downloadedUrls.add(url); - saveVersionData(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), downloadedUrls, + saveDataSource(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), downloadedUrls, geneFolder.resolve(ENSEMBL_CORE_VERSION_FILENAME)); return downloadFiles; @@ -179,7 +179,7 @@ private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLPrope String version = urlProperties.getVersion(); String filename = getUrlFilename(url); Path outputPath = refSeqFolder.resolve(filename); - saveVersionData(EtlCommons.REFSEQ_DATA, name, version, timeStamp, Collections.singletonList(url), + saveDataSource(EtlCommons.REFSEQ_DATA, name, version, timeStamp, Collections.singletonList(url), refSeqFolder.resolve(versionFilename)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); @@ -190,7 +190,7 @@ private DownloadFile downloadMane(Path geneFolder) throws IOException, Interrupt if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading MANE Select ..."); String url = configuration.getDownload().getManeSelect().getHost(); - 
saveVersionData(EtlCommons.GENE_DATA, MANE_SELECT_NAME, configuration.getDownload().getManeSelect().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, MANE_SELECT_NAME, configuration.getDownload().getManeSelect().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(MANE_SELECT_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -204,7 +204,7 @@ private DownloadFile downloadLrg(Path geneFolder) throws IOException, Interrupte if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading LRG data ..."); String url = configuration.getDownload().getLrg().getHost(); - saveVersionData(EtlCommons.GENE_DATA, LRG_NAME, configuration.getDownload().getLrg().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, LRG_NAME, configuration.getDownload().getLrg().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(LRG_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -218,7 +218,7 @@ private DownloadFile downloadHgnc(Path geneFolder) throws IOException, Interrupt if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading HGNC data ..."); String url = configuration.getDownload().getHgnc().getHost(); - saveVersionData(GENE_DATA, HGNC_GENE_NAME, configuration.getDownload().getHgnc().getVersion(), + saveDataSource(GENE_DATA, HGNC_GENE_NAME, configuration.getDownload().getHgnc().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(HGNC_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -232,7 +232,7 @@ private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading Cancer Hotspot ..."); String url = configuration.getDownload().getCancerHotspot().getHost(); - saveVersionData(EtlCommons.GENE_DATA, CANCER_HOTSPOT_NAME, 
configuration.getDownload().getHgnc().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, CANCER_HOTSPOT_NAME, configuration.getDownload().getHgnc().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(CANCER_HOTSPOT_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -246,7 +246,7 @@ private DownloadFile downloadGO(Path geneFolder) throws IOException, Interrupted if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading GO annotation..."); String url = configuration.getDownload().getGoAnnotation().getHost(); - saveVersionData(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, configuration.getDownload().getGoAnnotation().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, configuration.getDownload().getGoAnnotation().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GO_ANNOTATION_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -260,7 +260,7 @@ private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOExcepti if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading gnomAD constraints data..."); String url = configuration.getDownload().getGnomadConstraints().getHost(); - saveVersionData(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload().getGnomadConstraints().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload().getGnomadConstraints().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GNOMAD_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -274,7 +274,7 @@ private DownloadFile downloadDrugData(Path geneFolder) throws IOException, Inter if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading drug-gene data..."); String url = configuration.getDownload().getDgidb().getHost(); - 
saveVersionData(EtlCommons.GENE_DATA, DGIDB_NAME, configuration.getDownload().getDgidb().getVersion(), getTimeStamp(), + saveDataSource(EtlCommons.GENE_DATA, DGIDB_NAME, configuration.getDownload().getDgidb().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DGIDB_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); @@ -291,7 +291,7 @@ private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException String filename = GENE_UNIPROT_XREF_FILES.get(speciesConfiguration.getScientificName()); String geneGtfUrl = configuration.getDownload().getGeneUniprotXref().getHost() + "/" + filename; - saveVersionData(EtlCommons.GENE_DATA, UNIPROT_NAME, + saveDataSource(EtlCommons.GENE_DATA, UNIPROT_NAME, configuration.getDownload().getGeneUniprotXref().getVersion(), getTimeStamp(), Collections.singletonList(geneGtfUrl), geneFolder.resolve(UNIPROT_XREF_VERSION_FILENAME)); @@ -306,7 +306,7 @@ private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException { logger.info("Downloading gene expression atlas ..."); String geneGtfUrl = configuration.getDownload().getGeneExpressionAtlas().getHost(); - saveVersionData(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, configuration.getDownload().getGeneExpressionAtlas().getVersion(), + saveDataSource(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, configuration.getDownload().getGeneExpressionAtlas().getVersion(), getTimeStamp(), Collections.singletonList(geneGtfUrl), geneFolder.resolve(GENE_EXPRESSION_ATLAS_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(geneGtfUrl)); @@ -322,7 +322,7 @@ private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOExc configuration.getDownload().getHpo().getHost(), HPO_VERSION_FILENAME, GENE_DATA, HPO_NAME); String url = configuration.getDownload().getDisgenet().getHost(); - 
saveVersionData(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, configuration.getDownload().getDisgenet().getVersion(), + saveDataSource(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, configuration.getDownload().getDisgenet().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DISGINET_VERSION_FILENAME)); Path outputPath = geneFolder.resolve(getUrlFilename(url)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index e7e510fb91..70fbc6f6a1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -17,6 +17,7 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; @@ -27,6 +28,8 @@ import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class OntologyDownloadManager extends AbstractDownloadManager { public OntologyDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) @@ -34,37 +37,34 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec super(species, assembly, targetDirectory, configuration); } - public List download() throws IOException, InterruptedException { - logger.info("Downloading OBO files ..."); + logger.info("Downloading {} files ...", ONTOLOGY_DATA); - List downloadFiles = new ArrayList<>(); - Path oboFolder = downloadFolder.resolve("ontology"); + Path oboFolder = downloadFolder.resolve(ONTOLOGY_FOLDER_NAME); Files.createDirectories(oboFolder); - String url = 
configuration.getDownload().getHpoObo().getHost(); - logger.info("Downloading {} ...", url); - downloadFiles.add(downloadFile(url, oboFolder.resolve("hp.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "HPO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), oboFolder.resolve(EtlCommons.HPO_VERSION_FILE)); + DownloadFile downloadFile; + List downloadFiles = new ArrayList<>(); + + // HPO + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_NAME, ONTOLOGY_DATA, + HPO_OBO_FILE_ID, HPO_OBO_VERSION_FILENAME, oboFolder); + downloadFiles.add(downloadFile); - url = configuration.getDownload().getGoObo().getHost(); - logger.info("Downloading {} ...", url); - downloadFiles.add(downloadFile(url, oboFolder.resolve("go-basic.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "GO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), oboFolder.resolve(EtlCommons.GO_VERSION_FILE)); + // GO + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoObo(), GO_OBO_NAME, ONTOLOGY_DATA, + GO_OBO_FILE_ID, GO_OBO_VERSION_FILENAME, oboFolder); + downloadFiles.add(downloadFile); - url = configuration.getDownload().getDoidObo().getHost(); - logger.info("Downloading {} ...", url); - downloadFiles.add(downloadFile(url, oboFolder.resolve("doid.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), oboFolder.resolve(EtlCommons.DO_VERSION_FILE)); + // DOID + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_NAME, ONTOLOGY_DATA, + DOID_OBO_FILE_ID, DOID_OBO_VERSION_FILENAME, oboFolder); + downloadFiles.add(downloadFile); - url = configuration.getDownload().getMondoObo().getHost(); - logger.info("Downloading {} ...", url); - downloadFiles.add(downloadFile(url, oboFolder.resolve("mondo.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "MONDO", getTimeStamp(), 
getTimeStamp(), - Collections.singletonList(url), oboFolder.resolve(EtlCommons.MONDO_VERSION_FILE)); + // Mondo + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_NAME, ONTOLOGY_DATA, + MONDO_OBO_FILE_ID, MONDO_OBO_VERSION_FILENAME, oboFolder); + downloadFiles.add(downloadFile); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 274f6c62a7..812dcd996a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -30,6 +30,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Map; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -49,7 +50,9 @@ public List download() throws IOException, InterruptedException { List urls = new ArrayList<>(); List downloadFiles = new ArrayList<>(); - for (String url : pharmGKB.getFiles()) { + String host = pharmGKB.getHost(); + for (Map.Entry entry : pharmGKB.getFiles().entrySet()) { + String url = host + entry.getValue(); urls.add(url); Path downloadedFileName = Paths.get(new URL(url).getPath()).getFileName(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java index d7c924afa1..3bc97b1824 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java @@ -26,7 +26,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataRelease; -import 
org.opencb.cellbase.core.models.DataReleaseSource; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.impl.core.CellBaseDBAdaptor; import org.opencb.cellbase.lib.impl.core.ReleaseMongoDBAdaptor; @@ -107,7 +107,7 @@ public DataRelease get(int release) throws CellBaseException { } } } - throw new CellBaseException("Data release '" + release + "' does not exist for species = " + species + ", assembly = " + assembly); + throw new CellBaseException("Data release '" + release + "' does not exist" + getSpeciesAssemblyMessage()); } public DataRelease getDefault(String cellBaseVersion) throws CellBaseException { @@ -119,8 +119,7 @@ public DataRelease getDefault(String cellBaseVersion) throws CellBaseException { } } } - throw new CellBaseException("No data release found for CellBase " + cellBaseVersion + " (species = " + species + ", assembly = " - + assembly + ")"); + throw new CellBaseException("No data release found for CellBase " + cellBaseVersion + getSpeciesAssemblyMessage()); } public DataRelease update(int release, List versions) throws CellBaseException { @@ -136,28 +135,27 @@ public DataRelease update(int release, String collection, String data, List newSources = new ArrayList<>(); + List newSources = new ArrayList<>(); // First, add new data sources Set sourceSet = new HashSet<>(); ObjectMapper jsonObjectMapper = new ObjectMapper(); - ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataReleaseSource.class); + ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataSource.class); for (Path dataSourcePath : dataSourcePaths) { if (dataSourcePath.toFile().exists()) { try { - DataReleaseSource dataReleaseSource = jsonObjectReader.readValue(dataSourcePath.toFile()); - newSources.add(dataReleaseSource); - sourceSet.add(dataReleaseSource.getData() + "__" + dataReleaseSource.getName()); + DataSource dataSource = jsonObjectReader.readValue(dataSourcePath.toFile()); + 
newSources.add(dataSource); + sourceSet.add(dataSource.getCategory() + "__" + dataSource.getName()); } catch (IOException e) { - logger.warn("Something wrong happened when reading data release source " + dataSourcePath + ". " - + e.getMessage()); + logger.warn("Something wrong happened when reading data release source {}: {}", dataSourcePath, e.getMessage()); } } } // Second, add previous data sources if necessary (to avoid duplicated sources) - for (DataReleaseSource source : currDataRelease.getSources()) { - String key = source.getData() + "__" + source.getName(); + for (DataSource source : currDataRelease.getSources()) { + String key = source.getCategory() + "__" + source.getName(); if (!sourceSet.contains(key)) { newSources.add(source); } @@ -173,7 +171,7 @@ public DataRelease update(int release, String collection, String data, List> tmp = new ArrayList<>(); - for (DataReleaseSource source : dataRelase.getSources()) { + for (DataSource source : dataRelase.getSources()) { Map map = new HashMap<>(); - if (StringUtils.isNotEmpty(source.getData())) { - map.put("data", source.getData()); - } if (StringUtils.isNotEmpty(source.getName())) { map.put("name", source.getName()); } + if (StringUtils.isNotEmpty(source.getCategory())) { + map.put("category", source.getCategory()); + } if (StringUtils.isNotEmpty(source.getVersion())) { map.put("version", source.getVersion()); } - if (CollectionUtils.isNotEmpty(source.getUrl())) { - map.put("url", source.getUrl()); + if (StringUtils.isNotEmpty(source.getDownloadDate())) { + map.put("downloadDate", source.getDownloadDate()); + } + if (CollectionUtils.isNotEmpty(source.getUrls())) { + map.put("urls", source.getUrls()); } - if (StringUtils.isNotEmpty(source.getDate())) { - map.put("date", source.getDate()); + if (CollectionUtils.isNotEmpty(source.getNotes())) { + map.put("notes", source.getUrls()); } tmp.add(map); } @@ -224,8 +225,7 @@ public int checkDataRelease(int inRelease) throws CellBaseException { String[] split = 
GitRepositoryState.get().getBuildVersion().split("[.-]"); String version = "v" + split[0] + "." + split[1]; outRelease = getDefault(version).getRelease(); - logger.info("Using data release 0: it means to take default data release '" + outRelease + "' for CellBase version '" - + version + "'"); + logger.warn("Using data release 0: it will take the default data release {} for CellBase version {}", outRelease, version); return outRelease; } @@ -236,8 +236,12 @@ public int checkDataRelease(int inRelease) throws CellBaseException { } } - throw new CellBaseException("Invalid data release " + outRelease + " for species = " + species + ", assembly = " + assembly - + ". Valid data releases are: " + StringUtils.join(dataReleases.stream().map(dr -> dr.getRelease()) + throw new CellBaseException("Invalid data release " + outRelease + getSpeciesAssemblyMessage() + ". Valid data releases are: " + + StringUtils.join(dataReleases.stream().map(dr -> dr.getRelease()) .collect(Collectors.toList()), ",")); } + + private String getSpeciesAssemblyMessage() { + return " (species = " + species + ", assembly = " + assembly + ")"; + } } From a3e9684143cfd18d372cc2faff001b53ee77d308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 11 Apr 2024 11:20:10 +0200 Subject: [PATCH 051/107] lib: update CellBase downloaders according to the DownloadProperties.URLProperties changes, #TASK-5775, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 4 +- .../src/main/resources/configuration.yml | 56 +++++--- .../org/opencb/cellbase/lib/EtlCommons.java | 124 +++++++++++++----- .../lib/download/AbstractDownloadManager.java | 2 - .../lib/download/CaddDownloadManager.java | 8 +- .../lib/download/ClinicalDownloadManager.java | 8 +- .../lib/download/GenomeDownloadManager.java | 70 +++++----- .../MissenseScoresDownloadManager.java | 18 ++- .../lib/download/OntologyDownloadManager.java | 8 +- .../lib/download/PharmGKBDownloadManager.java | 6 +- 
.../lib/download/ProteinDownloadManager.java | 50 +++---- .../lib/download/PubMedDownloadManager.java | 25 ++-- .../download/RegulationDownloadManager.java | 7 +- 13 files changed, 218 insertions(+), 168 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index b1a48dc0f1..8e51ac8b23 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -192,7 +192,7 @@ public void execute() { } private CellBaseBuilder buildRepeats() { - Path repeatsFilesDir = downloadFolder.resolve(EtlCommons.REPEATS_FOLDER); + Path repeatsFilesDir = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILENAME))); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILENAME))); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILENAME))); @@ -349,7 +349,7 @@ private CellBaseBuilder buildConservation() { } private CellBaseBuilder buildClinicalVariants() throws CellBaseException { - Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER_NAME); + Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY); List versionFiles = new ArrayList<>(); List versionFilenames = Arrays.asList(CLINVAR_VERSION_FILENAME, COSMIC_VERSION_FILENAME, GWAS_VERSION_FILENAME, diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 20012d44a1..5052473aa0 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -91,6 +91,8 @@ download: version: "2.0.14" 
goAnnotation: host: http://geneontology.org/gene-associations/goa_human.gaf.gz + + ## Regulation mirbase: host: https://www.mirbase.org/download/miRNA.dat version: "22.1" @@ -102,33 +104,39 @@ download: ## Protein Data uniprot: - host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - version: "2024-01-24" - uniprotRelNotes: - host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt + host: https://ftp.uniprot.org/ version: "2024-01-24" + files: + UNIPROT: pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz interpro: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz - version: "2024-01-24" - interproRelNotes: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt + host: https://ftp.ebi.ac.uk/ version: "2024-01-24" + files: + INTERPRO: pub/databases/interpro/current_release/protein2ipr.dat.gz intact: - host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt + host: https://ftp.ebi.ac.uk/ version: "2024-02-16" + files: + INTACT: pub/databases/intact/current/psimitab/intact.txt ## Conservation Scores phastCons: ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M - host: https://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/phastCons470way/put_assembly_here.470way.phastCons/chrput_chromosome_here.phastCons470way.wigFix.gz + host: https://hgdownload.cse.ucsc.edu/ version: "2022-08-30" + files: + PHASTCONS: goldenPath/put_assembly_here/phastCons470way/put_assembly_here.470way.phastCons/chrput_chromosome_here.phastCons470way.wigFix.gz phylop: ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M - host: https://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/phyloP470way/put_assembly_here.470way.phyloP/chrput_chromosome_here.phyloP470way.wigFix.gz + host: https://hgdownload.cse.ucsc.edu/ version: "2022-08-30" + files: + PHYLOP: goldenPath/put_assembly_here/phyloP470way/put_assembly_here.470way.phyloP/chrput_chromosome_here.phyloP470way.wigFix.gz gerp: - host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + host: http://ftp.ensembl.org/ version: "2023-05-17" + files: + GERP: pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw ## Clinical Variant clinvar: @@ -165,19 +173,27 @@ download: dgv: host: http://dgv.tcag.ca/v106/docs simpleRepeats: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 - host: http://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/database/simpleRepeat.txt.gz + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + SIMPLE_REPEATS: goldenPath/put_assembly_here/database/simpleRepeat.txt.gz windowMasker: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 - host: http://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + WINDOW_MASKER: goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz genomicSuperDups: - ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38 - host: http://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + GENOMIC_SUPER_DUPS: goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz ## Variant Pathogenic Prediction revel: - host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip + host: https://zenodo.org/ version: "1.3" + files: + REVEL: record/7072866/files/revel-v1.3_all_chromosomes.zip cadd: host: https://krishna.gs.washington.edu/ version: "1.7" @@ -228,7 +244,7 @@ download: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ version: 2024 files: - - pubmed24n[1..1219..4].xml.gz + PUBMED_REGEX: pubmed24n[1..1219..4].xml.gz pharmGKB: host: https://api.pharmgkb.org/v1/download/file/data/ version: v1 diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 8c048de1b3..15c93c5101 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -39,9 +39,15 @@ public class EtlCommons { public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; + public static final String GRCH38_NAME = "GRCh38"; + public static final String GRCH37_NAME = "GRCh37"; + public static final String HG38_NAME = "hg38"; + public static final String HG19_NAME = "hg19"; + public static final String SUFFIX_VERSION_FILENAME = "Version.json"; public static final String GENOME_DATA = "genome"; + public static final String GENOME_VERSION_FILENAME = "genome" + SUFFIX_VERSION_FILENAME; public static final String GENE_DATA = "gene"; public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; @@ -64,23 +70,32 @@ public class EtlCommons { public static final String REFSEQ_CDNA_FASTA_VERSION_FILENAME = REFSEQ_DATA 
+ "CdnaFasta" + SUFFIX_VERSION_FILENAME; public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association"; public static final String VARIATION_DATA = "variation"; - public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; - public static final String REGULATION_DATA = "regulation"; - public static final String PROTEIN_DATA = "protein"; public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; + // Pharmacogenomics public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; + public static final String PHARMACOGENOMICS_SUBDIRECTORY = "pharmacogenomics"; + // PharmGKB public static final String PHARMGKB_NAME = "PharmGKB"; public static final String PHARMGKB_DATA = "pharmgkb"; - public static final String PHARMGKB_VERSION_FILENAME = PHARMGKB_DATA + SUFFIX_VERSION_FILENAME; + public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; + public static final String PHARMGKB_VERSION_FILENAME = "pharmgkb" + SUFFIX_VERSION_FILENAME; + + // Missense variantion functional score + public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; + // Revel + public static final String REVEL_NAME = "Revel"; + public static final String REVEL_VERSION_FILENAME = "revel" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String REVEL_FILE_ID = "REVEL"; // Clinical variants data - public static final String CLINICAL_VARIANTS_FOLDER_NAME = "clinicalVariant"; + public static final String CLINICAL_VARIANTS_SUBDIRECTORY = "clinicalVariant"; // ClinVar public static final String CLINVAR_NAME = "ClinVar"; public static final String CLINVAR_VERSION_FILENAME = "clinvar" + SUFFIX_VERSION_FILENAME; - public static final String ClINVAR_CHUNKS_FOLDER_NAME = "clinvar_chunks"; + public static final String CLINVAR_CHUNKS_SUBDIRECTORY = "clinvar_chunks"; // Must match the 
configuration file public static final String CLINVAR_FULL_RELEASE_FILE_ID = "FULL_RELEASE"; public static final String CLINVAR_SUMMARY_FILE_ID = "SUMMARY"; @@ -104,10 +119,25 @@ public class EtlCommons { public static final String STRUCTURAL_VARIANTS_DATA = "svs"; public static final String REPEATS_DATA = "repeats"; + public static final String REPEATS_SUBDIRECTORY = "genome"; + public static final String REPEATS_JSON = "repeats"; + // Simple repeats + @Deprecated + public static final String TRF_FILE = "simpleRepeat.txt.gz"; + public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME; + public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS"; + @Deprecated + public static final String GSD_FILE = "genomicSuperDups.txt.gz"; + public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; + public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS"; + @Deprecated + public static final String WM_FILE = "windowmaskerSdust.txt.gz"; + public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; + public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER"; // Ontology public static final String ONTOLOGY_DATA = "ontology"; - public static final String ONTOLOGY_FOLDER_NAME = "ontology"; + public static final String ONTOLOGY_SUBDIRECTORY = "ontology"; // HPO public static final String HPO_OBO_NAME = "HPO"; public static final String HPO_OBO_VERSION_FILENAME = "hpoObo" + SUFFIX_VERSION_FILENAME; @@ -134,13 +164,26 @@ public class EtlCommons { // Variation functional score public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score"; - public static final String VARIATION_FUNCTIONAL_SCORE_FOLDER_NAME = "variation_functional_score"; + public static final String VARIATION_FUNCTIONAL_SCORE_SUBDIRECTORY = "variation_functional_score"; // CADD scores public static final String CADD_NAME = "CADD"; public static final 
String CADD_VERSION_FILENAME = "cadd" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String CADD_FILE_ID = "CADD"; + // Regulation + public static final String REGULATION_DATA = "regulation"; + public static final String REGULATION_SUBDIRECTORY = "regulation"; + // Regulatory/motif features + public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz"; + public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz"; + // miRBase + public static final String MIRBASE_NAME = "miRBase"; + public static final String MIRBASE_VERSION_FILENAME = "mirbase" + SUFFIX_VERSION_FILENAME; + // miRTarBase + public static final String MIRTARBASE_NAME = "miRTarBase"; + public static final String MIRTARBASE_VERSION_FILENAME = "mirtarbase" + SUFFIX_VERSION_FILENAME; + // Build specific data options public static final String GENOME_INFO_DATA = "genome_info"; public static final String DISGENET_DATA = "disgenet"; @@ -158,23 +201,48 @@ public class EtlCommons { // public static final String IARCTP53_SOMATIC_REFERENCES_FILE = "somaticMutationReferenceIARC TP53 Database, R20.txt"; // public static final String HGMD_DATA = "hgmd"; - public static final String PUBMED_DATA = "pubmed"; - public static final String PUBMED_VERSION_FILE = PUBMED_DATA + SUFFIX_VERSION_FILENAME; - // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; + // Protein + public static final String PROTEIN_DATA = "protein"; + public static final String PROTEIN_SUBDIRECTORY = "protein"; + // UniProt + public static final String UNIPROT_NAME = "UniProt"; + public static final String UNIPROT_CHUNKS_SUBDIRECTORY = "uniprot_chunks"; + public static final String UNIPROT_VERSION_FILENAME = "uniprot" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String UNIPROT_FILE_ID = "UNIPROT"; + // InterPro + public static final String 
INTERPRO_NAME = "InterPro"; + public static final String INTERPRO_VERSION_FILENAME = "interpro" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String INTERPRO_FILE_ID = "INTERPRO"; + // IntAct + public static final String INTACT_NAME = "IntAct"; + public static final String INTACT_VERSION_FILENAME = "intact" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String INTACT_FILE_ID = "INTACT"; + + // Conservation scores public static final String CONSERVATION_DATA = "conservation"; + public static final String CONSERVATION_SUBDIRECTORY = "conservation"; + // GERP public static final String GERP_NAME = "GERP++"; public static final String GERP_SUBDIRECTORY = "gerp"; - public static final String GERP_VERSION_FILENAME = GERP_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; + public static final String GERP_VERSION_FILENAME = "gerp" + SUFFIX_VERSION_FILENAME; + public static final String GERP_FILE_ID = "GERP"; + // PHASTCONS public static final String PHASTCONS_NAME = "PhastCons"; public static final String PHASTCONS_SUBDIRECTORY = "phastCons"; - public static final String PHASTCONS_VERSION_FILENAME = PHASTCONS_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; + public static final String PHASTCONS_VERSION_FILENAME = "phastCons" + SUFFIX_VERSION_FILENAME; + public static final String PHASTCONS_FILE_ID = "PHASTCONS"; + // PHYLOP public static final String PHYLOP_NAME = "PhyloP"; public static final String PHYLOP_SUBDIRECTORY = "phylop"; - public static final String PHYLOP_VERSION_FILENAME = PHYLOP_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; + public static final String PHYLOP_VERSION_FILENAME = "phylop" + SUFFIX_VERSION_FILENAME; + public static final String PHYLOP_FILE_ID = "PHYLOP"; // Splice scores public static final String MMSPLICE_SUBDIRECTORY = "mmsplice"; @@ -196,19 +264,6 @@ public class EtlCommons { public static final String DGV_VERSION_FILE = "dgvVersion.json"; public static final String 
STRUCTURAL_VARIANTS_JSON = "structuralVariants"; - @Deprecated - public static final String TRF_FILE = "simpleRepeat.txt.gz"; - @Deprecated - public static final String GSD_FILE = "genomicSuperDups.txt.gz"; - @Deprecated - public static final String WM_FILE = "windowmaskerSdust.txt.gz"; - - public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME; - public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; - public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; - public static final String REPEATS_FOLDER = "genome"; - public static final String REPEATS_JSON = "repeats"; - public static final String OBO_JSON = "ontology"; public static final String HPO_VERSION_FILE = "hpo" + SUFFIX_VERSION_FILENAME; public static final String GO_VERSION_FILE = "go" + SUFFIX_VERSION_FILENAME; @@ -216,16 +271,13 @@ public class EtlCommons { public static final String MONDO_VERSION_FILE = "mondo" + SUFFIX_VERSION_FILENAME; public static final String HGMD_FILE = "hgmd.vcf"; - public static final String PUBMED_VERSION_FILENAME = "pubmedVersion.json"; - - public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz"; - public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz"; - - public static final String MIRBASE_NAME = "miRBase"; - public static final String MIRBASE_VERSION_FILENAME = MIRBASE_NAME + SUFFIX_VERSION_FILENAME; - public static final String MIRTARBASE_NAME = "miRTarBase"; - public static final String MIRTARBASE_VERSION_FILENAME = MIRTARBASE_NAME + SUFFIX_VERSION_FILENAME; + // PubMed + public static final String PUBMED_NAME = "PubMed"; + public static final String PUBMED_DATA = "pubmed"; + public static final String PUBMED_SUBDIRECTORY = "pubmed"; + public static final String PUBMED_VERSION_FILENAME = "pubmed" + SUFFIX_VERSION_FILENAME; + public static final String PUBMED_REGEX_FILE_ID = "PUBMED"; public static 
boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) throws IOException, InterruptedException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index dcbd93a684..946d868721 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -47,8 +47,6 @@ import java.time.LocalDateTime; import java.util.*; -import static org.opencb.cellbase.lib.EtlCommons.HPO_OBO_FILE_ID; - public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} to {} ..."; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index 572588b2d2..6743ed8a06 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -17,9 +17,7 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; @@ -42,10 +40,10 @@ public List download() throws IOException, InterruptedException { return null; } if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} files ...", CADD_NAME); - - Path variationFunctionalScoreFolder = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_FOLDER_NAME); + Path variationFunctionalScoreFolder = 
downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_SUBDIRECTORY); Files.createDirectories(variationFunctionalScoreFolder); + logger.info("Downloading {} files at {} ...", CADD_NAME, variationFunctionalScoreFolder); + // Download CADD and save data source DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_NAME, diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index b1eb9e7192..bb6f53e32d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -51,11 +51,9 @@ public List download() throws IOException, InterruptedException { public List downloadClinical() throws IOException, InterruptedException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - - logger.info("Downloading clinical information ..."); - - Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER_NAME).toAbsolutePath(); + Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY).toAbsolutePath(); Files.createDirectories(clinicalFolder); + logger.info("Downloading clinical information at {} ...", clinicalFolder); String url; List urls; @@ -103,7 +101,7 @@ public List downloadClinical() throws IOException, InterruptedExce clinicalFolder.resolve(CLINVAR_VERSION_FILENAME)); // Prepare CliVar chunk files - Path chunksPath = clinicalFolder.resolve(ClINVAR_CHUNKS_FOLDER_NAME); + Path chunksPath = clinicalFolder.resolve(CLINVAR_CHUNKS_SUBDIRECTORY); if (Files.notExists(chunksPath)) { Files.createDirectories(chunksPath); Path clinvarPath = clinicalFolder.resolve(getUrlFilename( diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 99e22561ad..210271668f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -81,9 +81,9 @@ public List downloadReferenceGenome() throws IOException, Interrup String outputFileName = StringUtils.capitalize(speciesShortName) + "." + assemblyConfiguration.getName() + ".fa.gz"; Path outputPath = sequenceFolder.resolve(outputFileName); - logger.info("Saving reference genome version data at {}", sequenceFolder.resolve("genomeVersion.json")); - saveVersionData(EtlCommons.GENOME_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), - Collections.singletonList(url), sequenceFolder.resolve("genomeVersion.json")); + logger.info("Saving reference genome version data at {}", sequenceFolder.resolve(GENOME_VERSION_FILENAME)); + saveDataSource(ENSEMBL_NAME, EtlCommons.GENOME_DATA, ensemblVersion, getTimeStamp(), + Collections.singletonList(url), sequenceFolder.resolve(GENOME_VERSION_FILENAME)); List downloadFiles = Collections.singletonList(downloadFile(url, outputPath.toString())); logger.info("Unzipping file: {}", outputFileName); EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(outputPath.toString()), null); @@ -101,7 +101,7 @@ public List downloadConservation() throws IOException, Interrupted return Collections.emptyList(); } logger.info("Downloading conservation information ..."); - Path conservationFolder = downloadFolder.resolve("conservation"); + Path conservationFolder = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); List downloadFiles = new ArrayList<>(); if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { Files.createDirectories(conservationFolder); @@ -112,17 +112,18 @@ public List downloadConservation() throws IOException, Interrupted String[] chromosomes = {"1", "2", "3", "4", "5", 
"6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M", }; - if (assemblyConfiguration.getName().equalsIgnoreCase("GRCh38")) { + if (assemblyConfiguration.getName().equalsIgnoreCase(GRCH38_NAME)) { String filename; Path outputPath; - String assembly = "hg38"; + String assembly = HG38_NAME; List phastconsUrls = new ArrayList<>(chromosomes.length); List phyloPUrls = new ArrayList<>(chromosomes.length); // Downloading PhastCons and PhyloP logger.info("Downloading {} and {}", PHASTCONS_NAME, PHYLOP_NAME); for (String chromosome : chromosomes) { // PhastCons - String phastConsUrl = configuration.getDownload().getPhastCons().getHost().replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) + String phastConsUrl = configuration.getDownload().getPhastCons().getHost() + configuration.getDownload().getPhastCons() + .getFiles().get(PHASTCONS_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phastConsUrl).getFileName().toString(); outputPath = conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve(filename); @@ -131,7 +132,8 @@ public List downloadConservation() throws IOException, Interrupted phastconsUrls.add(phastConsUrl); // PhyloP - String phyloPUrl = configuration.getDownload().getPhylop().getHost().replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) + String phyloPUrl = configuration.getDownload().getPhylop().getHost() + configuration.getDownload().getPhylop() + .getFiles().get(PHYLOP_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phyloPUrl).getFileName().toString(); outputPath = conservationFolder.resolve(PHYLOP_SUBDIRECTORY).resolve(filename); @@ -142,26 +144,27 @@ public List downloadConservation() throws IOException, Interrupted // Downloading Gerp logger.info("Downloading {}", GERP_NAME); - String gerpUrl = configuration.getDownload().getGerp().getHost(); + String gerpUrl = 
configuration.getDownload().getGerp().getHost() + configuration.getDownload().getGerp().getFiles() + .get(GERP_FILE_ID); filename = Paths.get(gerpUrl).getFileName().toString(); outputPath = conservationFolder.resolve(GERP_SUBDIRECTORY).resolve(filename); logger.info("Downloading from {} to {}", gerpUrl, outputPath); downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); // Save data version - saveVersionData(EtlCommons.CONSERVATION_DATA, PHASTCONS_NAME, configuration.getDownload().getPhastCons().getVersion(), + saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); - saveVersionData(EtlCommons.CONSERVATION_DATA, PHYLOP_NAME, configuration.getDownload().getPhylop().getVersion(), + saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); - saveVersionData(EtlCommons.CONSERVATION_DATA, GERP_NAME, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), + saveDataSource(GERP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), Collections.singletonList(gerpUrl), conservationFolder.resolve(GERP_VERSION_FILENAME)); } } if (speciesConfiguration.getScientificName().equals("Mus musculus")) { Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve("phastCons")); - Files.createDirectories(conservationFolder.resolve("phylop")); + Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); + Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); String url = configuration.getDownload().getConservation().getHost() + "/mm10"; String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", @@ -170,18 +173,18 @@ 
public List downloadConservation() throws IOException, Interrupted List phyloPUrls = new ArrayList<>(chromosomes.length); for (String chromosome : chromosomes) { String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; - downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve("phastCons").resolve("chr" + chromosome + downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome + ".phastCons60way.wigFix.gz").toString())); phastconsUrls.add(phastConsUrl); String phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; - downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve("phylop").resolve("chr" + chromosome + downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome + ".phyloP60way.wigFix.gz").toString())); phyloPUrls.add(phyloPUrl); } - saveVersionData(EtlCommons.CONSERVATION_DATA, PHASTCONS_NAME, null, getTimeStamp(), phastconsUrls, - conservationFolder.resolve("phastConsVersion.json")); - saveVersionData(EtlCommons.CONSERVATION_DATA, PHYLOP_NAME, null, getTimeStamp(), phyloPUrls, - conservationFolder.resolve("phastConsVersion.json")); + saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), + getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); + saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), + getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); } return downloadFiles; } @@ -192,21 +195,22 @@ public List downloadRepeats() throws IOException, InterruptedExcep } if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading repeats data ..."); - Path repeatsFolder = 
downloadFolder.resolve(EtlCommons.REPEATS_FOLDER); + Path repeatsFolder = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); Files.createDirectories(repeatsFolder); List downloadFiles = new ArrayList<>(); String pathParam; - if (assemblyConfiguration.getName().equalsIgnoreCase("grch38")) { - pathParam = "hg38"; + if (assemblyConfiguration.getName().equalsIgnoreCase(GRCH38_NAME)) { + pathParam = HG38_NAME; } else { - logger.error("Please provide a valid human assembly {GRCh37, GRCh38)"); + logger.error("Please provide a valid human assembly: {}, {}", GRCH37_NAME, GRCH38_NAME); throw new ParameterException("Assembly '" + assemblyConfiguration.getName() + "' is not valid. Please provide " - + "a valid human assembly {GRCh37, GRCh38)"); + + "a valid human assembly: " + GRCH37_NAME + ", " + GRCH38_NAME); } // Download tandem repeat finder - String url = configuration.getDownload().getSimpleRepeats().getHost().replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveVersionData(EtlCommons.REPEATS_DATA, TRF_NAME, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), + String url = configuration.getDownload().getSimpleRepeats().getHost() + configuration.getDownload().getSimpleRepeats() + .getFiles().get(SIMPLE_REPEATS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); + saveDataSource(TRF_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILENAME)); Path outputPath = repeatsFolder.resolve(getUrlFilename(url)); @@ -214,8 +218,9 @@ public List downloadRepeats() throws IOException, InterruptedExcep downloadFiles.add(downloadFile(url, outputPath.toString())); // Download genomic super duplications - url = configuration.getDownload().getGenomicSuperDups().getHost().replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveVersionData(EtlCommons.REPEATS_DATA, GSD_NAME, 
configuration.getDownload().getGenomicSuperDups().getVersion(), + url = configuration.getDownload().getGenomicSuperDups().getHost() + configuration.getDownload().getGenomicSuperDups() + .getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); + saveDataSource(GSD_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILENAME)); outputPath = repeatsFolder.resolve(getUrlFilename(url)); @@ -223,9 +228,10 @@ public List downloadRepeats() throws IOException, InterruptedExcep downloadFiles.add(downloadFile(url, outputPath.toString())); // Download WindowMasker - if (!pathParam.equalsIgnoreCase("hg19")) { - url = configuration.getDownload().getWindowMasker().getHost().replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveVersionData(EtlCommons.REPEATS_DATA, WM_NAME, configuration.getDownload().getWindowMasker().getVersion(), + if (!pathParam.equalsIgnoreCase(HG19_NAME)) { + url = configuration.getDownload().getWindowMasker().getHost() + configuration.getDownload().getWindowMasker().getFiles() + .get(WINDOW_MASKER_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); + saveDataSource(WM_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.WM_VERSION_FILENAME)); outputPath = repeatsFolder.resolve(getUrlFilename(url)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index 1ae2514e49..0dba31ed78 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -26,6 +26,8 @@ import java.util.Collections; 
import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class MissenseScoresDownloadManager extends AbstractDownloadManager { public MissenseScoresDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) @@ -39,17 +41,13 @@ public List download() throws IOException, InterruptedException { } public DownloadFile downloadRevel() throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading Revel data ..."); - - Path missensePredictionScore = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - Files.createDirectories(missensePredictionScore); - - String url = configuration.getDownload().getRevel().getHost(); + if (speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { + Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); + Files.createDirectories(missensePredictionScorePath); + logger.info("Downloading Revel data at {} ...", missensePredictionScorePath); - saveVersionData(EtlCommons.MISSENSE_VARIATION_SCORE_DATA, "Revel", null, getTimeStamp(), - Collections.singletonList(url), missensePredictionScore.resolve("revelVersion.json")); - return downloadFile(url, missensePredictionScore.resolve("revel_grch38_all_chromosomes.csv.zip").toString()); + return downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_NAME, + MISSENSE_VARIATION_SCORE_DATA, REVEL_FILE_ID, REVEL_VERSION_FILENAME, missensePredictionScorePath); } return null; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 70fbc6f6a1..7e730a8b0a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -17,15 +17,12 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -38,10 +35,9 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec } public List download() throws IOException, InterruptedException { - logger.info("Downloading {} files ...", ONTOLOGY_DATA); - - Path oboFolder = downloadFolder.resolve(ONTOLOGY_FOLDER_NAME); + Path oboFolder = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); Files.createDirectories(oboFolder); + logger.info("Downloading {} files {} ...", ONTOLOGY_DATA, oboFolder); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 812dcd996a..04e72d3247 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -43,10 +43,10 @@ public PharmGKBDownloadManager(String species, String assembly, Path targetDirec @Override public List download() throws IOException, InterruptedException { - logger.info("Downloading PharmGKB files..."); DownloadProperties.URLProperties pharmGKB = configuration.getDownload().getPharmGKB(); - Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + Path pharmgkbDownloadFolder = 
downloadFolder.resolve(PHARMACOGENOMICS_SUBDIRECTORY).resolve(PHARMGKB_SUBDIRECTORY); Files.createDirectories(pharmgkbDownloadFolder); + logger.info("Downloading {} files at {} ...", PHARMGKB_DATA, pharmgkbDownloadFolder); List urls = new ArrayList<>(); List downloadFiles = new ArrayList<>(); @@ -67,7 +67,7 @@ public List download() throws IOException, InterruptedException { } // Save versions - saveVersionData(PHARMACOGENOMICS_DATA, PHARMGKB_NAME, pharmGKB.getVersion(), getTimeStamp(), urls, + saveDataSource(PHARMGKB_NAME, PHARMACOGENOMICS_DATA, pharmGKB.getVersion(), getTimeStamp(), urls, pharmgkbDownloadFolder.resolve(PHARMGKB_VERSION_FILENAME)); return downloadFiles; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 5a722ed448..9ebf9aa2b2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -18,7 +18,6 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; @@ -27,14 +26,11 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -public class ProteinDownloadManager extends AbstractDownloadManager { +import static org.opencb.cellbase.lib.EtlCommons.*; - private static final String UNIPROT_NAME = "UniProt"; - private static final String INTERPRO_NAME = "InterPro"; - private static final String INTACT_NAME = "IntAct"; +public class ProteinDownloadManager extends AbstractDownloadManager { public ProteinDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) 
throws IOException, CellBaseException { @@ -49,39 +45,35 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * @throws InterruptedException if there is an error downloading files * */ public List download() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "protein")) { + if (!speciesHasInfoToDownload(speciesConfiguration, PROTEIN_DATA)) { return null; } - logger.info("Downloading protein information ..."); - Path proteinFolder = downloadFolder.resolve("protein"); + Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); Files.createDirectories(proteinFolder); + logger.info("Downloading protein information at {} ..."); + + DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Uniprot - String url = configuration.getDownload().getUniprot().getHost(); - downloadFiles.add(downloadFile(url, proteinFolder.resolve("uniprot_sprot.xml.gz").toString())); - Files.createDirectories(proteinFolder.resolve("uniprot_chunks")); - splitUniprot(proteinFolder.resolve("uniprot_sprot.xml.gz"), proteinFolder.resolve("uniprot_chunks")); - - String relNotesUrl = configuration.getDownload().getUniprotRelNotes().getHost(); - downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("uniprotRelnotes.txt").toString())); - saveVersionData(EtlCommons.PROTEIN_DATA, UNIPROT_NAME, getLine(proteinFolder.resolve("uniprotRelnotes.txt"), 1), - getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("uniprotVersion.json")); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_NAME, PROTEIN_DATA, UNIPROT_FILE_ID, + UNIPROT_VERSION_FILENAME, proteinFolder); + Path chunksPath = proteinFolder.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); + String uniprotFilename = getUrlFilename(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); + logger.info("Split UniProt file {} into chunks at {}", uniprotFilename, chunksPath); + 
Files.createDirectories(chunksPath); + splitUniprot(proteinFolder.resolve(uniprotFilename), chunksPath); + downloadFiles.add(downloadFile); // Interpro - String interproUrl = configuration.getDownload().getInterpro().getHost(); - downloadFiles.add(downloadFile(interproUrl, proteinFolder.resolve("protein2ipr.dat.gz").toString())); - - relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost(); - downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString())); - saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5), - getTimeStamp(), Collections.singletonList(interproUrl), proteinFolder.resolve("interproVersion.json")); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_NAME, PROTEIN_DATA, INTERPRO_FILE_ID, + INTERPRO_VERSION_FILENAME, proteinFolder); + downloadFiles.add(downloadFile); // Intact - String intactUrl = configuration.getDownload().getIntact().getHost(); - downloadFiles.add(downloadFile(intactUrl, proteinFolder.resolve("intact.txt").toString())); - saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, configuration.getDownload().getIntact().getVersion(), - getTimeStamp(), Collections.singletonList(intactUrl), proteinFolder.resolve("intactVersion.json")); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_NAME, PROTEIN_DATA, INTACT_FILE_ID, + INTACT_VERSION_FILENAME, proteinFolder); + downloadFiles.add(downloadFile); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index e913539d5b..e5a8c78f26 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -27,11 +27,8 @@ 
import java.util.Collections; import java.util.List; -import static org.opencb.cellbase.lib.EtlCommons.PUBMED_VERSION_FILE; - public class PubMedDownloadManager extends AbstractDownloadManager { - private static final String PUBMED_NAME = "PubMed"; public PubMedDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); @@ -39,29 +36,29 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto @Override public List download() throws IOException, InterruptedException { - logger.info("Downloading PubMed XML files..."); - - Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); + Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_SUBDIRECTORY); Files.createDirectories(pubmedFolder); + logger.info("Downloading {} files at {} ...", EtlCommons.PUBMED_DATA, pubmedFolder); // Downloads PubMed XML files String url = configuration.getDownload().getPubmed().getHost(); - String regexp = configuration.getDownload().getPubmed().getFiles().get(0); + String regexp = configuration.getDownload().getPubmed().getFiles().get(EtlCommons.PUBMED_REGEX_FILE_ID); String[] name = regexp.split("[\\[\\]]"); String[] split = name[1].split("\\.\\."); int start = Integer.parseInt(split[0]); int end = Integer.parseInt(split[1]); int padding = Integer.parseInt(split[2]); - saveVersionData(EtlCommons.PUBMED_DATA, PUBMED_NAME, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), - Collections.singletonList(url), pubmedFolder.resolve(PUBMED_VERSION_FILE)); - - List list = new ArrayList<>(); + List downloadFiles = new ArrayList<>(); for (int i = start; i <= end; i++) { String filename = name[0] + String.format("%0" + padding + "d", i) + name[2]; - logger.info("\tDownloading file {}", filename); - list.add(downloadFile(url + "/" + filename, pubmedFolder.resolve(filename).toString())); + 
logger.info("\tDownloading from {} to {} ", url + "/" + filename, pubmedFolder.resolve(filename)); + downloadFiles.add(downloadFile(url + "/" + filename, pubmedFolder.resolve(filename).toString())); } - return list; + + saveDataSource(EtlCommons.PUBMED_NAME, EtlCommons.PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), + Collections.singletonList(url), pubmedFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME)); + + return downloadFiles; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 8b0cf01abb..546bb2dc7e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -52,13 +52,12 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C @Override public List download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { - if (!speciesHasInfoToDownload(speciesConfiguration, "regulation")) { + if (!speciesHasInfoToDownload(speciesConfiguration, REGULATION_DATA)) { return Collections.emptyList(); } - this.regulationFolder = downloadFolder.resolve("regulation"); + regulationFolder = downloadFolder.resolve(REGULATION_SUBDIRECTORY); Files.createDirectories(regulationFolder); - - logger.info("Downloading regulation information ..."); + logger.info("Downloading {} files at {} ...", REGULATION_DATA, regulationFolder); List downloadFiles = new ArrayList<>(); From c7ad55d8d2c113dd0d4b2c60dd237b4085a118fb Mon Sep 17 00:00:00 2001 From: imedina Date: Thu, 11 Apr 2024 11:27:59 +0100 Subject: [PATCH 052/107] Rename get file name method --- .../lib/download/AbstractDownloadManager.java | 11 +++++----- .../lib/download/ClinicalDownloadManager.java | 4 ++-- 
.../lib/download/GeneDownloadManager.java | 20 +++++++++---------- .../lib/download/GenomeDownloadManager.java | 6 +++--- .../lib/download/ProteinDownloadManager.java | 2 +- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 946d868721..c87e2a9512 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -138,11 +138,10 @@ protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) } protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String name, String category, String fileId, - String versionFilename, Path outPath) - throws IOException, InterruptedException { + String versionFilename, Path outPath) throws IOException, InterruptedException { logger.info("Downloading {} ({}) file ...", name, category); String url = props.getHost() + props.getFiles().get(fileId); - File outFile = outPath.resolve(getUrlFilename(url)).toFile(); + File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); DownloadFile downloadFile = downloadFile(url, outPath.toString()); @@ -270,12 +269,12 @@ private boolean validateDownloadFile(DownloadFile downloadFile, String outputFil private long getExpectedFileSize(String outputFileLog) { try (BufferedReader reader = new BufferedReader(new FileReader(outputFileLog))) { - String line = null; + String line; while ((line = reader.readLine()) != null) { // looking for: Length: 13846591 (13M) if (line.startsWith("Length:")) { String[] parts = line.split("\\s"); - return Long.valueOf(parts[1]); + return Long.parseLong(parts[1]); } } } catch (Exception e) { @@ -294,7 +293,7 @@ private String 
getEnsemblURL(SpeciesConfiguration sp) { } } - protected String getUrlFilename(String url) { + protected String getFilenameFromUrl(String url) { return Paths.get(url).getFileName().toString(); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index bb6f53e32d..a274df11a4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -91,7 +91,7 @@ public List downloadClinical() throws IOException, InterruptedExce for (String fileId : Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, CLINVAR_EFO_TERMS_FILE_ID)) { url = props.getHost() + props.getFiles().get(fileId); - outPath = clinicalFolder.resolve(getUrlFilename(url)); + outPath = clinicalFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outPath); downloadFiles.add(downloadFile(url, outPath.toString())); urls.add(url); @@ -104,7 +104,7 @@ public List downloadClinical() throws IOException, InterruptedExce Path chunksPath = clinicalFolder.resolve(CLINVAR_CHUNKS_SUBDIRECTORY); if (Files.notExists(chunksPath)) { Files.createDirectories(chunksPath); - Path clinvarPath = clinicalFolder.resolve(getUrlFilename( + Path clinvarPath = clinicalFolder.resolve(getFilenameFromUrl( props.getHost() + props.getFiles().get(CLINVAR_FULL_RELEASE_FILE_ID))); logger.info("Splitting {} in {} ...", clinvarPath, chunksPath); splitClinvar(clinvarPath, chunksPath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index d66f149c04..843bc360e3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -177,7 +177,7 @@ private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLPrope String versionFilename, Path refSeqFolder) throws IOException, InterruptedException { String url = urlProperties.getHost(); String version = urlProperties.getVersion(); - String filename = getUrlFilename(url); + String filename = getFilenameFromUrl(url); Path outputPath = refSeqFolder.resolve(filename); saveDataSource(EtlCommons.REFSEQ_DATA, name, version, timeStamp, Collections.singletonList(url), refSeqFolder.resolve(versionFilename)); @@ -193,7 +193,7 @@ private DownloadFile downloadMane(Path geneFolder) throws IOException, Interrupt saveDataSource(EtlCommons.GENE_DATA, MANE_SELECT_NAME, configuration.getDownload().getManeSelect().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(MANE_SELECT_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -207,7 +207,7 @@ private DownloadFile downloadLrg(Path geneFolder) throws IOException, Interrupte saveDataSource(EtlCommons.GENE_DATA, LRG_NAME, configuration.getDownload().getLrg().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(LRG_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -221,7 +221,7 @@ private DownloadFile downloadHgnc(Path geneFolder) throws IOException, Interrupt saveDataSource(GENE_DATA, HGNC_GENE_NAME, configuration.getDownload().getHgnc().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(HGNC_VERSION_FILENAME)); - Path 
outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -235,7 +235,7 @@ private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, saveDataSource(EtlCommons.GENE_DATA, CANCER_HOTSPOT_NAME, configuration.getDownload().getHgnc().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(CANCER_HOTSPOT_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -249,7 +249,7 @@ private DownloadFile downloadGO(Path geneFolder) throws IOException, Interrupted saveDataSource(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, configuration.getDownload().getGoAnnotation().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GO_ANNOTATION_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -263,7 +263,7 @@ private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOExcepti saveDataSource(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload().getGnomadConstraints().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GNOMAD_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -277,7 +277,7 @@ private DownloadFile downloadDrugData(Path geneFolder) throws IOException, Inter saveDataSource(EtlCommons.GENE_DATA, 
DGIDB_NAME, configuration.getDownload().getDgidb().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DGIDB_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } @@ -309,7 +309,7 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOExcep saveDataSource(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, configuration.getDownload().getGeneExpressionAtlas().getVersion(), getTimeStamp(), Collections.singletonList(geneGtfUrl), geneFolder.resolve(GENE_EXPRESSION_ATLAS_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(geneGtfUrl)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(geneGtfUrl)); logger.info(DOWNLOADING_LOG_MESSAGE, geneGtfUrl, outputPath); return downloadFile(geneGtfUrl, outputPath.toString()); } @@ -325,7 +325,7 @@ private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOExc saveDataSource(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, configuration.getDownload().getDisgenet().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DISGINET_VERSION_FILENAME)); - Path outputPath = geneFolder.resolve(getUrlFilename(url)); + Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); return downloadFile(url, outputPath.toString()); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 210271668f..bbd25cf8f7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -213,7 +213,7 @@ public List 
downloadRepeats() throws IOException, InterruptedExcep saveDataSource(TRF_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILENAME)); - Path outputPath = repeatsFolder.resolve(getUrlFilename(url)); + Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); @@ -223,7 +223,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep saveDataSource(GSD_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILENAME)); - outputPath = repeatsFolder.resolve(getUrlFilename(url)); + outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); @@ -234,7 +234,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep saveDataSource(WM_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.WM_VERSION_FILENAME)); - outputPath = repeatsFolder.resolve(getUrlFilename(url)); + outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 9ebf9aa2b2..799bc92aad 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -59,7 +59,7 @@ public List download() throws IOException, InterruptedException { downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_NAME, PROTEIN_DATA, UNIPROT_FILE_ID, UNIPROT_VERSION_FILENAME, proteinFolder); Path chunksPath = proteinFolder.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); - String uniprotFilename = getUrlFilename(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); + String uniprotFilename = getFilenameFromUrl(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); logger.info("Split UniProt file {} into chunks at {}", uniprotFilename, chunksPath); Files.createDirectories(chunksPath); splitUniprot(proteinFolder.resolve(uniprotFilename), chunksPath); From e92b67680200bfdf0cb744809855e32e11112f88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 11 Apr 2024 18:02:23 +0200 Subject: [PATCH 053/107] lib: update CellBase downloaders according to the DownloadProperties.URLProperties changes, #TASK-5775, #TASK-5564 --- .../app/cli/admin/AdminCliOptionsParser.java | 8 +- .../cellbase/app/cli/admin/AdminMain.java | 1 - .../admin/executors/BuildCommandExecutor.java | 4 +- .../executors/DownloadCommandExecutor.java | 101 +-- .../executors/ExportCommandExecutor.java | 6 +- .../admin/executors/LoadCommandExecutor.java | 8 +- .../core/exception/CellBaseException.java | 3 + .../src/main/resources/configuration.yml | 14 +- .../org/opencb/cellbase/lib/EtlCommons.java | 29 +- .../lib/builders/OntologyBuilder.java | 10 +- .../builders/RegulatoryFeatureBuilder.java | 11 +- .../lib/builders/RegulatoryRegionBuilder.java | 607 ------------------ .../lib/download/GenomeDownloadManager.java | 8 - .../download/RegulationDownloadManager.java | 180 +++--- .../lib/managers/DataReleaseManager.java | 3 - 15 files changed, 207 insertions(+), 786 deletions(-) delete mode 
100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 6049ef9b4b..55342641b3 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -88,11 +88,13 @@ public class DownloadCommandOptions { public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, " - + "variation, variation_functional_score, regulation, protein, conservation, " - + "clinical_variants, repeats, svs, pubmed and 'all' to download everything", required = true, arity = 1) + + "variation_functional_score, missense_variation_functional_score, regulation, protein, conservation, " + + "clinical_variants, repeats, ontology, pubmed and pharmacogenomics; or use 'all' to download everything", + required = true, arity = 1) public String data; - @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1) + @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, + arity = 1) public String outputDirectory; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java index 10c43d637c..d77722a492 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java @@ -103,5 +103,4 @@ public static void main(String[] args) { } } } - } diff --git 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 8e51ac8b23..71b20e8b5a 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -157,7 +157,7 @@ public void execute() { case EtlCommons.REPEATS_DATA: parser = buildRepeats(); break; - case EtlCommons.OBO_DATA: + case ONTOLOGY_DATA: parser = buildObo(); break; case EtlCommons.SPLICE_SCORE_DATA: @@ -202,7 +202,7 @@ private CellBaseBuilder buildRepeats() { } private CellBaseBuilder buildObo() { - Path oboDir = downloadFolder.resolve(EtlCommons.OBO_DATA); + Path oboDir = downloadFolder.resolve(ONTOLOGY_DATA); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.OBO_JSON); return new OntologyBuilder(oboDir, serializer); } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index f8197e6558..f8d3e04eb9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -16,15 +16,11 @@ package org.opencb.cellbase.app.cli.admin.executors; -import com.beust.jcommander.ParameterException; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; -import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import 
org.opencb.cellbase.core.utils.SpeciesUtils; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.download.AbstractDownloadManager; import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.cellbase.lib.download.Downloader; @@ -36,6 +32,8 @@ import java.util.Arrays; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 03/02/15. */ @@ -44,6 +42,10 @@ public class DownloadCommandExecutor extends CommandExecutor { private AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions; private Path outputDirectory; + private static final List VALID_SOURCES_TO_DOWNLOAD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, + MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANTS_DATA, REPEATS_DATA, + ONTOLOGY_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); + public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions) { super(downloadCommandOptions.commonOptions.logLevel, downloadCommandOptions.commonOptions.conf); @@ -52,88 +54,95 @@ public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions down } /** - * Execute specific 'download' command options. + * Process CellBase command 'download'. 
+ * + * @throws CellBaseException Exception */ - public void execute() { + public void execute() throws CellBaseException { try { String species = downloadCommandOptions.speciesAndAssemblyOptions.species; String assembly = downloadCommandOptions.speciesAndAssemblyOptions.assembly; List downloadFiles = new ArrayList<>(); - List dataList = getDataList(species); + List dataList = checkDataSources(); Downloader downloader = new Downloader(species, assembly, outputDirectory, configuration); for (String data : dataList) { switch (data) { - case EtlCommons.GENOME_DATA: + case GENOME_DATA: downloadFiles.addAll(downloader.downloadGenome()); break; - case EtlCommons.GENE_DATA: + case GENE_DATA: downloadFiles.addAll(downloader.downloadGene()); break; -// case EtlCommons.VARIATION_DATA: -// downloadManager.downloadVariation(); -// break; - case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: + case VARIATION_FUNCTIONAL_SCORE_DATA: downloadFiles.addAll(downloader.downloadCaddScores()); break; - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: + case MISSENSE_VARIATION_SCORE_DATA: downloadFiles.addAll(downloader.downloadPredictionScores()); break; - case EtlCommons.REGULATION_DATA: + case REGULATION_DATA: downloadFiles.addAll(downloader.downloadRegulation()); break; - case EtlCommons.PROTEIN_DATA: + case PROTEIN_DATA: downloadFiles.addAll(downloader.downloadProtein()); break; - case EtlCommons.CONSERVATION_DATA: + case CONSERVATION_DATA: downloadFiles.addAll(downloader.downloadConservation()); break; - case EtlCommons.CLINICAL_VARIANTS_DATA: + case CLINICAL_VARIANTS_DATA: downloadFiles.addAll(downloader.downloadClinicalVariants()); break; -// case EtlCommons.STRUCTURAL_VARIANTS_DATA: -// downloadFiles.add(downloadManager.downloadStructuralVariants()); -// break; - case EtlCommons.REPEATS_DATA: + case REPEATS_DATA: downloadFiles.addAll(downloader.downloadRepeats()); break; - case EtlCommons.OBO_DATA: + case ONTOLOGY_DATA: downloadFiles.addAll(downloader.downloadOntologies()); break; - 
case EtlCommons.PUBMED_DATA: + case PUBMED_DATA: downloadFiles.addAll(downloader.downloadPubMed()); break; - case EtlCommons.PHARMACOGENOMICS_DATA: + case PHARMACOGENOMICS_DATA: downloadFiles.addAll(downloader.downloadPharmKGB()); break; default: - System.out.println("Value \"" + data + "\" is not allowed for the data parameter. Allowed values" - + " are: {genome, gene, gene_disease_association, variation, variation_functional_score," - + " regulation, protein, conservation, clinical_variants, ontology, pubmed}"); - break; + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); } } AbstractDownloadManager.writeDownloadLogFile(outputDirectory, downloadFiles); - } catch (ParameterException | IOException | CellBaseException | InterruptedException | NoSuchMethodException - | FileFormatException e) { - logger.error("Error in 'download' command line: " + e.getMessage()); + } catch (IOException | NoSuchMethodException | FileFormatException e) { + throw new CellBaseException("Error executing command line 'download'", e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing command line 'download'", e); } } - private List getDataList(String species) throws CellBaseException { - if (StringUtils.isEmpty(downloadCommandOptions.data) || downloadCommandOptions.data.equals("all")) { - return SpeciesUtils.getSpeciesConfiguration(configuration, species).getData(); - } else { - return Arrays.asList(downloadCommandOptions.data.split(",")); + private List checkDataSources() { + if (StringUtils.isEmpty(downloadCommandOptions.data)) { + throw new IllegalArgumentException("Missing data parameter. 
Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); } - } - - @Deprecated - private List getDataList(SpeciesConfiguration sp) { - List dataList; - if (downloadCommandOptions.data.equals("all")) { - dataList = sp.getData(); - } else { - dataList = Arrays.asList(downloadCommandOptions.data.split(",")); + List dataList = Arrays.asList(downloadCommandOptions.data.split(",")); + for (String data : dataList) { + switch (data) { + case GENOME_DATA: + case GENE_DATA: + case VARIATION_FUNCTIONAL_SCORE_DATA: + case MISSENSE_VARIATION_SCORE_DATA: + case REGULATION_DATA: + case PROTEIN_DATA: + case CONSERVATION_DATA: + case CLINICAL_VARIANTS_DATA: + case REPEATS_DATA: + case ONTOLOGY_DATA: + case PUBMED_DATA: + case PHARMACOGENOMICS_DATA: + break; + default: + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); + } } return dataList; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 72f992f344..85446fac1f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -86,7 +86,7 @@ public ExportCommandExecutor(AdminCliOptionsParser.ExportCommandOptions exportCo EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; + 
ONTOLOGY_DATA, MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { this.dataToExport = exportCommandOptions.data.split(","); } @@ -309,7 +309,7 @@ public void execute() throws CellBaseException { counterMsg = counter + " repeats"; break; } - case OBO_DATA: { + case ONTOLOGY_DATA: { counter = exportOntologyData(); counterMsg = counter + " ontology items"; break; @@ -449,7 +449,7 @@ private int exportClinicalVariantData(List regions) throws CellBaseExcep private int exportOntologyData() throws CellBaseException, IOException { int counter = 0; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, OBO_DATA); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, ONTOLOGY_DATA); OntologyManager ontologyManager = managerFactory.getOntologyManager(species, assembly); CellBaseIterator iterator = ontologyManager.iterator(new OntologyQuery()); while (iterator.hasNext()) { diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 6af05bf732..ca1a4a9a71 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -80,8 +80,8 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA, - EtlCommons.PHARMACOGENOMICS_DATA}; + EtlCommons.ONTOLOGY_DATA, 
EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, + EtlCommons.PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { loadOptions = loadCommandOptions.data.split(","); } @@ -268,7 +268,7 @@ public void execute() throws CellBaseException { // case EtlCommons.STRUCTURAL_VARIANTS_DATA: // loadStructuralVariants(); // break; - case EtlCommons.OBO_DATA: { + case EtlCommons.ONTOLOGY_DATA: { // Load data loadIfExists(input.resolve("ontology.json.gz"), "ontology"); @@ -281,7 +281,7 @@ public void execute() throws CellBaseException { input.resolve(EtlCommons.GO_VERSION_FILE), input.resolve(EtlCommons.DO_VERSION_FILE) )); - dataReleaseManager.update(dataRelease, "ontology", EtlCommons.OBO_DATA, sources); + dataReleaseManager.update(dataRelease, "ontology", EtlCommons.ONTOLOGY_DATA, sources); break; } case EtlCommons.SPLICE_SCORE_DATA: { diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java index 884c63f2ae..422a52b0d4 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java @@ -22,5 +22,8 @@ public CellBaseException(String msg) { super(msg); } + public CellBaseException(String msg, Throwable e) { + super(msg, e); + } } diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 5052473aa0..28263dfb6e 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -53,7 +53,11 @@ download: password: '' libs: "${CELLBASE.ENSEMBL.LIBS}" url: - host: ftp://ftp.ensembl.org/pub + host: ftp://ftp.ensembl.org/pub/ + files: + REGULATORY_BUILD: "regulation/put_species_here/*Regulatory_Build.regulatory_features*.gff.gz" + MOTIF_FEATURES: 
"regulation/put_species_here/MotifFeatures/*put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/*put_assembly_here.motif_features.gff.gz.tbi" ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 @@ -94,13 +98,17 @@ download: ## Regulation mirbase: - host: https://www.mirbase.org/download/miRNA.dat + host: https://www.mirbase.org/ version: "22.1" + files: + MIRBASE: download/miRNA.dat targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: - host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx + host: https://mirtarbase.cuhk.edu.cn/ version: "9.0" + files: + MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx ## Protein Data uniprot: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 15c93c5101..207841aabb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -35,7 +35,15 @@ */ public class EtlCommons { + // Ensembl public static final String ENSEMBL_NAME = "ENSEMBL"; + public static final String PUT_SPECIES_HERE_MARK = "put_species_here"; + public static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; + public static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; + // Must match the configuration file + public static final String REGULATORY_BUILD_FILE_ID = "REGULATORY_BUILD"; + public static final String MOTIF_FEATURES_FILE_ID = "MOTIF_FEATURES"; + public static final String MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; @@ -122,14 +130,19 @@ public class EtlCommons { public static final String REPEATS_SUBDIRECTORY = "genome"; public static final String REPEATS_JSON = "repeats"; // Simple repeats + public static final 
String TRF_NAME = "Tandem Repeats Finder"; @Deprecated public static final String TRF_FILE = "simpleRepeat.txt.gz"; public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME; public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS"; + // Genomic super duplications + public static final String GSD_NAME = "Genomic Super Duplications"; @Deprecated public static final String GSD_FILE = "genomicSuperDups.txt.gz"; public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS"; + // Window masker + public static final String WM_NAME = "Window Masker"; @Deprecated public static final String WM_FILE = "windowmaskerSdust.txt.gz"; public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; @@ -174,15 +187,22 @@ public class EtlCommons { // Regulation public static final String REGULATION_DATA = "regulation"; public static final String REGULATION_SUBDIRECTORY = "regulation"; - // Regulatory/motif features - public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz"; - public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz"; + // Regulatory build and motif features (see Ensembl files: regulatory build and motif features files) + public static final String REGULATORY_BUILD_NAME = "Regulatory Build"; + public static final String REGULATORY_BUILD_VERSION_FILENAME = "regulatoryBuild" + SUFFIX_VERSION_FILENAME; + // Motif features (see Ensembl files) + public static final String MOTIF_FEATURES_NAME = "Motif Features"; + public static final String MOTIF_FEATURES_VERSION_FILENAME = "motifFeatures" + SUFFIX_VERSION_FILENAME; // miRBase public static final String MIRBASE_NAME = "miRBase"; public static final String MIRBASE_VERSION_FILENAME = "mirbase" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String 
MIRBASE_FILE_ID = "MIRBASE"; // miRTarBase public static final String MIRTARBASE_NAME = "miRTarBase"; - public static final String MIRTARBASE_VERSION_FILENAME = "mirtarbase" + SUFFIX_VERSION_FILENAME; + public static final String MIRTARBASE_VERSION_FILENAME = "mirTarBase" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String MIRTARBASE_FILE_ID = "MIRTARBASE"; // Build specific data options public static final String GENOME_INFO_DATA = "genome_info"; @@ -224,7 +244,6 @@ public class EtlCommons { // Must match the configuration file public static final String INTACT_FILE_ID = "INTACT"; - // Conservation scores public static final String CONSERVATION_DATA = "conservation"; public static final String CONSERVATION_SUBDIRECTORY = "conservation"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index 1eabf8975a..cbe7c56952 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -20,7 +20,6 @@ import org.opencb.biodata.formats.obo.OboParser; import org.opencb.biodata.models.core.OntologyTerm; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; @@ -36,10 +35,11 @@ public class OntologyBuilder extends CellBaseBuilder { public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) { super(serializer); - hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); - goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); - doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); - mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); + // TODO: fix it !! 
+// hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); +// goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); +// doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); +// mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); } @Override diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index 03fc3a1cd6..d1ae5fb205 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -21,12 +21,12 @@ import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.models.core.RegulatoryFeature; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.HashSet; +import java.util.Set; public class RegulatoryFeatureBuilder extends CellBaseBuilder { @@ -35,7 +35,9 @@ public class RegulatoryFeatureBuilder extends CellBaseBuilder { public RegulatoryFeatureBuilder(Path regulatoryDirectoryPath, CellBaseSerializer serializer) { super(serializer); - gffFile = regulatoryDirectoryPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); + // TODO: fix it ! + gffFile = null; +// gffFile = regulatoryDirectoryPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); } @Override @@ -44,7 +46,8 @@ public void parse() throws Exception { if (Files.exists(gffFile)) { parseGffFile(gffFile); } else { - logger.warn("No regulatory features GFF file found {}", EtlCommons.REGULATORY_FEATURES_FILE); + // TODO: fix it +// logger.warn("No regulatory features GFF file found {}", EtlCommons.REGULATORY_FEATURES_FILE); logger.warn("Skipping regulatory features GFF file parsing. 
Regulatory feature data models will not be built."); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java deleted file mode 100644 index 3727ac4a69..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java +++ /dev/null @@ -1,607 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.builders; - -import org.opencb.biodata.models.core.RegulatoryFeature; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; - -import java.io.BufferedReader; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.sql.*; -import java.util.*; - -/** - * User: fsalavert. 
- * Date: 4/10/13 - * Time: 10:14 AM - */ -@Deprecated -public class RegulatoryRegionBuilder extends CellBaseBuilder { - - private static final int CHUNK_SIZE = 2000; - private static final String REGULATORY_FEATURES = "regulatory_features"; - @Deprecated - private static final String DEPRECATED_MOTIF_FEATURES = "deprecated_motif_features"; - private static final String MOTIF_FEATURES = "motif_features"; - private static final String FEATURE_TYPE = "feature_type"; - private static final String ID = "id"; - private static final String BINDING_MATRIX = "binding_matrix"; - private static final String MOTIF_FEATURE_TYPE = "motif_feature_type"; - private Path regulatoryRegionPath; - - public RegulatoryRegionBuilder(Path regulatoryRegionFilesDir, CellBaseSerializer serializer) { - super(serializer); - - this.regulatoryRegionPath = regulatoryRegionFilesDir; - - } - - public void createSQLiteRegulatoryFiles(Path regulatoryRegionPath) - throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException { - List gffColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "group"); - List gffColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT", "TEXT"); - - // Path regulatoryRegionPath = regulationDir.toPath(); - - Path filePath; - - filePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); - createSQLiteRegulatoryFiles(filePath, REGULATORY_FEATURES, gffColumnNames, gffColumnTypes); - - filePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE); - createSQLiteRegulatoryFiles(filePath, MOTIF_FEATURES, gffColumnNames, gffColumnTypes); - - // TODO: REMOVE - // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DEPRECATED - filePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz"); - createSQLiteRegulatoryFiles(filePath, "annotated_features", gffColumnNames, gffColumnTypes); - - - filePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz"); 
- createSQLiteRegulatoryFiles(filePath, DEPRECATED_MOTIF_FEATURES, gffColumnNames, gffColumnTypes); - - - filePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz"); - createSQLiteRegulatoryFiles(filePath, "regulatory_features_multicell", gffColumnNames, gffColumnTypes); - // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< DEPRECATED - - - -// GFFColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame"); -// GFFColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT"); - filePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz"); - if (Files.exists(filePath)) { - createSQLiteRegulatoryFiles(filePath, "mirna_uniq", gffColumnNames, gffColumnTypes); - } - - } - - @Override - public void parse() throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException { - if (regulatoryRegionPath == null || !Files.exists(regulatoryRegionPath) || !Files.isDirectory(regulatoryRegionPath)) { - throw new IOException("Regulation directory whether does not exist, is not a directory or cannot be read"); - } - - // Create the SQLite databases - createSQLiteRegulatoryFiles(regulatoryRegionPath); - - String chunkIdSuffix = CHUNK_SIZE / 1000 + "k"; - - Path regulatoryFilePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE + ".db"); - Path motifFilePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE + ".db"); - Path annotatedFilePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz.db"); - Path deprecatedMotifFilePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz.db"); - Path deprecatedRegulatoryFilePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz.db"); - Path mirnaFilePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz.db"); - - List filePaths = Arrays.asList(regulatoryFilePath, motifFilePath, annotatedFilePath, - deprecatedMotifFilePath, deprecatedRegulatoryFilePath); - List tableNames = 
Arrays.asList(REGULATORY_FEATURES, MOTIF_FEATURES, "annotated_features", - DEPRECATED_MOTIF_FEATURES, "regulatory_features_multicell"); - - if (Files.exists(mirnaFilePath)) { - filePaths.add(mirnaFilePath); - tableNames.add("mirna_uniq"); - } - - // Fetching and joining all chromosomes found in the different databases - Set setChr = new HashSet<>(); - setChr.addAll(getChromosomesList(regulatoryFilePath, REGULATORY_FEATURES)); - setChr.addAll(getChromosomesList(motifFilePath, MOTIF_FEATURES)); - setChr.addAll(getChromosomesList(annotatedFilePath, "annotated_features")); - setChr.addAll(getChromosomesList(deprecatedMotifFilePath, DEPRECATED_MOTIF_FEATURES)); - setChr.addAll(getChromosomesList(deprecatedRegulatoryFilePath, "regulatory_features_multicell")); - if (Files.exists(mirnaFilePath)) { - setChr.addAll(getChromosomesList(mirnaFilePath, "mirna_uniq")); - } - - List chromosomes = new ArrayList<>(setChr); - List regulatoryFeatures; - HashSet chunksHash; - for (String chromosome : chromosomes) { - for (int i = 0; i < tableNames.size(); i++) { - chunksHash = new HashSet<>(); - regulatoryFeatures = queryChromosomesRegulatoryDB(filePaths.get(i), tableNames.get(i), chromosome); - for (RegulatoryFeature regulatoryFeature : regulatoryFeatures) { - int firstChunkId = getChunkId(regulatoryFeature.getStart(), CHUNK_SIZE); - int lastChunkId = getChunkId(regulatoryFeature.getEnd(), CHUNK_SIZE); - - List chunkIds = new ArrayList<>(); - String chunkId; - for (int j = firstChunkId; j <= lastChunkId; j++) { - chunkId = chromosome + "_" + j + "_" + chunkIdSuffix; - chunkIds.add(chunkId); - //count chunks - if (!chunksHash.contains(j)) { - chunksHash.add(j); - } - } -// regulatoryFeature.setChunkIds(chunkIds); - - // remove 'chr' prefix -// if (genericFeature.getChromosome() != null) { -// genericFeature.setSequenceName(genericFeature.getSequenceName().replace("chr", "")); -// } - serializer.serialize(regulatoryFeature); - } - } - } - } - - - public void 
createSQLiteRegulatoryFiles(Path filePath, String tableName, List columnNames, List columnTypes) - throws ClassNotFoundException, IOException, SQLException { - int limitRows = 100000; - int batchCount = 0; - - if (!Files.exists(filePath) || Files.size(filePath) == 0) { - return; - } - - Path dbPath = Paths.get(filePath.toString() + ".db"); - if (Files.exists(dbPath) && Files.size(dbPath) > 0) { - return; - } - - BufferedReader br = FileUtils.newBufferedReader(filePath); - - Class.forName("org.sqlite.JDBC"); - Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - conn.setAutoCommit(false); //Set false to perform commits manually and increase performance on insertion - - //Create table query - Statement createTables = conn.createStatement(); - - StringBuilder sbQuery = new StringBuilder(); - sbQuery.append("CREATE TABLE if not exists " + tableName + "("); - for (int i = 0; i < columnNames.size(); i++) { //columnNames and columnTypes must have the same size - sbQuery.append("'" + columnNames.get(i) + "' " + columnTypes.get(i) + ","); - } - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(")"); - - System.out.println(sbQuery.toString()); - createTables.executeUpdate(sbQuery.toString()); - - //Prepare insert query - sbQuery = new StringBuilder(); - sbQuery.append("INSERT INTO " + tableName + "("); - for (int i = 0; i < columnNames.size(); i++) { - sbQuery.append("'" + columnNames.get(i) + "',"); - } - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(") values ("); - sbQuery.append(repeat("?,", columnNames.size())); - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(")"); - System.out.println(sbQuery.toString()); - - PreparedStatement ps = conn.prepareStatement(sbQuery.toString()); - - //Read file - String line = null; - while ((line = br.readLine()) != null) { - - insertByType(ps, getFields(line, tableName), columnTypes); - ps.addBatch(); - batchCount++; - - //commit batch - if (batchCount % limitRows == 
0 && batchCount != 0) { - ps.executeBatch(); - conn.commit(); - } - - } - br.close(); - - //Execute last Batch - ps.executeBatch(); - conn.commit(); - - //Create index - System.out.println("creating indices..."); - createTables.executeUpdate("CREATE INDEX " + tableName + "_seqname_idx on " + tableName + "(" + columnNames.get(0) + ")"); - System.out.println("indices created."); - - conn.commit(); - conn.close(); - } - - public List getChromosomesList(Path dbPath, String tableName) throws IOException { - - try { - FileUtils.checkFile(dbPath); - } catch (IOException e) { - logger.warn(e.getMessage()); - return Collections.emptyList(); - } - - List chromosomes = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName); -// ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName + " where seqname like 'chr%'"); - - while (rs.next()) { - chromosomes.add(rs.getString(1)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return chromosomes; - } - - public List queryChromosomesRegulatoryDB(Path dbPath, String tableName, String chromosome) { - - try { - FileUtils.checkFile(dbPath); - } catch (IOException e) { - logger.warn(e.getMessage()); - return Collections.emptyList(); - } - - Connection conn; - List regulatoryFeatures = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='" + chromosome + "'"); -// ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='chr" + chromosome + "'"); - while (rs.next()) { - 
regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return regulatoryFeatures; - } - - public static List queryRegulatoryDB(Path dbPath, String tableName, String chrFile, int start, int end) { - Connection conn = null; - List regulatoryFeatures = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select * from " + tableName + " where start<=" + end + " AND end>=" + start); - - while (rs.next()) { - regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return regulatoryFeatures; - } - - private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs, String tableName) throws SQLException { - RegulatoryFeature regulatoryFeature = null; - switch (tableName.toLowerCase()) { - case REGULATORY_FEATURES: - regulatoryFeature = getRegulatoryFeature(rs); - break; - case MOTIF_FEATURES: - regulatoryFeature = getMotifFeature(rs); - break; - case "annotated_features": - regulatoryFeature = getAnnotatedFeature(rs); - break; - case "regulatory_features_multicell": - regulatoryFeature = getDeprecatedRegulatoryFeature(rs); - break; - case DEPRECATED_MOTIF_FEATURES: - regulatoryFeature = getDeprecatedMotifFeature(rs); - break; - case "mirna_uniq": - regulatoryFeature = getMirnaFeature(rs); - break; - default: - break; - } - return regulatoryFeature; - } - - private static RegulatoryFeature getMotifFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - 
regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - - // Seems weird that the motif_feature_type property is used to fill the Name field. However, this is how the - // it was being done from the previous ENSEMBL files - regulatoryFeature.setName(groupFields.get(MOTIF_FEATURE_TYPE)); - - regulatoryFeature.setMatrix(groupFields.get(BINDING_MATRIX)); - - return regulatoryFeature; - } - - private static RegulatoryFeature getRegulatoryFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setId(groupFields.get(ID)); - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(groupFields.get(FEATURE_TYPE).replace(" ", "_")); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - - return regulatoryFeature; - } - - private static RegulatoryFeature getAnnotatedFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - 
regulatoryFeature.setFrame(rs.getString(8)); - - regulatoryFeature.setName(groupFields.get("name")); - regulatoryFeature.setAlias(groupFields.get("alias")); - regulatoryFeature.setFeatureClass(groupFields.get("class")); - regulatoryFeature.getCellTypes().add(groupFields.get("cell_type")); - - return regulatoryFeature; - } - - @Deprecated - private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - regulatoryFeature.setFrame(rs.getString(9)); - - return regulatoryFeature; - } - - @Deprecated - private static RegulatoryFeature getDeprecatedMotifFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3) + "_motif"); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - - String[] split = groupFields.get("name").split(":"); - regulatoryFeature.setName(split[0]); - regulatoryFeature.setMatrix(split[1]); - - return regulatoryFeature; - } - - private static RegulatoryFeature 
getMirnaFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - - regulatoryFeature.setFeatureClass("microRNA"); - regulatoryFeature.setName(groupFields.get("name")); - - return regulatoryFeature; - } - - private static Map getGroupFields(String group) { - //process group column - Map groupFields = new HashMap<>(); - String[] attributeFields = group.split(";"); - String[] attributeKeyValue; - for (String attributeField : attributeFields) { - attributeKeyValue = attributeField.trim().split("="); - groupFields.put(attributeKeyValue[0].toLowerCase(), attributeKeyValue[1]); - } - return groupFields; - } - - - public static List getFields(String line, String tableName) { - List fields = new ArrayList<>(); - switch (tableName.toLowerCase()) { - case REGULATORY_FEATURES: - fields = getRegulatoryFeaturesFields(line); - break; - case MOTIF_FEATURES: - fields = getMotifFeaturesFields(line); - break; - case "annotated_features": - fields = getAnnotatedFeaturesFields(line); - break; - case "regulatory_features_multicell": - fields = getRegulatoryFeaturesFields(line); - break; - case DEPRECATED_MOTIF_FEATURES: - fields = getMotifFeaturesFields(line); - break; - case "mirna_uniq": - fields = getMirnaFeaturesFields(line); - break; - default: - break; - } - return fields; - } - - @Deprecated - public static List getAnnotatedFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - 
return Arrays.asList(fields); - } - - public static List getRegulatoryFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static List getMotifFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static List getMirnaFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static void insertByType(PreparedStatement ps, List fields, List types) throws SQLException { - //Datatypes In SQLite Version 3 -> http://www.sqlite.org/datatype3.html - String raw; - String type; - if (types.size() == fields.size()) { - for (int i = 0; i < fields.size(); i++) { //columnNames and columnTypes must have same size - int sqliteIndex = i + 1; - raw = fields.get(i); - type = types.get(i); - - switch (type) { - case "INTEGER": - case "INT": - ps.setInt(sqliteIndex, Integer.parseInt(raw)); - break; - case "REAL": - ps.setFloat(sqliteIndex, Float.parseFloat(raw)); - break; - case "TEXT": - ps.setString(sqliteIndex, raw); - break; - default: - ps.setString(sqliteIndex, raw); - break; - } - } - } - - } - - public String repeat(String s, int n) { - if (s == null) { - return null; - } - final StringBuilder sb = new StringBuilder(); - for (int i = 0; i < n; i++) { - sb.append(s); - } - return sb.toString(); - } - - private int getChunkId(int position, int chunksize) { - if (chunksize <= 0) { - return position / CHUNK_SIZE; - } else { - return position / chunksize; - } - } - - private int getChunkStart(int id, int chunksize) { - if (chunksize <= 0) { - return (id == 0) ? 1 : id * CHUNK_SIZE; - } else { - return (id == 0) ? 
1 : id * chunksize; - } - } - - private int getChunkEnd(int id, int chunksize) { - if (chunksize <= 0) { - return (id * CHUNK_SIZE) + CHUNK_SIZE - 1; - } else { - return (id * chunksize) + chunksize - 1; - } - } -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index bbd25cf8f7..df4aa069bf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -34,14 +34,6 @@ public class GenomeDownloadManager extends AbstractDownloadManager { - private static final String ENSEMBL_NAME = "ENSEMBL"; - private static final String TRF_NAME = "Tandem repeats finder"; - private static final String GSD_NAME = "Genomic super duplications"; - private static final String WM_NAME = "WindowMasker"; - - private static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; - private static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; - public GenomeDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 546bb2dc7e..1ca0693b80 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -16,27 +16,15 @@ package org.opencb.cellbase.lib.download; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.feature.gff.Gff2; -import 
org.opencb.biodata.formats.feature.gff.io.Gff2Reader; -import org.opencb.biodata.formats.io.FileFormatException; -import org.opencb.biodata.models.core.RegulatoryPfm; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; -import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.*; -import java.util.concurrent.TimeUnit; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -51,7 +39,7 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C } @Override - public List download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { + public List download() throws IOException, InterruptedException { if (!speciesHasInfoToDownload(speciesConfiguration, REGULATION_DATA)) { return Collections.emptyList(); } @@ -69,100 +57,108 @@ public List download() throws IOException, InterruptedException, N } /** - * Downloads Ensembl regulatory buid and motif feature files. + * Downloads Ensembl regulatory build and motif feature files. 
* @throws IOException Any issue when writing files * @throws InterruptedException Any issue downloading files */ - private List downloadRegulatoryaAndMotifFeatures() - throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { - String regulationUrl = ensemblHostUrl + "/" + ensemblRelease; + private List downloadRegulatoryaAndMotifFeatures() throws IOException, InterruptedException { + String baseUrl = ensemblHostUrl + "/" + ensemblRelease; if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - regulationUrl = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); + baseUrl = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); } - regulationUrl += "/regulation/" + speciesShortName; List downloadFiles = new ArrayList<>(); - Path outputFile = regulationFolder.resolve(EtlCommons.REGULATORY_FEATURES_FILE); - String regulatoryBuildUrl = regulationUrl + "/*Regulatory_Build.regulatory_features*.gff.gz"; - downloadFiles.add(downloadFile(regulatoryBuildUrl, outputFile.toString())); - - outputFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); - String motifUrl = regulationUrl + "/MotifFeatures/*" + assemblyConfiguration.getName() + ".motif_features.gff.gz"; - downloadFiles.add(downloadFile(motifUrl, outputFile.toString())); - - String motifTbiUrl = regulationUrl + "/MotifFeatures/*" + assemblyConfiguration.getName() + ".motif_features.gff.gz.tbi"; - outputFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE + ".tbi"); - downloadFiles.add(downloadFile(motifTbiUrl, outputFile.toString())); - - loadPfmMatrices(); + // Regulatory build + String url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(REGULATORY_BUILD_FILE_ID)) + .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName); + String outputFileName = getFilenameFromUrl(url); + Path outputPath = regulationFolder.resolve(outputFileName); + 
logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath.toString())); + // Save data source (name, category, version,...) + saveDataSource(REGULATORY_BUILD_NAME, REGULATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url), + regulationFolder.resolve(REGULATORY_BUILD_VERSION_FILENAME)); + + // Motif features + List urls = new ArrayList<>(); + url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(MOTIF_FEATURES_FILE_ID)) + .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName).replaceAll(PUT_ASSEMBLY_HERE_MARK, assemblyConfiguration.getName()); + outputFileName = getFilenameFromUrl(url); + outputPath = regulationFolder.resolve(outputFileName); + logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath.toString())); + urls.add(url); + // Motif features index + url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(MOTIF_FEATURES_INDEX_FILE_ID)) + .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName).replaceAll(PUT_ASSEMBLY_HERE_MARK, assemblyConfiguration.getName()); + outputFileName = getFilenameFromUrl(url); + outputPath = regulationFolder.resolve(outputFileName); + logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + downloadFiles.add(downloadFile(url, outputPath.toString())); + // Save data source (name, category, version,...) 
+ saveDataSource(REGULATORY_BUILD_NAME, MOTIF_FEATURES_NAME, ensemblVersion, getTimeStamp(), urls, + regulationFolder.resolve(MOTIF_FEATURES_VERSION_FILENAME)); + + // This will be executed in the CellBase build +// loadPfmMatrices(); return downloadFiles; } - private void loadPfmMatrices() throws IOException, NoSuchMethodException, FileFormatException, InterruptedException { - logger.info("Downloading and building pfm matrices..."); - if (Files.exists(buildFolder.resolve("regulatory_pfm.json.gz"))) { - logger.info("regulatory_pfm.json.gz is already built"); - return; - } - Set motifIds = new HashSet<>(); - Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); - try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { - Gff2 tfbsMotifFeature; - Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); - while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { - String pfmId = getMatrixId(filePattern, tfbsMotifFeature); - if (StringUtils.isNotEmpty(pfmId)) { - motifIds.add(pfmId); - } - } - } - - ObjectMapper mapper = new ObjectMapper(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_pfm", true); - if (logger.isInfoEnabled()) { - logger.info("Looking up {} pfms", motifIds.size()); - } - for (String pfmId : motifIds) { - String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId - + "?unit=frequencies;content-type=application/json"; - URL url = new URL(urlString); - RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); - serializer.serialize(regulatoryPfm); - // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits - TimeUnit.MILLISECONDS.sleep(250); - } - serializer.close(); - } - - private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { - Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); - if (matcher.find()) { - return matcher.group(0); - } - return null; - } +// private void loadPfmMatrices() 
throws IOException, NoSuchMethodException, FileFormatException, InterruptedException { +// logger.info("Downloading and building pfm matrices..."); +// if (Files.exists(buildFolder.resolve("regulatory_pfm.json.gz"))) { +// logger.info("regulatory_pfm.json.gz is already built"); +// return; +// } +// Set motifIds = new HashSet<>(); +// Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); +// try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { +// Gff2 tfbsMotifFeature; +// Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); +// while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { +// String pfmId = getMatrixId(filePattern, tfbsMotifFeature); +// if (StringUtils.isNotEmpty(pfmId)) { +// motifIds.add(pfmId); +// } +// } +// } +// +// ObjectMapper mapper = new ObjectMapper(); +// CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_pfm", true); +// if (logger.isInfoEnabled()) { +// logger.info("Looking up {} pfms", motifIds.size()); +// } +// for (String pfmId : motifIds) { +// String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId +// + "?unit=frequencies;content-type=application/json"; +// URL url = new URL(urlString); +// RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); +// serializer.serialize(regulatoryPfm); +// // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits +// TimeUnit.MILLISECONDS.sleep(250); +// } +// serializer.close(); +// } +// +// private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { +// Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); +// if (matcher.find()) { +// return matcher.group(0); +// } +// return null; +// } private DownloadFile downloadMirna() throws IOException, InterruptedException { logger.info("Downloading {} ...", MIRBASE_NAME); - String url = configuration.getDownload().getMirbase().getHost(); - - saveVersionData(EtlCommons.REGULATION_DATA, 
MIRBASE_NAME, configuration.getDownload().getMirbase().getVersion(), getTimeStamp(), - Collections.singletonList(url), regulationFolder.resolve(MIRBASE_VERSION_FILENAME)); - Path outputPath = regulationFolder.resolve(Paths.get(url).getFileName()); - logger.info("Downloading from {} to {} ...", url, outputPath); - return downloadFile(url, outputPath.toString()); + return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_NAME, REGULATION_DATA, MIRBASE_FILE_ID, + MIRBASE_VERSION_FILENAME, regulationFolder); } private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException { logger.info("Downloading {} ...", MIRTARBASE_NAME); - String url = configuration.getDownload().getMiRTarBase().getHost(); - - saveVersionData(EtlCommons.REGULATION_DATA, MIRTARBASE_NAME, configuration.getDownload().getMiRTarBase().getVersion(), - getTimeStamp(), Collections.singletonList(url), regulationFolder.resolve(MIRTARBASE_VERSION_FILENAME)); - Path outputPath = regulationFolder.resolve(Paths.get(url).getFileName()); - logger.info("Downloading from {} to {} ...", url, outputPath); - return downloadFile(url, outputPath.toString()); + return downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_NAME, REGULATION_DATA, MIRTARBASE_FILE_ID, + MIRBASE_VERSION_FILENAME, regulationFolder); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java index 3bc97b1824..507f554eab 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/DataReleaseManager.java @@ -199,9 +199,6 @@ public void update(DataRelease dataRelase) { if (CollectionUtils.isNotEmpty(source.getUrls())) { map.put("urls", source.getUrls()); } - if (CollectionUtils.isNotEmpty(source.getNotes())) { - map.put("notes", source.getUrls()); - 
} tmp.add(map); } releaseDBAdaptor.update(dataRelase.getRelease(), "sources", tmp); From e18506b9580d443a518833cf893b6261a8af7a19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 12 Apr 2024 13:39:14 +0200 Subject: [PATCH 054/107] lib: update CellBase downloaders, #TASK-5775, #TASK-5564 --- .../src/main/resources/configuration.yml | 8 +- .../org/opencb/cellbase/lib/EtlCommons.java | 52 ++++++++++- .../lib/download/AbstractDownloadManager.java | 93 ++++++++++++++++--- .../lib/download/CaddDownloadManager.java | 2 +- .../lib/download/ClinicalDownloadManager.java | 4 +- .../MissenseScoresDownloadManager.java | 4 +- .../lib/download/OntologyDownloadManager.java | 2 +- .../lib/download/ProteinDownloadManager.java | 5 +- .../download/RegulationDownloadManager.java | 68 ++++++-------- 9 files changed, 174 insertions(+), 64 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 28263dfb6e..3b237d5c3f 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -53,11 +53,11 @@ download: password: '' libs: "${CELLBASE.ENSEMBL.LIBS}" url: - host: ftp://ftp.ensembl.org/pub/ + host: https://ftp.ensembl.org/pub/ files: - REGULATORY_BUILD: "regulation/put_species_here/*Regulatory_Build.regulatory_features*.gff.gz" - MOTIF_FEATURES: "regulation/put_species_here/MotifFeatures/*put_assembly_here.motif_features.gff.gz" - MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/*put_assembly_here.motif_features.gff.gz.tbi" + REGULATORY_BUILD: "regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: 
"regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 207841aabb..279bf27ce1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -16,9 +16,11 @@ package org.opencb.cellbase.lib; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.Level; import org.apache.logging.log4j.core.config.Configurator; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -368,7 +370,55 @@ public static Long countFileLines(Path filePath) throws IOException { } return nLines; } + } + + public static String getEnsemblUrl(DownloadProperties.EnsemblProperties props, String ensemblRelease, String fileId, String species, + String assembly, String chromosome) throws CellBaseException { + if (!props.getUrl().getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.EnsemblProperties within the CellBase" + + " configuration file"); + } + String filesValue = props.getUrl().getFiles().get(fileId); + String url = props.getUrl().getHost() + ensemblRelease + "/" + filesValue; + // Change species, assembly, chromosome if necessary + if (StringUtils.isNotEmpty(species)) { + url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); + } + if (StringUtils.isNotEmpty(assembly)) { + url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); + } + if (StringUtils.isNotEmpty(chromosome)) { + url = url.replaceAll(PUT_CHROMOSOME_HERE_MARK, 
chromosome); + } + return url; + } + public static String getUrl(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { + return getUrl(props, fileId, null, null, null); } + public static String getUrl(DownloadProperties.URLProperties props, String fileId, String species, String assembly, String chromosome) + throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" + + " configuration file"); + } + String url; + String filesValue = props.getFiles().get(fileId); + if (filesValue.startsWith("https://") || filesValue.startsWith("http://") || filesValue.startsWith("ftp://")) { + url = filesValue; + } else { + url = props.getHost() + filesValue; + } + if (StringUtils.isNotEmpty(species)) { + url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); + } + if (StringUtils.isNotEmpty(assembly)) { + url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); + } + if (StringUtils.isNotEmpty(chromosome)) { + url = url.replaceAll(PUT_CHROMOSOME_HERE_MARK, chromosome); + } + return url; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index c87e2a9512..74ecbe4d4a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -126,7 +126,7 @@ private void init() throws CellBaseException, IOException { logger.info("Processing species {}", speciesConfiguration.getScientificName()); } - public abstract List download() throws IOException, InterruptedException; + public abstract List download() throws IOException, InterruptedException, CellBaseException; protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) { 
boolean hasInfo = true; @@ -137,26 +137,67 @@ protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) return hasInfo; } - protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String name, String category, String fileId, - String versionFilename, Path outPath) throws IOException, InterruptedException { - logger.info("Downloading {} ({}) file ...", name, category); - String url = props.getHost() + props.getFiles().get(fileId); - File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); - DownloadFile downloadFile = downloadFile(url, outPath.toString()); + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, + String versionFilename, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveDataSource(props, fileId, name, category, null, versionFilename, outPath); + } + + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, + String chromosome, String versionFilename, Path outPath) + throws IOException, InterruptedException, CellBaseException { + // Download file + DownloadFile downloadFile = downloadDataSource(props, fileId, chromosome, outPath); // Save data source - saveDataSource(name, category, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + saveDataSource(name, category, props.getVersion(), getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), outPath.resolve(versionFilename)); return downloadFile; } - protected String getTimeStamp() { - return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); + protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String name, + String category, String chromosome, String 
versionFilename, Path outPath) + throws IOException, InterruptedException, CellBaseException { + // Download file + DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath); + + // Save data source + saveDataSource(name, category, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + outPath.resolve(versionFilename)); + + return downloadFile; } - protected void saveDataSource(String name, String category, String version, String date, List urls, Path outputFilePath) + protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props, String fileId, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadDataSource(props, fileId, null, outPath); + } + + protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props, String fileId, + String chromosome, Path outPath) + throws IOException, InterruptedException, CellBaseException { + String url = EtlCommons.getUrl(props, fileId, species, assembly, chromosome); + File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); + logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); + return downloadFile(url, outFile.toString()); + } + + protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadEnsemblDataSource(ensemblProps, fileId, null, outPath); + } + + protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String chromosome, + Path outPath) throws IOException, InterruptedException, CellBaseException { + String url = EtlCommons.getEnsemblUrl(ensemblProps, ensemblRelease, fileId, speciesShortName, assemblyConfiguration.getName(), + chromosome); + File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); + logger.info(DOWNLOADING_LOG_MESSAGE, url, 
outFile); + return downloadFile(url, outFile.toString()); + } + + protected void saveDataSource(String name, String category, String version, String date, List urls, Path versionFilePath) throws IOException { DataSource dataSource = new DataSource(name, category, version, date, urls); @@ -165,7 +206,11 @@ protected void saveDataSource(String name, String category, String version, Stri dataSource.setVersion(date); } - dataSourceWriter.writeValue(outputFilePath.toFile(), dataSource); + dataSourceWriter.writeValue(versionFilePath.toFile(), dataSource); + } + + protected String getTimeStamp() { + return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); } protected String getLine(Path readmePath, int lineNumber) { @@ -293,6 +338,28 @@ private String getEnsemblURL(SpeciesConfiguration sp) { } } + @Deprecated + protected String getUrl(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" + + " configuration file"); + } + String filesValue = props.getFiles().get(fileId); + if (filesValue.startsWith("https://") || filesValue.startsWith("http://") || filesValue.startsWith("ftp://")) { + return filesValue; + } else { + return props.getHost() + filesValue; + } + } + + protected String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" + + " configuration file"); + } + return getFilenameFromUrl(props.getFiles().get(fileId)); + } + protected String getFilenameFromUrl(String url) { return Paths.get(url).getFileName().toString(); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index 6743ed8a06..af3ff65baf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -35,7 +35,7 @@ public CaddDownloadManager(String species, String assembly, Path targetDirectory } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, VARIATION_FUNCTIONAL_SCORE_DATA)) { return null; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index a274df11a4..1e66f1b5f0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -43,13 +43,13 @@ public ClinicalDownloadManager(String species, String assembly, Path outdir, Cel } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { List downloadFiles = new ArrayList<>(); downloadFiles.addAll(downloadClinical()); return downloadFiles; } - public List downloadClinical() throws IOException, InterruptedException { + public List downloadClinical() throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY).toAbsolutePath(); Files.createDirectories(clinicalFolder); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index 0dba31ed78..50cf9ee0c0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -36,11 +36,11 @@ public MissenseScoresDownloadManager(String species, String assembly, Path targe } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { return Collections.singletonList(downloadRevel()); } - public DownloadFile downloadRevel() throws IOException, InterruptedException { + public DownloadFile downloadRevel() throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); Files.createDirectories(missensePredictionScorePath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 7e730a8b0a..b09cf76f2f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -34,7 +34,7 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec super(species, assembly, targetDirectory, configuration); } - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { Path oboFolder = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); Files.createDirectories(oboFolder); logger.info("Downloading {} files {} ...", ONTOLOGY_DATA, oboFolder); diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 799bc92aad..519ea828d1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -42,9 +42,10 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * * @return list of files downloaded * @throws IOException if there is an error writing to a file - * @throws InterruptedException if there is an error downloading files * + * @throws InterruptedException if there is an error downloading files + * @throws CellBaseException if there is an error in the CelllBase configuration file */ - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, PROTEIN_DATA)) { return null; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 1ca0693b80..26ed4776da 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -39,7 +39,7 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, REGULATION_DATA)) { return Collections.emptyList(); } @@ -61,52 +61,44 @@ public List download() throws IOException, InterruptedException { * @throws IOException Any issue when 
writing files * @throws InterruptedException Any issue downloading files */ - private List downloadRegulatoryaAndMotifFeatures() throws IOException, InterruptedException { - String baseUrl = ensemblHostUrl + "/" + ensemblRelease; - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - baseUrl = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } + private List downloadRegulatoryaAndMotifFeatures() throws IOException, InterruptedException, CellBaseException { +// String baseUrl; +// if (configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { +// baseUrl = ensemblHostUrl + ensemblRelease + "/"; +// } else { +// baseUrl = ensemblHostUrl + ensemblRelease + "/" + getPhylo(speciesConfiguration) + "/"; +// } + DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Regulatory build - String url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(REGULATORY_BUILD_FILE_ID)) - .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName); - String outputFileName = getFilenameFromUrl(url); - Path outputPath = regulationFolder.resolve(outputFileName); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); - // Save data source (name, category, version,...) 
- saveDataSource(REGULATORY_BUILD_NAME, REGULATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url), - regulationFolder.resolve(REGULATORY_BUILD_VERSION_FILENAME)); + downloadFile = downloadAndSaveEnsemblDataSource(configuration.getDownload().getEnsembl(), REGULATORY_BUILD_FILE_ID, + REGULATORY_BUILD_NAME, REGULATION_DATA, null, REGULATORY_BUILD_VERSION_FILENAME, regulationFolder); + downloadFiles.add(downloadFile); - // Motif features + // Motifs features List urls = new ArrayList<>(); - url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(MOTIF_FEATURES_FILE_ID)) - .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName).replaceAll(PUT_ASSEMBLY_HERE_MARK, assemblyConfiguration.getName()); - outputFileName = getFilenameFromUrl(url); - outputPath = regulationFolder.resolve(outputFileName); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); - urls.add(url); - // Motif features index - url = (baseUrl + configuration.getDownload().getEnsembl().getUrl().getFiles().get(MOTIF_FEATURES_INDEX_FILE_ID)) - .replaceAll(PUT_SPECIES_HERE_MARK, speciesShortName).replaceAll(PUT_ASSEMBLY_HERE_MARK, assemblyConfiguration.getName()); - outputFileName = getFilenameFromUrl(url); - outputPath = regulationFolder.resolve(outputFileName); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - downloadFiles.add(downloadFile(url, outputPath.toString())); + downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), MOTIF_FEATURES_FILE_ID, null, regulationFolder); + downloadFiles.add(downloadFile); + urls.add(downloadFile.getUrl()); + // And now the index file + downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), MOTIF_FEATURES_INDEX_FILE_ID, null, + regulationFolder); + downloadFiles.add(downloadFile); + urls.add(downloadFile.getUrl()); // Save data source (name, category, version,...) 
- saveDataSource(REGULATORY_BUILD_NAME, MOTIF_FEATURES_NAME, ensemblVersion, getTimeStamp(), urls, + saveDataSource(MOTIF_FEATURES_NAME, REGULATION_DATA, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), urls, regulationFolder.resolve(MOTIF_FEATURES_VERSION_FILENAME)); - // This will be executed in the CellBase build + // TODO: This will be executed in the CellBase build // loadPfmMatrices(); return downloadFiles; } -// private void loadPfmMatrices() throws IOException, NoSuchMethodException, FileFormatException, InterruptedException { +// private void loadPfmMatrices() +// throws IOException, NoSuchMethodException, FileFormatException, InterruptedException, CellBaseException { // logger.info("Downloading and building pfm matrices..."); // if (Files.exists(buildFolder.resolve("regulatory_pfm.json.gz"))) { // logger.info("regulatory_pfm.json.gz is already built"); @@ -150,15 +142,15 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept // return null; // } - private DownloadFile downloadMirna() throws IOException, InterruptedException { + private DownloadFile downloadMirna() throws IOException, InterruptedException, CellBaseException { logger.info("Downloading {} ...", MIRBASE_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_NAME, REGULATION_DATA, MIRBASE_FILE_ID, + return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_NAME, REGULATION_DATA, MIRBASE_VERSION_FILENAME, regulationFolder); } - private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException { + private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException, CellBaseException { logger.info("Downloading {} ...", MIRTARBASE_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_NAME, REGULATION_DATA, MIRTARBASE_FILE_ID, - MIRBASE_VERSION_FILENAME, regulationFolder); + return 
downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_FILE_ID, MIRTARBASE_NAME, REGULATION_DATA, + MIRTARBASE_VERSION_FILENAME, regulationFolder); } } From 69a58bf698262fa67677f7f4c263d2d616118bf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 15 Apr 2024 08:48:59 +0200 Subject: [PATCH 055/107] core: update CellBase configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 3b237d5c3f..061725feb1 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -70,6 +70,16 @@ download: host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt version: "2023-11-01" refSeq: + host: https://ftp.ncbi.nih.gov/refseq/ + version: "October 16, 2023 (GRCh38.p14)" + files: + GENOMIC_GTF: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz + GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz + PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz + RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + + + host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz version: "2023-10-11" refSeqFasta: From d4e0cd659e81a6072a1c87e9ce635867042247f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 11:20:49 +0200 Subject: [PATCH 056/107] lib: update MANE Select downloader, #TASK-5775, #TASK-5564 --- .../src/main/resources/configuration.yml | 18 ++------- .../org/opencb/cellbase/lib/EtlCommons.java | 7 ++++ 
.../lib/download/GeneDownloadManager.java | 39 ++++++++----------- 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 061725feb1..7ebb8d5f6e 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -77,23 +77,11 @@ download: GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz - - - - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz - version: "2023-10-11" - refSeqFasta: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz - version: "2023-10-11" - refSeqProteinFasta: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz - version: "2023-10-11" - refSeqCdna: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz - version: "2023-10-11" maneSelect: - host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz + host: https://ftp.ncbi.nlm.nih.gov/refseq/ version: "1.1" + files: + MANE_SELECT: MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz lrg: host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt version: "2021-03-30" diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 279bf27ce1..cd3fef74ff 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -61,7 +61,14 @@ public class EtlCommons { public static final String GENE_DATA = "gene"; public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; + + // MANE Select + public static final String MANE_SELECT_NAME = "MANE Select"; public static final String MANE_SELECT_VERSION_FILENAME = "maneSelect" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String MANE_SELECT_FILE_ID = "MANE_SELECT"; + + public static final String LRG_VERSION_FILENAME = "lrg" + SUFFIX_VERSION_FILENAME; public static final String HGNC_VERSION_FILENAME = "hgnc" + SUFFIX_VERSION_FILENAME; public static final String CANCER_HOTSPOT_VERSION_FILENAME = "cancerHotspot" + SUFFIX_VERSION_FILENAME; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 843bc360e3..9e27ae22f1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -32,19 +32,19 @@ public class GeneDownloadManager extends AbstractDownloadManager { - private static final String ENSEMBL_NAME = "ENSEMBL"; - private static final String REFSEQ_NAME = "RefSeq"; - private static final String UNIPROT_NAME = "UniProt"; - private static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; - private static final String HPO_NAME = "HPO"; - private static final String DISGENET_NAME = "DisGeNET"; - private static final String MANE_SELECT_NAME = "MANE Select"; - private static final String LRG_NAME = "LRG"; - private static final String HGNC_GENE_NAME = "HGNC Gene"; - private static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; - private static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; - 
private static final String DGIDB_NAME = "DGIdb"; - private static final String GNOMAD_NAME = "gnomAD"; +// private static final String ENSEMBL_NAME = "ENSEMBL"; +// private static final String REFSEQ_NAME = "RefSeq"; +// private static final String UNIPROT_NAME = "UniProt"; +// private static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; +// private static final String HPO_NAME = "HPO"; +// private static final String DISGENET_NAME = "DisGeNET"; +// private static final String MANE_SELECT_NAME = "MANE Select"; +// private static final String LRG_NAME = "LRG"; +// private static final String HGNC_GENE_NAME = "HGNC Gene"; +// private static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; +// private static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; +// private static final String DGIDB_NAME = "DGIdb"; +// private static final String GNOMAD_NAME = "gnomAD"; private static final Map GENE_UNIPROT_XREF_FILES; @@ -186,16 +186,11 @@ private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLPrope return downloadFile(url, outputPath.toString()); } - private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading MANE Select ..."); - String url = configuration.getDownload().getManeSelect().getHost(); - saveDataSource(EtlCommons.GENE_DATA, MANE_SELECT_NAME, configuration.getDownload().getManeSelect().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(MANE_SELECT_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", MANE_SELECT_NAME); + return 
downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, MANE_SELECT_NAME, GENE_DATA, + MANE_SELECT_VERSION_FILENAME, geneFolder); } return null; } From 6ee2f78f82322a0d9e9f3eec6945a2c7b8d8d74e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 13:01:40 +0200 Subject: [PATCH 057/107] lib: update LRG, HGNC, Cancer HotSpot, DGIDB, Gene Uniprot Xref, Gene Expression Atlas, Gene Disease Annotation, gnomAD Constraints and GO Annotation downloaders, #TASK-5775, #TASK-5564 --- .../src/main/resources/configuration.yml | 64 ++++--- .../org/opencb/cellbase/lib/EtlCommons.java | 72 ++++++-- .../lib/download/GeneDownloadManager.java | 161 ++++++------------ 3 files changed, 150 insertions(+), 147 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 7ebb8d5f6e..70acaf8776 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -66,9 +66,6 @@ download: libs: "${CELLBASE.ENSEMBL.LIBS}" url: host: ftp://ftp.ensemblgenomes.org/pub - hgnc: - host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt - version: "2023-11-01" refSeq: host: https://ftp.ncbi.nih.gov/refseq/ version: "October 16, 2023 (GRCh38.p14)" @@ -83,16 +80,52 @@ download: files: MANE_SELECT: MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz lrg: - host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + host: http://ftp.ebi.ac.uk/ version: "2021-03-30" + files: + LRG: pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + hgnc: + host: https://ftp.ebi.ac.uk/ + version: "2023-11-01" + files: + HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt + cancerHotspot: + host: https://www.cancerhotspots.org/ + version: "v2" + files: + CANCER_HOTSPOT: 
files/hotspots_v2.xls + dgidb: + host: https://old.dgidb.org/ + version: "2022-02-01" + files: + DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv geneUniprotXref: - host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ + host: http://ftp.uniprot.org/ version: "2024_01 (24-Jan-2024)" + files: + UNIPROT_XREF: pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz geneExpressionAtlas: - host: https://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + host: https://ftp.ebi.ac.uk/ version: "2.0.14" + files: + GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + hpo: + ## NOTE: Download manually from here now + host: https://hpo.jax.org/app/data/annotations + disgenet: + host: https://www.disgenet.org/ + version: "7.0 (January 2020)" + files: + DISGENET: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz + gnomadConstraints: + host: https://storage.googleapis.com/ + version: "2.1.1" + files: + GNOMAD_CONSTRAINTS: gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz goAnnotation: - host: http://geneontology.org/gene-associations/goa_human.gaf.gz + host: http://geneontology.org/ + files: + GO_ANNOTATION: gene-associations/goa_human.gaf.gz ## Regulation mirbase: @@ -173,9 +206,6 @@ download: GWAS: pub/databases/gwas/releases/2024/02/12/gwas-catalog-associations_ontology-annotated.tsv DBSNP: All.vcf.gz - cancerHotspot: - host: https://www.cancerhotspots.org/files/hotspots_v2.xls - version: "v2" dgv: host: http://dgv.tcag.ca/v106/docs simpleRepeats: @@ -206,22 +236,8 @@ download: files: CADD: download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz - hpo: - ## NOTE: Download manually from here now - host: https://hpo.jax.org/app/data/annotations - disgenet: - host: https://www.disgenet.org/ - version: "7.0 
(January 2020)" - files: - ALL_GENE_DISEASE_ASSOCIATIONS: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz - dgidb: - host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv - version: "2022-02-01" reactome: host: http://www.reactome.org/download/current/biopax.zip - gnomadConstraints: - host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz - version: "2.1.1" ## OBO Ontologies hpoObo: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index cd3fef74ff..b7100a18a6 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -62,30 +62,78 @@ public class EtlCommons { public static final String GENE_DATA = "gene"; public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; + // RefSeq + public static final String REFSEQ_NAME = "RefSeq"; + public static final String REFSEQ_VERSION_FILENAME = "refSeq" + SUFFIX_VERSION_FILENAME; +// public static final String REFSEQ_ASTA_VERSION_FILENAME = REFSEQ_DATA + "Fasta" + SUFFIX_VERSION_FILENAME; +// public static final String REFSEQ_PROTEIN_FASTA_VERSION_FILENAME = REFSEQ_DATA + "ProteinFasta" + SUFFIX_VERSION_FILENAME; +// public static final String REFSEQ_CDNA_FASTA_VERSION_FILENAME = REFSEQ_DATA + "CdnaFasta" + SUFFIX_VERSION_FILENAME; + // MANE Select public static final String MANE_SELECT_NAME = "MANE Select"; public static final String MANE_SELECT_VERSION_FILENAME = "maneSelect" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String MANE_SELECT_FILE_ID = "MANE_SELECT"; - + // LRG + public static final String LRG_NAME = "LRG"; public static final String LRG_VERSION_FILENAME = "lrg" + SUFFIX_VERSION_FILENAME; + // Must match the configuration 
file + public static final String LRG_FILE_ID = "LRG"; + + // HGNC + public static final String HGNC_NAME = "HGNC Gene"; public static final String HGNC_VERSION_FILENAME = "hgnc" + SUFFIX_VERSION_FILENAME; - public static final String CANCER_HOTSPOT_VERSION_FILENAME = "cancerHotspot" + SUFFIX_VERSION_FILENAME; - public static final String GO_ANNOTATION_VERSION_FILENAME = "goAnnotation" + SUFFIX_VERSION_FILENAME; - public static final String GNOMAD_VERSION_FILENAME = "gnomad" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String HGNC_FILE_ID = "HGNC"; + + // Cancer HotSpot + public static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; + public static final String CANCER_HOTSPOT_VERSION_FILENAME = "cancerHotSpot" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String CANCER_HOTSPOT_FILE_ID = "CANCER_HOTSPOT"; + + // DGID (drug) + public static final String DGIDB_NAME = "DGIdb"; public static final String DGIDB_VERSION_FILENAME = "dgidb" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String DGIDB_FILE_ID = "DGIDB"; + + // UniProt Xref + public static final String UNIPROT_XREF_NAME = "UniProt Xref"; public static final String UNIPROT_XREF_VERSION_FILENAME = "uniprotXref" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String UNIPROT_XREF_FILE_ID = "UNIPROT_XREF"; + + // Gene Expression Atlas + public static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; public static final String GENE_EXPRESSION_ATLAS_VERSION_FILENAME = "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String GENE_EXPRESSION_ATLAS_FILE_ID = "GENE_EXPRESSION_ATLAS"; + + // Gene Disease Annotation + public static final String GENE_DISEASE_ANNOTATION_NAME = "Gene Disease Annotation"; + // HPO + public static final String HPO_NAME = "HPO"; public static final 
String HPO_VERSION_FILENAME = "hpo" + SUFFIX_VERSION_FILENAME; - public static final String DISGINET_VERSION_FILENAME = "disgenet" + SUFFIX_VERSION_FILENAME; - - public static final String REFSEQ_DATA = "refseq"; - public static final String REFSEQ_VERSION_FILENAME = REFSEQ_DATA + SUFFIX_VERSION_FILENAME; - public static final String REFSEQ_ASTA_VERSION_FILENAME = REFSEQ_DATA + "Fasta" + SUFFIX_VERSION_FILENAME; - public static final String REFSEQ_PROTEIN_FASTA_VERSION_FILENAME = REFSEQ_DATA + "ProteinFasta" + SUFFIX_VERSION_FILENAME; - public static final String REFSEQ_CDNA_FASTA_VERSION_FILENAME = REFSEQ_DATA + "CdnaFasta" + SUFFIX_VERSION_FILENAME; - public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association"; + // DISGENET + public static final String DISGENET_NAME = "DisGeNet"; + public static final String DISGENET_VERSION_FILENAME = "disGeNet" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String DISGENET_FILE_ID = "DISGENET"; + + // gnomAD Constraints + public static final String GNOMAD_CONSTRAINTS_NAME = "gnomAD Constraints"; + public static final String GNOMAD_CONSTRAINTS_VERSION_FILENAME = "gnomadConstraints" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String GNOMAD_CONSTRAINTS_FILE_ID = "GNOMAD_CONSTRAINTS"; + + // GO Annotation + public static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; + public static final String GO_ANNOTATION_VERSION_FILENAME = "goAnnotation" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION"; + public static final String VARIATION_DATA = "variation"; public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 9e27ae22f1..679b9aaa95 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -32,20 +32,6 @@ public class GeneDownloadManager extends AbstractDownloadManager { -// private static final String ENSEMBL_NAME = "ENSEMBL"; -// private static final String REFSEQ_NAME = "RefSeq"; -// private static final String UNIPROT_NAME = "UniProt"; -// private static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; -// private static final String HPO_NAME = "HPO"; -// private static final String DISGENET_NAME = "DisGeNET"; -// private static final String MANE_SELECT_NAME = "MANE Select"; -// private static final String LRG_NAME = "LRG"; -// private static final String HGNC_GENE_NAME = "HGNC Gene"; -// private static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; -// private static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; -// private static final String DGIDB_NAME = "DGIdb"; -// private static final String GNOMAD_NAME = "gnomAD"; - private static final Map GENE_UNIPROT_XREF_FILES; static { @@ -64,7 +50,7 @@ public GeneDownloadManager(String species, String assembly, Path targetDirectory } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { logger.info("Downloading gene information ..."); Path geneFolder = downloadFolder.resolve("gene"); Files.createDirectories(geneFolder); @@ -143,6 +129,7 @@ private List downloadRefSeq(Path refSeqFolder) throws IOException, String timeStamp = getTimeStamp(); // gtf + dow DownloadFile downloadFile = downloadRefSeqFile(REFSEQ_NAME, configuration.getDownload().getRefSeq(), timeStamp, REFSEQ_VERSION_FILENAME, refSeqFolder); downloadFiles.add(downloadFile); @@ -179,7 
+166,7 @@ private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLPrope String version = urlProperties.getVersion(); String filename = getFilenameFromUrl(url); Path outputPath = refSeqFolder.resolve(filename); - saveDataSource(EtlCommons.REFSEQ_DATA, name, version, timeStamp, Collections.singletonList(url), + saveDataSource(name, EtlCommons.REFSEQ_NAME, version, timeStamp, Collections.singletonList(url), refSeqFolder.resolve(versionFilename)); logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); @@ -195,133 +182,85 @@ private DownloadFile downloadMane(Path geneFolder) throws IOException, Interrupt return null; } - private DownloadFile downloadLrg(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadLrg(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading LRG data ..."); - String url = configuration.getDownload().getLrg().getHost(); - saveDataSource(EtlCommons.GENE_DATA, LRG_NAME, configuration.getDownload().getLrg().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(LRG_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", LRG_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_NAME, GENE_DATA, LRG_VERSION_FILENAME, + geneFolder); } return null; } - private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading HGNC data ..."); - String url = configuration.getDownload().getHgnc().getHost(); 
- saveDataSource(GENE_DATA, HGNC_GENE_NAME, configuration.getDownload().getHgnc().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(HGNC_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", HGNC_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_NAME, GENE_DATA, + HGNC_VERSION_FILENAME, geneFolder); } return null; } - private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading Cancer Hotspot ..."); - String url = configuration.getDownload().getCancerHotspot().getHost(); - saveDataSource(EtlCommons.GENE_DATA, CANCER_HOTSPOT_NAME, configuration.getDownload().getHgnc().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(CANCER_HOTSPOT_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", CANCER_HOTSPOT_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, CANCER_HOTSPOT_NAME, + GENE_DATA, CANCER_HOTSPOT_VERSION_FILENAME, geneFolder); } return null; } - private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException { + private DownloadFile downloadDrugData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading GO annotation..."); - String 
url = configuration.getDownload().getGoAnnotation().getHost(); - saveDataSource(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, configuration.getDownload().getGoAnnotation().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GO_ANNOTATION_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + logger.info("Downloading {} ...", DGIDB_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_NAME, GENE_DATA, + DGIDB_VERSION_FILENAME, geneFolder); } return null; } - private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading gnomAD constraints data..."); - String url = configuration.getDownload().getGnomadConstraints().getHost(); - saveDataSource(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload().getGnomadConstraints().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(GNOMAD_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { + logger.info("Downloading {} ...", UNIPROT_XREF_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), UNIPROT_XREF_FILE_ID, UNIPROT_XREF_NAME, + GENE_DATA, UNIPROT_XREF_VERSION_FILENAME, geneFolder); } return null; } - private DownloadFile downloadDrugData(Path geneFolder) throws IOException, InterruptedException { - if 
(speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading drug-gene data..."); - String url = configuration.getDownload().getDgidb().getHost(); - saveDataSource(EtlCommons.GENE_DATA, DGIDB_NAME, configuration.getDownload().getDgidb().getVersion(), getTimeStamp(), - Collections.singletonList(url), geneFolder.resolve(DGIDB_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); - } - return null; + private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + logger.info("Downloading {} ...", GENE_EXPRESSION_ATLAS_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), GENE_EXPRESSION_ATLAS_FILE_ID, + GENE_EXPRESSION_ATLAS_NAME, GENE_DATA, GENE_EXPRESSION_ATLAS_VERSION_FILENAME, geneFolder); } - private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException { - if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { - logger.info("Downloading UniProt ID mapping ..."); + private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + logger.info("Downloading {} ...", GENE_DISEASE_ANNOTATION_NAME); - String filename = GENE_UNIPROT_XREF_FILES.get(speciesConfiguration.getScientificName()); - String geneGtfUrl = configuration.getDownload().getGeneUniprotXref().getHost() + "/" + filename; + // IMPORTANT !!! 
+ logger.warn("{} must be downloaded manually from {} and then create the file {} with data ({}), name ({}) and the version", + HPO_NAME, configuration.getDownload().getHpo().getHost(), HPO_VERSION_FILENAME, GENE_DATA, HPO_NAME); + saveDataSource(HPO_NAME, GENE_DISEASE_ANNOTATION_NAME, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), + Collections.singletonList(configuration.getDownload().getHpo().getHost()), geneFolder.resolve(HPO_VERSION_FILENAME)); - saveDataSource(EtlCommons.GENE_DATA, UNIPROT_NAME, - configuration.getDownload().getGeneUniprotXref().getVersion(), getTimeStamp(), - Collections.singletonList(geneGtfUrl), geneFolder.resolve(UNIPROT_XREF_VERSION_FILENAME)); + return downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_NAME, + GENE_DISEASE_ANNOTATION_NAME, DISGENET_VERSION_FILENAME, geneFolder); + } - Path outputPath = geneFolder.resolve(filename); - logger.info(DOWNLOADING_LOG_MESSAGE, geneGtfUrl, outputPath); - return downloadFile(geneGtfUrl, outputPath.toString()); + private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info("Downloading {} ...", GNOMAD_CONSTRAINTS_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), GNOMAD_CONSTRAINTS_FILE_ID, + GNOMAD_CONSTRAINTS_NAME, GENE_DATA, GNOMAD_CONSTRAINTS_VERSION_FILENAME, geneFolder); } - return null; } - private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene expression atlas ..."); - String geneGtfUrl = configuration.getDownload().getGeneExpressionAtlas().getHost(); - saveDataSource(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, configuration.getDownload().getGeneExpressionAtlas().getVersion(), - getTimeStamp(), Collections.singletonList(geneGtfUrl), 
geneFolder.resolve(GENE_EXPRESSION_ATLAS_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(geneGtfUrl)); - logger.info(DOWNLOADING_LOG_MESSAGE, geneGtfUrl, outputPath); - return downloadFile(geneGtfUrl, outputPath.toString()); - } - - private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene disease annotation ..."); - - // IMPORTANT !!! - logger.warn("HPO must be downloaded manually from {} and then create the file {} with data ({}), name ({}) and the version", - configuration.getDownload().getHpo().getHost(), HPO_VERSION_FILENAME, GENE_DATA, HPO_NAME); - - String url = configuration.getDownload().getDisgenet().getHost(); - saveDataSource(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME, configuration.getDownload().getDisgenet().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve(DISGINET_VERSION_FILENAME)); - - Path outputPath = geneFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); + private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info("Downloading {} ...", GO_ANNOTATION_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), GO_ANNOTATION_FILE_ID, GO_ANNOTATION_NAME, + GENE_DATA, GO_ANNOTATION_VERSION_FILENAME, geneFolder); + } + return null; } } From d794ceb9add20c22c9698beddd920c3ef8f8468c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 14:28:14 +0200 Subject: [PATCH 058/107] lib: update RefSeq downloader, #TASK-5775, #TASK-5564 --- .../core/config/DownloadProperties.java | 80 ------------------- .../org/opencb/cellbase/lib/EtlCommons.java | 9 ++- 
.../lib/download/GeneDownloadManager.java | 61 ++++---------- .../lib/download/GenomeDownloadManager.java | 50 ++++++------ 4 files changed, 46 insertions(+), 154 deletions(-) diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index 6d03f28148..bb44f91138 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -28,12 +28,8 @@ public class DownloadProperties { private URLProperties hgnc; private URLProperties cancerHotspot; private URLProperties refSeq; - private URLProperties refSeqFasta; - private URLProperties refSeqProteinFasta; - private URLProperties refSeqCdna; private URLProperties maneSelect; private URLProperties lrg; - private URLProperties geneUniprotXref; private URLProperties geneExpressionAtlas; private URLProperties mirbase; @@ -44,8 +40,6 @@ public class DownloadProperties { private URLProperties intact; private URLProperties interpro; private URLProperties interproRelNotes; - @Deprecated - private URLProperties conservation; private URLProperties phastCons; private URLProperties phylop; private URLProperties gerp; @@ -56,12 +50,6 @@ public class DownloadProperties { private URLProperties clinvarEfoTerms; private URLProperties cosmic; private URLProperties hgmd; - @Deprecated - private URLProperties iarctp53; - @Deprecated - private URLProperties docm; - @Deprecated - private URLProperties docmVersion; private URLProperties dgv; private URLProperties simpleRepeats; private URLProperties windowMasker; @@ -201,17 +189,6 @@ public DownloadProperties setInterproRelNotes(URLProperties interproRelNotes) { return this; } - @Deprecated - public URLProperties getConservation() { - return conservation; - } - - @Deprecated - public DownloadProperties setConservation(URLProperties conservation) { - 
this.conservation = conservation; - return this; - } - public URLProperties getPhastCons() { return phastCons; } @@ -301,36 +278,6 @@ public DownloadProperties setHgmd(URLProperties hgmd) { return this; } - @Deprecated - public URLProperties getIarctp53() { - return iarctp53; - } - - @Deprecated - public void setIarctp53(URLProperties iarctp53) { - this.iarctp53 = iarctp53; - } - - @Deprecated - public URLProperties getDocm() { - return docm; - } - - @Deprecated - public void setDocm(URLProperties docm) { - this.docm = docm; - } - - @Deprecated - public URLProperties getDocmVersion() { - return docmVersion; - } - - @Deprecated - public void setDocmVersion(URLProperties docmVersion) { - this.docmVersion = docmVersion; - } - public URLProperties getDgv() { return dgv; } @@ -489,19 +436,6 @@ public DownloadProperties setRefSeq(URLProperties refSeq) { return this; } - public URLProperties getRefSeqFasta() { - return refSeqFasta; - } - - public DownloadProperties setRefSeqFasta(URLProperties refSeqFasta) { - this.refSeqFasta = refSeqFasta; - return this; - } - - public URLProperties getRefSeqProteinFasta() { - return refSeqProteinFasta; - } - public URLProperties getRevel() { return revel; } @@ -529,20 +463,6 @@ public DownloadProperties setPharmGKB(URLProperties pharmGKB) { return this; } - public DownloadProperties setRefSeqProteinFasta(URLProperties refSeqProteinFasta) { - this.refSeqProteinFasta = refSeqProteinFasta; - return this; - } - - public URLProperties getRefSeqCdna() { - return refSeqCdna; - } - - public DownloadProperties setRefSeqCdna(URLProperties refSeqCdna) { - this.refSeqCdna = refSeqCdna; - return this; - } - public URLProperties getLrg() { return lrg; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index b7100a18a6..e2f613c500 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -64,10 +64,13 @@ public class EtlCommons { // RefSeq public static final String REFSEQ_NAME = "RefSeq"; + public static final String REFSEQ_DATA = "refseq"; public static final String REFSEQ_VERSION_FILENAME = "refSeq" + SUFFIX_VERSION_FILENAME; -// public static final String REFSEQ_ASTA_VERSION_FILENAME = REFSEQ_DATA + "Fasta" + SUFFIX_VERSION_FILENAME; -// public static final String REFSEQ_PROTEIN_FASTA_VERSION_FILENAME = REFSEQ_DATA + "ProteinFasta" + SUFFIX_VERSION_FILENAME; -// public static final String REFSEQ_CDNA_FASTA_VERSION_FILENAME = REFSEQ_DATA + "CdnaFasta" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; + public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; + public static final String REFSEQ_PROTEIN_FAA_FILE_ID = "PROTEIN_FAA"; + public static final String REFSEQ_RNA_FNA_FILE_ID = "RNA_FNA"; // MANE Select public static final String MANE_SELECT_NAME = "MANE Select"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 679b9aaa95..1f02a574ef 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -16,17 +16,15 @@ package org.opencb.cellbase.lib.download; -import org.apache.commons.lang.StringUtils; import org.opencb.cellbase.core.config.CellBaseConfiguration; -import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.*; +import java.util.stream.Collectors; import static 
org.opencb.cellbase.lib.EtlCommons.*; @@ -119,60 +117,31 @@ private List downloadEnsemblData(Path geneFolder) throws IOExcepti return downloadFiles; } - private List downloadRefSeq(Path refSeqFolder) throws IOException, InterruptedException { + private List downloadRefSeq(Path refSeqFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - - logger.info("Downloading RefSeq data ..."); + logger.info("Downloading {} data ...", REFSEQ_NAME); List downloadFiles = new ArrayList<>(); - String timeStamp = getTimeStamp(); - - // gtf - dow - DownloadFile downloadFile = downloadRefSeqFile(REFSEQ_NAME, configuration.getDownload().getRefSeq(), timeStamp, - REFSEQ_VERSION_FILENAME, refSeqFolder); - downloadFiles.add(downloadFile); - - // genomic fasta - downloadFile = downloadRefSeqFile(REFSEQ_NAME + " Fasta", configuration.getDownload().getRefSeqFasta(), timeStamp, - REFSEQ_ASTA_VERSION_FILENAME, refSeqFolder); - downloadFiles.add(downloadFile); - if (StringUtils.isNotEmpty(downloadFile.getOutputFile()) && Paths.get(downloadFile.getOutputFile()).toFile().exists()) { - logger.info("Unzipping file: {}", downloadFile.getOutputFile()); - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(downloadFile.getOutputFile()), null); - } else { - logger.warn("Coud not find the file {} to unzip", downloadFile.getOutputFile()); - } - - // protein fasta - downloadFile = downloadRefSeqFile(REFSEQ_NAME + " Protein Fasta", configuration.getDownload().getRefSeqProteinFasta(), - timeStamp, REFSEQ_PROTEIN_FASTA_VERSION_FILENAME, refSeqFolder); - downloadFiles.add(downloadFile); - + // GTF + downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_GENOMIC_GTF_FILE_ID, refSeqFolder)); + // Genomic FASTA + downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_GENOMIC_FNA_FILE_ID, refSeqFolder)); + // Protein FASTA + 
downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_PROTEIN_FAA_FILE_ID, refSeqFolder)); // cDNA - downloadFile = downloadRefSeqFile(REFSEQ_NAME + " cDNA", configuration.getDownload().getRefSeqCdna(), timeStamp, - REFSEQ_CDNA_FASTA_VERSION_FILENAME, refSeqFolder); - downloadFiles.add(downloadFile); + downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_RNA_FNA_FILE_ID, refSeqFolder)); + + // Save data source (i.e., metadata) + saveDataSource(REFSEQ_NAME, GENE_DATA, configuration.getDownload().getRefSeq().getVersion(), getTimeStamp(), + downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), + refSeqFolder.resolve(REFSEQ_VERSION_FILENAME)); return downloadFiles; } return Collections.emptyList(); } - private DownloadFile downloadRefSeqFile(String name, DownloadProperties.URLProperties urlProperties, String timeStamp, - String versionFilename, Path refSeqFolder) throws IOException, InterruptedException { - String url = urlProperties.getHost(); - String version = urlProperties.getVersion(); - String filename = getFilenameFromUrl(url); - Path outputPath = refSeqFolder.resolve(filename); - saveDataSource(name, EtlCommons.REFSEQ_NAME, version, timeStamp, Collections.singletonList(url), - refSeqFolder.resolve(versionFilename)); - - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); - return downloadFile(url, outputPath.toString()); - } - private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { logger.info("Downloading {} ...", MANE_SELECT_NAME); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index df4aa069bf..a6c17809b2 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -153,31 +153,31 @@ public List downloadConservation() throws IOException, Interrupted } } - if (speciesConfiguration.getScientificName().equals("Mus musculus")) { - Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); - Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); - - String url = configuration.getDownload().getConservation().getHost() + "/mm10"; - String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", - "15", "16", "17", "18", "19", "X", "Y", "M", }; - List phastconsUrls = new ArrayList<>(chromosomes.length); - List phyloPUrls = new ArrayList<>(chromosomes.length); - for (String chromosome : chromosomes) { - String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; - downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome - + ".phastCons60way.wigFix.gz").toString())); - phastconsUrls.add(phastConsUrl); - String phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; - downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome - + ".phyloP60way.wigFix.gz").toString())); - phyloPUrls.add(phyloPUrl); - } - saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), - getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); - saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), - getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); - } +// if 
(speciesConfiguration.getScientificName().equals("Mus musculus")) { +// Files.createDirectories(conservationFolder); +// Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); +// Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); +// +// String url = configuration.getDownload().getConservation().getHost() + "/mm10"; +// String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", +// "15", "16", "17", "18", "19", "X", "Y", "M", }; +// List phastconsUrls = new ArrayList<>(chromosomes.length); +// List phyloPUrls = new ArrayList<>(chromosomes.length); +// for (String chromosome : chromosomes) { +// String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; +// downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome +// + ".phastCons60way.wigFix.gz").toString())); +// phastconsUrls.add(phastConsUrl); +// String phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; +// downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome +// + ".phyloP60way.wigFix.gz").toString())); +// phyloPUrls.add(phyloPUrl); +// } +// saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), +// getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); +// saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), +// getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); +// } return downloadFiles; } From 1b751de5f80031e42096ba03c86bd383bec00647 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 16:10:07 +0200 Subject: [PATCH 059/107] lib: update missense 
scores (REVEL) downloader, #TASK-5775, #TASK-5564 --- .../src/main/java/org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/download/MissenseScoresDownloadManager.java | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index e2f613c500..34ef38baac 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -151,6 +151,7 @@ public class EtlCommons { public static final String PHARMGKB_VERSION_FILENAME = "pharmgkb" + SUFFIX_VERSION_FILENAME; // Missense variantion functional score + public static final String MISSENSE_VARIATION_SCORE_NAME = "Missense Variation Functional Scores"; public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; // Revel public static final String REVEL_NAME = "Revel"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index 50cf9ee0c0..ca491a97fe 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -44,10 +44,10 @@ public DownloadFile downloadRevel() throws IOException, InterruptedException, Ce if (speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); Files.createDirectories(missensePredictionScorePath); - logger.info("Downloading Revel data at {} ...", missensePredictionScorePath); - return downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_NAME, - MISSENSE_VARIATION_SCORE_DATA, REVEL_FILE_ID, 
REVEL_VERSION_FILENAME, missensePredictionScorePath); + logger.info("Downloading {}/{} ...", MISSENSE_VARIATION_SCORE_NAME, REVEL_NAME); + return downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_FILE_ID, REVEL_NAME, + MISSENSE_VARIATION_SCORE_DATA, REVEL_VERSION_FILENAME, missensePredictionScorePath); } return null; } From b63533324c221b05e1e5615b39b90e6c47de0898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 16:15:58 +0200 Subject: [PATCH 060/107] lib: update CADD and clinical variant downloaders, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/download/CaddDownloadManager.java | 4 ++-- .../opencb/cellbase/lib/download/ClinicalDownloadManager.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index af3ff65baf..738c66f3f1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -46,8 +46,8 @@ public List download() throws IOException, InterruptedException, C // Download CADD and save data source - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_NAME, - VARIATION_FUNCTIONAL_SCORE_DATA, CADD_FILE_ID, CADD_VERSION_FILENAME, variationFunctionalScoreFolder); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_NAME, + VARIATION_FUNCTIONAL_SCORE_DATA, CADD_VERSION_FILENAME, variationFunctionalScoreFolder); return Collections.singletonList(downloadFile); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java 
index 1e66f1b5f0..37561b111f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -80,8 +80,8 @@ public List downloadClinical() throws IOException, InterruptedExce clinicalFolder.resolve(HGMD_VERSION_FILENAME)); // GWAS catalog - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_NAME, CLINICAL_VARIANTS_DATA, - GWAS_FILE_ID, GWAS_VERSION_FILENAME, clinicalFolder); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_NAME, + CLINICAL_VARIANTS_DATA, GWAS_VERSION_FILENAME, clinicalFolder); downloadFiles.add(downloadFile); // ClinVar From 106b96d1ff01003187fc9b43e2d80c78556d97e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 18 Apr 2024 16:21:08 +0200 Subject: [PATCH 061/107] lib: update protein downloaders, #TASK-5775, #TASK-5564 --- .../src/main/java/org/opencb/cellbase/lib/EtlCommons.java | 1 + .../cellbase/lib/download/ProteinDownloadManager.java | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 34ef38baac..b31cf14d86 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -286,6 +286,7 @@ public class EtlCommons { public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; // Protein + public static final String PROTEIN_NAME = "Protein"; public static final String PROTEIN_DATA = "protein"; public static final String PROTEIN_SUBDIRECTORY = "protein"; // UniProt diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 519ea828d1..50255a3557 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -51,13 +51,13 @@ public List download() throws IOException, InterruptedException, C } Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); Files.createDirectories(proteinFolder); - logger.info("Downloading protein information at {} ..."); + logger.info("Downloading {} information at {} ...", PROTEIN_NAME, proteinFolder); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Uniprot - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_NAME, PROTEIN_DATA, UNIPROT_FILE_ID, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_NAME, PROTEIN_DATA, UNIPROT_VERSION_FILENAME, proteinFolder); Path chunksPath = proteinFolder.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); String uniprotFilename = getFilenameFromUrl(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); @@ -67,12 +67,12 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadFile); // Interpro - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_NAME, PROTEIN_DATA, INTERPRO_FILE_ID, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_NAME, PROTEIN_DATA, INTERPRO_VERSION_FILENAME, proteinFolder); downloadFiles.add(downloadFile); // Intact - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_NAME, PROTEIN_DATA, INTACT_FILE_ID, + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_NAME, PROTEIN_DATA, INTACT_VERSION_FILENAME, 
proteinFolder); downloadFiles.add(downloadFile); From 55afe6b8383913f0e6d25d55b08f1bcb5765c049 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 19 Apr 2024 11:22:56 +0200 Subject: [PATCH 062/107] lib: update gene downloader (specially for ensembl data), and improve log messages, #TASK-5564 --- .../src/main/resources/configuration.yml | 9 +- .../org/opencb/cellbase/lib/EtlCommons.java | 33 +++-- .../lib/download/AbstractDownloadManager.java | 8 +- .../lib/download/ClinicalDownloadManager.java | 2 +- .../lib/download/GeneDownloadManager.java | 116 +++++++++--------- .../lib/download/GenomeDownloadManager.java | 6 +- .../download/RegulationDownloadManager.java | 7 +- 7 files changed, 101 insertions(+), 80 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 70acaf8776..ffb16393d0 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -55,9 +55,12 @@ download: url: host: https://ftp.ensembl.org/pub/ files: - REGULATORY_BUILD: "regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" - MOTIF_FEATURES: "regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" - MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" + GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" + PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" + CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" + REGULATORY_BUILD: 
"regulation/put_species_here/put_species_here.put_capital_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "regulation/put_species_here/MotifFeatures/put_capital_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/put_capital_species_here.put_assembly_here.motif_features.gff.gz.tbi" ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index b31cf14d86..3fa1c6f4ec 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -38,14 +38,19 @@ public class EtlCommons { // Ensembl - public static final String ENSEMBL_NAME = "ENSEMBL"; + public static final String ENSEMBL_NAME = "Ensembl"; + public static final String PUT_RELEASE_HERE_MARK = "put_release_here"; public static final String PUT_SPECIES_HERE_MARK = "put_species_here"; + public static final String PUT_CAPITAL_SPECIES_HERE_MARK = "put_capital_species_here"; public static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; public static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; // Must match the configuration file - public static final String REGULATORY_BUILD_FILE_ID = "REGULATORY_BUILD"; - public static final String MOTIF_FEATURES_FILE_ID = "MOTIF_FEATURES"; - public static final String MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; + public static final String ENSEMBL_GTF_FILE_ID = "GTF"; + public static final String ENSEMBL_PEP_FA_FILE_ID = "PEP_FA"; + public static final String ENSEMBL_CDNA_FA_FILE_ID = "CDNA_FA"; + public static final String ENSEMBL_REGULATORY_BUILD_FILE_ID = "REGULATORY_BUILD"; + public static final String ENSEMBL_MOTIF_FEATURES_FILE_ID = "MOTIF_FEATURES"; + public static final String 
ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; @@ -149,6 +154,16 @@ public class EtlCommons { public static final String PHARMGKB_DATA = "pharmgkb"; public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; public static final String PHARMGKB_VERSION_FILENAME = "pharmgkb" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String PHARMGKB_GENES_FILE_ID = "GENES"; + public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS"; + public static final String PHARMGKB_VARIANTS_FILE_ID = "VARIANTS"; + public static final String PHARMGKB_GUIDELINE_ANNOTATIONS_FILE_ID = "GUIDELINE_ANNOTATIONS"; + public static final String PHARMGKB_VARIANT_ANNOTATIONS_FILE_ID = "VARIANT_ANNOTATIONS"; + public static final String PHARMGKB_CLINICAL_ANNOTATIONS_FILE_ID = "CLINICAL_ANNOTATIONS"; + public static final String PHARMGKB_CLINICAL_VARIANTS_FILE_ID = "CLINICAL_VARIANTS"; + public static final String PHARMGKB_DRUG_LABELS_FILE_ID = "DRUG_LABELS"; + public static final String PHARMGKB_RELATIONSHIPS_FILE_ID = "RELATIONSHIPS"; // Missense variantion functional score public static final String MISSENSE_VARIATION_SCORE_NAME = "Missense Variation Functional Scores"; @@ -438,11 +453,15 @@ public static String getEnsemblUrl(DownloadProperties.EnsemblProperties props, S throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.EnsemblProperties within the CellBase" + " configuration file"); } - String filesValue = props.getUrl().getFiles().get(fileId); - String url = props.getUrl().getHost() + ensemblRelease + "/" + filesValue; - // Change species, assembly, chromosome if necessary + String url = props.getUrl().getHost() + props.getUrl().getFiles().get(fileId); + + // Change release, species, assembly, chromosome if necessary + if (StringUtils.isNotEmpty(ensemblRelease)) { + url = url.replaceAll(PUT_RELEASE_HERE_MARK, 
ensemblRelease.split("-")[1]); + } if (StringUtils.isNotEmpty(species)) { url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); + url = url.replaceAll(PUT_CAPITAL_SPECIES_HERE_MARK, Character.toUpperCase(species.charAt(0)) + species.substring(1)); } if (StringUtils.isNotEmpty(assembly)) { url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 74ecbe4d4a..f3f01e7c30 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -49,7 +49,9 @@ public abstract class AbstractDownloadManager { - protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} to {} ..."; + protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; + protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; + protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {} done!"; protected String species; protected String assembly; @@ -179,7 +181,7 @@ protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props throws IOException, InterruptedException, CellBaseException { String url = EtlCommons.getUrl(props, fileId, species, assembly, chromosome); File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); return downloadFile(url, outFile.toString()); } @@ -193,7 +195,7 @@ protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblPrope String url = EtlCommons.getEnsemblUrl(ensemblProps, ensemblRelease, fileId, speciesShortName, assemblyConfiguration.getName(), chromosome); File outFile = 
outPath.resolve(getFilenameFromUrl(url)).toFile(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outFile); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); return downloadFile(url, outFile.toString()); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index 37561b111f..77f658626a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -92,7 +92,7 @@ public List downloadClinical() throws IOException, InterruptedExce CLINVAR_EFO_TERMS_FILE_ID)) { url = props.getHost() + props.getFiles().get(fileId); outPath = clinicalFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outPath); downloadFiles.add(downloadFile(url, outPath.toString())); urls.add(url); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 1f02a574ef..7ea434c24c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -74,52 +74,30 @@ public List download() throws IOException, InterruptedException, C return downloadFiles; } - private List downloadEnsemblData(Path geneFolder) throws IOException, InterruptedException { - logger.info("Downloading gene Ensembl data (gtf, pep, cdna, motifs) ..."); - List downloadedUrls = new ArrayList<>(4); + private List downloadEnsemblData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, ENSEMBL_NAME); + List downloadFiles = new ArrayList<>(); - 
String ensemblHost = ensemblHostUrl + "/" + ensemblRelease; - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - ensemblHost = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } + // GTF + downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_GTF_FILE_ID, geneFolder)); + // PEP + downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_PEP_FA_FILE_ID, geneFolder)); + // CDNA + downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_CDNA_FA_FILE_ID, geneFolder)); - String ensemblCollection = ""; - if (configuration.getSpecies().getBacteria().contains(speciesConfiguration)) { - // WARN: assuming there's just one assembly - ensemblCollection = speciesConfiguration.getAssemblies().get(0).getEnsemblCollection() + "/"; - } - - // Ensembl leaves now several GTF files in the FTP folder, we need to build a more accurate URL - // to download the correct GTF file. 
- String version = ensemblRelease.split("-")[1]; - String url = ensemblHost + "/gtf/" + ensemblCollection + speciesShortName + "/*" + version + ".gtf.gz"; - String fileName = geneFolder.resolve(speciesShortName + ".gtf.gz").toString(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, fileName); - downloadFiles.add(downloadFile(url, fileName)); - downloadedUrls.add(url); - - url = ensemblHost + "/fasta/" + ensemblCollection + speciesShortName + "/pep/*.pep.all.fa.gz"; - fileName = geneFolder.resolve(speciesShortName + ".pep.all.fa.gz").toString(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, fileName); - downloadFiles.add(downloadFile(url, fileName)); - downloadedUrls.add(url); - - url = ensemblHost + "/fasta/" + ensemblCollection + speciesShortName + "/cdna/*.cdna.all.fa.gz"; - fileName = geneFolder.resolve(speciesShortName + ".cdna.all.fa.gz").toString(); - logger.info(DOWNLOADING_LOG_MESSAGE, url, fileName); - downloadFiles.add(downloadFile(url, fileName)); - downloadedUrls.add(url); - - saveDataSource(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), downloadedUrls, + // Save data source (i.e., metadata) + saveDataSource(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), + downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), geneFolder.resolve(ENSEMBL_CORE_VERSION_FILENAME)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, ENSEMBL_NAME); return downloadFiles; } private List downloadRefSeq(Path refSeqFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} data ...", REFSEQ_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, REFSEQ_NAME); List downloadFiles = new ArrayList<>(); @@ -137,6 +115,7 @@ private List downloadRefSeq(Path refSeqFolder) throws IOException, downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), refSeqFolder.resolve(REFSEQ_VERSION_FILENAME)); + 
logger.info(DOWNLOADING_DONE_LOG_MESSAGE, REFSEQ_NAME); return downloadFiles; } return Collections.emptyList(); @@ -144,66 +123,80 @@ private List downloadRefSeq(Path refSeqFolder) throws IOException, private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", MANE_SELECT_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, MANE_SELECT_NAME, GENE_DATA, - MANE_SELECT_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, MANE_SELECT_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, + MANE_SELECT_NAME, GENE_DATA, MANE_SELECT_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, MANE_SELECT_NAME); + return downloadFile; } return null; } private DownloadFile downloadLrg(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", LRG_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_NAME, GENE_DATA, LRG_VERSION_FILENAME, - geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, LRG_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_NAME, GENE_DATA, + LRG_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, LRG_NAME); + return downloadFile; } return null; } private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", HGNC_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_NAME, 
GENE_DATA, + logger.info(DOWNLOADING_LOG_MESSAGE, HGNC_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_NAME, GENE_DATA, HGNC_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, HGNC_NAME); + return downloadFile; } return null; } private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", CANCER_HOTSPOT_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, CANCER_HOTSPOT_NAME, - GENE_DATA, CANCER_HOTSPOT_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, CANCER_HOTSPOT_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, + CANCER_HOTSPOT_NAME, GENE_DATA, CANCER_HOTSPOT_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, CANCER_HOTSPOT_NAME); + return downloadFile; } return null; } private DownloadFile downloadDrugData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", DGIDB_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_NAME, GENE_DATA, - DGIDB_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, DGIDB_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_NAME, + GENE_DATA, DGIDB_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, DGIDB_NAME); + return downloadFile; } return null; } private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if 
(GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { - logger.info("Downloading {} ...", UNIPROT_XREF_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), UNIPROT_XREF_FILE_ID, UNIPROT_XREF_NAME, - GENE_DATA, UNIPROT_XREF_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, UNIPROT_XREF_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), UNIPROT_XREF_FILE_ID, + UNIPROT_XREF_NAME, GENE_DATA, UNIPROT_XREF_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, UNIPROT_XREF_NAME); + return downloadFile; } return null; } private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading {} ...", GENE_EXPRESSION_ATLAS_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), GENE_EXPRESSION_ATLAS_FILE_ID, - GENE_EXPRESSION_ATLAS_NAME, GENE_DATA, GENE_EXPRESSION_ATLAS_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, GENE_EXPRESSION_ATLAS_NAME); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), + GENE_EXPRESSION_ATLAS_FILE_ID, GENE_EXPRESSION_ATLAS_NAME, GENE_DATA, GENE_EXPRESSION_ATLAS_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENE_EXPRESSION_ATLAS_NAME); + return downloadFile; } private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading {} ...", GENE_DISEASE_ANNOTATION_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, GENE_DISEASE_ANNOTATION_NAME); // IMPORTANT !!! 
logger.warn("{} must be downloaded manually from {} and then create the file {} with data ({}), name ({}) and the version", @@ -211,13 +204,16 @@ private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOExc saveDataSource(HPO_NAME, GENE_DISEASE_ANNOTATION_NAME, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), Collections.singletonList(configuration.getDownload().getHpo().getHost()), geneFolder.resolve(HPO_VERSION_FILENAME)); - return downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_NAME, + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_NAME, GENE_DISEASE_ANNOTATION_NAME, DISGENET_VERSION_FILENAME, geneFolder); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENE_DISEASE_ANNOTATION_NAME); + return downloadFile; } private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", GNOMAD_CONSTRAINTS_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, GNOMAD_CONSTRAINTS_NAME); return downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), GNOMAD_CONSTRAINTS_FILE_ID, GNOMAD_CONSTRAINTS_NAME, GENE_DATA, GNOMAD_CONSTRAINTS_VERSION_FILENAME, geneFolder); } @@ -226,7 +222,7 @@ private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOExcepti private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading {} ...", GO_ANNOTATION_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, GO_ANNOTATION_NAME); return downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), GO_ANNOTATION_FILE_ID, GO_ANNOTATION_NAME, GENE_DATA, GO_ANNOTATION_VERSION_FILENAME, geneFolder); } diff 
--git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index a6c17809b2..1ef4e66ae0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -206,7 +206,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILENAME)); Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); // Download genomic super duplications @@ -216,7 +216,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILENAME)); outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); // Download WindowMasker @@ -227,7 +227,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.WM_VERSION_FILENAME)); outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); } return downloadFiles; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 
26ed4776da..d11e907aa0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -73,17 +73,18 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept List downloadFiles = new ArrayList<>(); // Regulatory build - downloadFile = downloadAndSaveEnsemblDataSource(configuration.getDownload().getEnsembl(), REGULATORY_BUILD_FILE_ID, + downloadFile = downloadAndSaveEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_REGULATORY_BUILD_FILE_ID, REGULATORY_BUILD_NAME, REGULATION_DATA, null, REGULATORY_BUILD_VERSION_FILENAME, regulationFolder); downloadFiles.add(downloadFile); // Motifs features List urls = new ArrayList<>(); - downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), MOTIF_FEATURES_FILE_ID, null, regulationFolder); + downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_MOTIF_FEATURES_FILE_ID, null, + regulationFolder); downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); // And now the index file - downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), MOTIF_FEATURES_INDEX_FILE_ID, null, + downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID, null, regulationFolder); downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); From 88c2b17614fe7af59db9372b00da2d15c6ccfc45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 10:32:24 +0200 Subject: [PATCH 063/107] core: add Ensembl primary fasta URL into the configuration file for the genome downloader, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cellbase-core/src/main/resources/configuration.yml 
b/cellbase-core/src/main/resources/configuration.yml index ffb16393d0..5022340bec 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -55,6 +55,8 @@ download: url: host: https://ftp.ensembl.org/pub/ files: + # New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead + PRIMARY_FA: "release-put_release_here/fasta/put_species_here/dna/put_capital_species_here.put_assembly_here.dna.primary_assembly.fa.gz" GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" From eee13e30826bde6774910adec721f35977a783bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 10:34:44 +0200 Subject: [PATCH 064/107] lib: update genome download manager by declaring and using constants from the class EtlCommons and improve log messages, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 9 +- .../cellbase/lib/download/Downloader.java | 2 +- .../lib/download/GenomeDownloadManager.java | 95 ++++++------------- 3 files changed, 36 insertions(+), 70 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 3fa1c6f4ec..3a98939a23 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -45,6 +45,7 @@ public class EtlCommons { public static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; public static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; // Must match the configuration file
+ public static final String ENSEMBL_PRIMARY_FA_FILE_ID = "PRIMARY_FA"; public static final String ENSEMBL_GTF_FILE_ID = "GTF"; public static final String ENSEMBL_PEP_FA_FILE_ID = "PEP_FA"; public static final String ENSEMBL_CDNA_FA_FILE_ID = "CDNA_FA"; @@ -61,9 +62,13 @@ public class EtlCommons { public static final String SUFFIX_VERSION_FILENAME = "Version.json"; + // Genome (Ensembl) + public static final String GENOME_NAME = "Genome"; public static final String GENOME_DATA = "genome"; + public static final String GENOME_SUBDIRECTORY = "genome"; public static final String GENOME_VERSION_FILENAME = "genome" + SUFFIX_VERSION_FILENAME; + // Gene (Ensembl) public static final String GENE_DATA = "gene"; public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; @@ -201,7 +206,8 @@ public class EtlCommons { // Must match the configuration file public static final String GWAS_FILE_ID = "GWAS"; - public static final String STRUCTURAL_VARIANTS_DATA = "svs"; + // Repeats + public static final String REPEATS_NAME = "Repeats"; public static final String REPEATS_DATA = "repeats"; public static final String REPEATS_SUBDIRECTORY = "genome"; public static final String REPEATS_JSON = "repeats"; @@ -322,6 +328,7 @@ public class EtlCommons { public static final String INTACT_FILE_ID = "INTACT"; // Conservation scores + public static final String CONSERVATION_NAME = "Conservation"; public static final String CONSERVATION_DATA = "conservation"; public static final String CONSERVATION_SUBDIRECTORY = "conservation"; // GERP diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java index 17022cae4b..d72d077e4a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java @@ -40,7 +40,7 @@ public Downloader(String species, String 
assembly, Path outputDirectory, CellBas public List downloadGenome() throws IOException, CellBaseException, InterruptedException { GenomeDownloadManager manager = new GenomeDownloadManager(species, assembly, outputDirectory, configuration); - return manager.download(); + return manager.downloadReferenceGenome(); } public List downloadGene() throws IOException, CellBaseException, InterruptedException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 1ef4e66ae0..f36f493e1f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -17,7 +17,6 @@ package org.opencb.cellbase.lib.download; import com.beust.jcommander.ParameterException; -import org.apache.commons.lang.StringUtils; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; @@ -40,46 +39,27 @@ public GenomeDownloadManager(String species, String assembly, Path targetDirecto } @Override - public List download() throws IOException, InterruptedException { - List downloadFiles = new ArrayList<>(); - downloadFiles.addAll(downloadReferenceGenome()); - downloadFiles.addAll(downloadConservation()); - downloadFiles.addAll(downloadRepeats()); - - return downloadFiles; + public List download() throws IOException, InterruptedException, CellBaseException { + return downloadReferenceGenome(); } - public List downloadReferenceGenome() throws IOException, InterruptedException { - logger.info("Downloading genome information ..."); - Path sequenceFolder = downloadFolder.resolve("genome"); + public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, GENOME_NAME); 
+ Path sequenceFolder = downloadFolder.resolve(GENOME_SUBDIRECTORY); Files.createDirectories(sequenceFolder); // Reference genome sequences are downloaded from Ensembl // New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead - String url = ensemblHostUrl + "/" + ensemblRelease; - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - url = url + "/fasta/" + speciesShortName + "/dna/*.dna.primary_assembly.fa.gz"; - } else { - if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { - url = ensemblHostUrl + "/" + ensemblRelease + "/" + getPhylo(speciesConfiguration); - } - url = url + "/fasta/"; - if (configuration.getSpecies().getBacteria().contains(speciesConfiguration)) { - // WARN: assuming there's just one assembly - url = url + speciesConfiguration.getAssemblies().get(0).getEnsemblCollection() + "/"; - } - url = url + speciesShortName + "/dna/*.dna.toplevel.fa.gz"; - } + DownloadFile downloadFile = downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_PRIMARY_FA_FILE_ID, + sequenceFolder); - String outputFileName = StringUtils.capitalize(speciesShortName) + "." 
+ assemblyConfiguration.getName() + ".fa.gz"; - Path outputPath = sequenceFolder.resolve(outputFileName); - logger.info("Saving reference genome version data at {}", sequenceFolder.resolve(GENOME_VERSION_FILENAME)); + // Save data source saveDataSource(ENSEMBL_NAME, EtlCommons.GENOME_DATA, ensemblVersion, getTimeStamp(), - Collections.singletonList(url), sequenceFolder.resolve(GENOME_VERSION_FILENAME)); - List downloadFiles = Collections.singletonList(downloadFile(url, outputPath.toString())); - logger.info("Unzipping file: {}", outputFileName); - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(outputPath.toString()), null); - return downloadFiles; + Collections.singletonList(downloadFile.getUrl()), sequenceFolder.resolve(GENOME_VERSION_FILENAME)); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENOME_NAME); + + return Collections.singletonList(downloadFile); } /** @@ -89,13 +69,14 @@ public List downloadReferenceGenome() throws IOException, Interrup * @throws InterruptedException if there is an error downloading files */ public List downloadConservation() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "conservation")) { + if (!speciesHasInfoToDownload(speciesConfiguration, CONSERVATION_DATA)) { return Collections.emptyList(); } - logger.info("Downloading conservation information ..."); - Path conservationFolder = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); List downloadFiles = new ArrayList<>(); if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info(DOWNLOADING_LOG_MESSAGE, CONSERVATION_NAME); + Path conservationFolder = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); + Files.createDirectories(conservationFolder); Files.createDirectories(conservationFolder.resolve(GERP_SUBDIRECTORY)); Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); @@ -111,7 +92,7 @@ public List downloadConservation() throws IOException, Interrupted 
List phastconsUrls = new ArrayList<>(chromosomes.length); List phyloPUrls = new ArrayList<>(chromosomes.length); // Downloading PhastCons and PhyloP - logger.info("Downloading {} and {}", PHASTCONS_NAME, PHYLOP_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, (PHASTCONS_NAME + "/" + PHYLOP_NAME)); for (String chromosome : chromosomes) { // PhastCons String phastConsUrl = configuration.getDownload().getPhastCons().getHost() + configuration.getDownload().getPhastCons() @@ -119,7 +100,7 @@ public List downloadConservation() throws IOException, Interrupted .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phastConsUrl).getFileName().toString(); outputPath = conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve(filename); - logger.info("Downloading {} from {} to {}", PHASTCONS_NAME, phastConsUrl, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phastConsUrl, outputPath); downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); phastconsUrls.add(phastConsUrl); @@ -129,18 +110,18 @@ public List downloadConservation() throws IOException, Interrupted .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phyloPUrl).getFileName().toString(); outputPath = conservationFolder.resolve(PHYLOP_SUBDIRECTORY).resolve(filename); - logger.info("Downloading {} from {} to {}", PHYLOP_NAME, phyloPUrl, outputPath); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phyloPUrl, outputPath); downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); phyloPUrls.add(phyloPUrl); } // Downloading Gerp - logger.info("Downloading {}", GERP_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, GERP_NAME); String gerpUrl = configuration.getDownload().getGerp().getHost() + configuration.getDownload().getGerp().getFiles() .get(GERP_FILE_ID); filename = Paths.get(gerpUrl).getFileName().toString(); outputPath = conservationFolder.resolve(GERP_SUBDIRECTORY).resolve(filename); - logger.info("Downloading from {} to {}", gerpUrl, outputPath); + 
logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, gerpUrl, outputPath); downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); // Save data version @@ -151,42 +132,18 @@ public List downloadConservation() throws IOException, Interrupted saveDataSource(GERP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), Collections.singletonList(gerpUrl), conservationFolder.resolve(GERP_VERSION_FILENAME)); } + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, CONSERVATION_NAME); } -// if (speciesConfiguration.getScientificName().equals("Mus musculus")) { -// Files.createDirectories(conservationFolder); -// Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); -// Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); -// -// String url = configuration.getDownload().getConservation().getHost() + "/mm10"; -// String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", -// "15", "16", "17", "18", "19", "X", "Y", "M", }; -// List phastconsUrls = new ArrayList<>(chromosomes.length); -// List phyloPUrls = new ArrayList<>(chromosomes.length); -// for (String chromosome : chromosomes) { -// String phastConsUrl = url + "/phastCons60way/mm10.60way.phastCons/chr" + chromosome + ".phastCons60way.wigFix.gz"; -// downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome -// + ".phastCons60way.wigFix.gz").toString())); -// phastconsUrls.add(phastConsUrl); -// String phyloPUrl = url + "/phyloP60way/mm10.60way.phyloP60way/chr" + chromosome + ".phyloP60way.wigFix.gz"; -// downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve("chr" + chromosome -// + ".phyloP60way.wigFix.gz").toString())); -// phyloPUrls.add(phyloPUrl); -// } -// saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), -// 
getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); -// saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), -// getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); -// } return downloadFiles; } public List downloadRepeats() throws IOException, InterruptedException { - if (!speciesHasInfoToDownload(speciesConfiguration, "repeats")) { + if (!speciesHasInfoToDownload(speciesConfiguration, REPEATS_DATA)) { return Collections.emptyList(); } if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info("Downloading repeats data ..."); + logger.info(DOWNLOADING_LOG_MESSAGE, REPEATS_NAME); Path repeatsFolder = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); Files.createDirectories(repeatsFolder); List downloadFiles = new ArrayList<>(); @@ -230,6 +187,8 @@ public List downloadRepeats() throws IOException, InterruptedExcep logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); } + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, REPEATS_NAME); + return downloadFiles; } return Collections.emptyList(); From cd367b998dd0d3905e5a0162145b864a8b42857e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 12:32:55 +0200 Subject: [PATCH 065/107] app: update genome builder by using constants from the class EtlCommons, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 110 ++++++++++++++---- 1 file changed, 88 insertions(+), 22 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 71b20e8b5a..482a6b5693 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -17,11 +17,14 @@ package org.opencb.cellbase.app.cli.admin.executors; import com.beust.jcommander.ParameterException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; import org.apache.commons.lang.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -33,12 +36,16 @@ import java.io.File; import java.io.IOException; -import java.nio.file.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.core.utils.SpeciesUtils.getSpeciesShortname; import static org.opencb.cellbase.lib.EtlCommons.*; /** @@ -52,6 +59,10 @@ public class BuildCommandExecutor extends CommandExecutor { private Path downloadFolder = null; // /_/download private boolean normalize = true; + private SpeciesConfiguration.Assembly assembly; + private String ensemblVersion; + private String ensemblRelease; + private File ensemblScriptsFolder; private boolean flexibleGTFParsing; @@ -83,7 +94,7 @@ public void execute() { if (speciesConfiguration == null) { throw new CellBaseException("Invalid species: '" + buildCommandOptions.species + "'"); } - SpeciesConfiguration.Assembly assembly = null; + if (!StringUtils.isEmpty(buildCommandOptions.assembly)) { assembly = 
SpeciesUtils.getAssembly(speciesConfiguration, buildCommandOptions.assembly); if (assembly == null) { @@ -93,7 +104,10 @@ public void execute() { assembly = SpeciesUtils.getDefaultAssembly(speciesConfiguration); } - String spShortName = SpeciesUtils.getSpeciesShortname(speciesConfiguration); + ensemblVersion = assembly.getEnsemblVersion(); + ensemblRelease = "release-" + ensemblVersion.split("_")[0]; + + String spShortName = getSpeciesShortname(speciesConfiguration); String spAssembly = assembly.getName().toLowerCase(); Path spFolder = output.resolve(spShortName + "_" + spAssembly); // /_/download @@ -121,9 +135,6 @@ public void execute() { logger.info("Building '{}' data", buildOption); CellBaseBuilder parser = null; switch (buildOption) { -// case EtlCommons.GENOME_INFO_DATA: -// buildGenomeInfo(); -// break; case EtlCommons.GENOME_DATA: parser = buildGenomeSequence(); break; @@ -250,11 +261,17 @@ private void copyVersionFiles(List pathList) { // } // } - private CellBaseBuilder buildGenomeSequence() { - copyVersionFiles(Collections.singletonList(downloadFolder.resolve("genome/genomeVersion.json"))); - Path fastaFile = getFastaReferenceGenome(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "genome_sequence"); - return new GenomeSequenceFastaBuilder(fastaFile, serializer); + private CellBaseBuilder buildGenomeSequence() throws CellBaseException { + // Sanity check + Path genomeVersionPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(GENOME_VERSION_FILENAME); + copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_SUBDIRECTORY)); + + // Get FASTA path + Path fastaPath = getFastaReferenceGenome(); + + // Create serializer and return the genome builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_SUBDIRECTORY), GENOME_DATA); + return new GenomeSequenceFastaBuilder(fastaPath, serializer); } private CellBaseBuilder buildGene() throws 
CellBaseException { @@ -381,19 +398,26 @@ private String getDefaultHumanAssembly() { + "configuration file. No hsapiens data found within the configuration.json file"); } - private Path getFastaReferenceGenome() { - Path fastaFile = null; - try { - DirectoryStream stream = Files.newDirectoryStream(downloadFolder.resolve("genome"), entry -> { - return entry.toString().endsWith(".fa"); - }); - for (Path entry : stream) { - fastaFile = entry; + private Path getFastaReferenceGenome() throws CellBaseException { + // Check FASTA and unzip if necessary + String ensemblUrl = getEnsemblUrl(configuration.getDownload().getEnsembl(), ensemblRelease, ENSEMBL_PRIMARY_FA_FILE_ID, + getSpeciesShortname(speciesConfiguration), assembly.getName(), null); + String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); + Path fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename); + if (fastaPath.toFile().exists()) { + // Gunzip + logger.info("Gunzip file: " + fastaPath); + try { + EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaPath.toString()), null); + } catch (IOException | InterruptedException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); } - } catch (IOException e) { - e.printStackTrace(); } - return fastaFile; + fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename.replace(".gz", "")); + if (!fastaPath.toFile().exists()) { + throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip"); + } + return fastaPath; } private CellBaseBuilder buildSplice() throws IOException { @@ -448,4 +472,46 @@ private CellBaseBuilder buildPharmacogenomics() throws IOException { CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder); return new PharmGKBBuilder(inFolder, serializer); } + + private void checkVersionFiles(List versionPaths) throws CellBaseException { + ObjectReader dataSourceReader = new 
ObjectMapper().readerFor(DataSource.class); + for (Path versionPath : versionPaths) { + if (!versionPath.toFile().exists()) { + throw new CellBaseException("Version file " + versionPath + " does not exist: this file is mandatory for version control"); + } + try { + DataSource dataSource = dataSourceReader.readValue(versionPath.toFile()); + if (org.apache.commons.lang3.StringUtils.isEmpty(dataSource.getVersion())) { + throw new CellBaseException("Version missing version in file " + versionPath + ": a version must be specified in the" + + " file"); + } + } catch (IOException e) { + throw new CellBaseException("Error parsing the version file " + versionPath, e); + } + } + } + + private void copyVersionFiles(List versionPaths, Path targetPath) throws CellBaseException { + // Check version files before copying them + checkVersionFiles(versionPaths); + if (!targetPath.toFile().exists()) { + try { + Files.createDirectories(targetPath); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + targetPath, e); + } + } + + for (Path versionPath : versionPaths) { + try { + Files.copy(versionPath, targetPath.resolve(versionPath.getFileName()), StandardCopyOption.REPLACE_EXISTING); + } catch (IOException e) { + throw new CellBaseException("Error copying version file " + versionPath + " to " + targetPath, e); + } + // Sanity check after copying + if (!targetPath.resolve(versionPath.getFileName()).toFile().exists()) { + throw new CellBaseException("Something wrong happened when copying version file " + versionPath + " to " + targetPath); + } + } + } } From ce6f8d5e4a2e20df5f4e43269a2d9cfdefc366bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 13:08:22 +0200 Subject: [PATCH 066/107] app: fix sonnar issues in BuildCommandExecutor, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 120 ++++-------------- 1 file changed, 27 insertions(+), 93 deletions(-) diff --git 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 482a6b5693..1f99975a4b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -34,7 +34,6 @@ import org.opencb.cellbase.lib.builders.*; import org.opencb.cellbase.lib.builders.clinical.variant.ClinicalVariantBuilder; -import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -60,11 +59,8 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean normalize = true; private SpeciesConfiguration.Assembly assembly; - private String ensemblVersion; private String ensemblRelease; - private File ensemblScriptsFolder; - private boolean flexibleGTFParsing; private SpeciesConfiguration speciesConfiguration; @@ -75,15 +71,16 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma this.output = Paths.get(buildCommandOptions.outputDirectory); normalize = !buildCommandOptions.skipNormalize; - this.ensemblScriptsFolder = new File(System.getProperty("basedir") + "/bin/ensembl-scripts/"); this.flexibleGTFParsing = buildCommandOptions.flexibleGTFParsing; } - /** * Parse specific 'build' command options. 
+ * + * @throws CellBaseException Exception */ - public void execute() { + public void execute() throws CellBaseException { + String buildOption = null; try { // Output directory need to be created if it doesn't exist if (!Files.exists(output)) { @@ -104,7 +101,7 @@ public void execute() { assembly = SpeciesUtils.getDefaultAssembly(speciesConfiguration); } - ensemblVersion = assembly.getEnsemblVersion(); + String ensemblVersion = assembly.getEnsemblVersion(); ensemblRelease = "release-" + ensemblVersion.split("_")[0]; String spShortName = getSpeciesShortname(speciesConfiguration); @@ -130,9 +127,8 @@ public void execute() { } for (int i = 0; i < buildOptions.length; i++) { - String buildOption = buildOptions[i]; + buildOption = buildOptions[i]; - logger.info("Building '{}' data", buildOption); CellBaseBuilder parser = null; switch (buildOption) { case EtlCommons.GENOME_DATA: @@ -156,9 +152,6 @@ public void execute() { case EtlCommons.PROTEIN_DATA: parser = buildProtein(); break; -// case EtlCommons.PPI_DATA: -// parser = getInteractionParser(); -// break; case EtlCommons.CONSERVATION_DATA: parser = buildConservation(); break; @@ -181,24 +174,26 @@ public void execute() { parser = buildPharmacogenomics(); break; default: - logger.error("Build option '" + buildCommandOptions.data + "' is not valid"); + logger.error("Build option '{}' is not valid", buildCommandOptions.data); break; } if (parser != null) { - try { - parser.parse(); - } catch (Exception e) { - logger.error("Error executing 'build' command " + buildCommandOptions.data + ": " + e.getMessage(), e); - } + logger.info("Building '{}' data ...", buildOption); + parser.parse(); + logger.info("Building '{}' data. 
Done.", buildOption); parser.disconnect(); } } } } catch (ParameterException e) { logger.error("Error parsing build command line parameters: " + e.getMessage(), e); - } catch (IOException | CellBaseException e) { - logger.error(e.getMessage()); + } catch (Exception e) { + String msg = "Error executing the command 'build'."; + if (StringUtils.isNotEmpty(buildOption)) { + msg += " It was building the data '" + buildOption + "'"; + } + throw new CellBaseException(msg, e); } } @@ -207,7 +202,6 @@ private CellBaseBuilder buildRepeats() { copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILENAME))); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILENAME))); copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILENAME))); - // TODO: chunk size is not really used in ConvervedRegionParser, remove? CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.REPEATS_JSON); return new RepeatsBuilder(repeatsFilesDir, serializer); } @@ -223,44 +217,11 @@ private void copyVersionFiles(List pathList) { try { Files.copy(path, downloadFolder.resolve(path.getFileName()), StandardCopyOption.REPLACE_EXISTING); } catch (IOException e) { - logger.warn("Version file {} not found - skipping", path.toString()); + logger.warn("Version file {} not found - skipping", path); } } } -// private void buildGenomeInfo() { -// /** -// * To get some extra info about the genome such as chromosome length or cytobands -// * we execute the following script. -// */ -// try { -// String outputFileName = downloadFolder.resolve("genome_info.json").toAbsolutePath().toString(); -// List args = new ArrayList<>(); -// args.addAll(Arrays.asList("--species", speciesConfigurathtion.getScientificName(), -// "--assembly", buildCommandOptions.assembly == null ? 
getDefaultHumanAssembly() : buildCommandOptions.assembly, -// "-o", outputFileName, -// "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs())); -// if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration) -// && !speciesConfiguration.getScientificName().equals("Drosophila melanogaster")) { -// args.add("--phylo"); -// args.add("no-vertebrate"); -// } -// -// String geneInfoLogFileName = downloadFolder.resolve("genome_info.log").toAbsolutePath().toString(); -// -// boolean downloadedGenomeInfo; -// downloadedGenomeInfo = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, "./genome_info.pl", args, geneInfoLogFileName); -// -// if (downloadedGenomeInfo) { -// logger.info(outputFileName + " created OK"); -// } else { -// logger.error("Genome info for " + speciesConfiguration.getScientificName() + " cannot be downloaded"); -// } -// } catch (IOException | InterruptedException e) { -// e.printStackTrace(); -// } -// } - private CellBaseBuilder buildGenomeSequence() throws CellBaseException { // Sanity check Path genomeVersionPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(GENOME_VERSION_FILENAME); @@ -316,42 +277,12 @@ private CellBaseBuilder buildRegulation() { } private CellBaseBuilder buildProtein() { - Path proteinFolder = downloadFolder.resolve("protein"); + Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); copyVersionFiles(Arrays.asList(proteinFolder.resolve("uniprotVersion.json"), proteinFolder.resolve("interproVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "protein"); - return new ProteinBuilder(proteinFolder.resolve("uniprot_chunks"), - downloadFolder.resolve("protein").resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); - } - - private void getProteinFunctionPredictionMatrices(SpeciesConfiguration sp, Path geneFolder) - throws IOException, InterruptedException { - logger.info("Downloading protein function 
prediction matrices ..."); - - // run protein_function_prediction_matrices.pl - String proteinFunctionProcessLogFile = geneFolder.resolve("protein_function_prediction_matrices.log").toString(); - List args = Arrays.asList("--species", sp.getScientificName(), "--outdir", geneFolder.toString(), - "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs()); - - boolean proteinFunctionPredictionMatricesObtaines = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, - "./protein_function_prediction_matrices.pl", - args, - proteinFunctionProcessLogFile); - - // check output - if (proteinFunctionPredictionMatricesObtaines) { - logger.info("Protein function prediction matrices created OK"); - } else { - logger.error("Protein function prediction matrices for " + sp.getScientificName() + " cannot be downloaded"); - } - } - - private CellBaseBuilder getInteractionParser() { - Path proteinFolder = downloadFolder.resolve("protein"); - Path psimiTabFile = proteinFolder.resolve("intact.txt"); - copyVersionFiles(Arrays.asList(proteinFolder.resolve("intactVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "protein_protein_interaction"); - return new InteractionBuilder(psimiTabFile, speciesConfiguration.getScientificName(), serializer); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, PROTEIN_DATA); + return new ProteinBuilder(proteinFolder.resolve("uniprot_chunks"), downloadFolder.resolve(PROTEIN_SUBDIRECTORY) + .resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); } private CellBaseBuilder buildConservation() { @@ -359,7 +290,6 @@ private CellBaseBuilder buildConservation() { copyVersionFiles(Arrays.asList(conservationFilesDir.resolve("gerpVersion.json"), conservationFilesDir.resolve("phastConsVersion.json"), conservationFilesDir.resolve("phyloPVersion.json"))); - // TODO: chunk size is not really used in ConvervedRegionParser, remove? 
int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder); return new ConservationBuilder(conservationFilesDir, conservationChunkSize, serializer); @@ -406,10 +336,14 @@ private Path getFastaReferenceGenome() throws CellBaseException { Path fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename); if (fastaPath.toFile().exists()) { // Gunzip - logger.info("Gunzip file: " + fastaPath); + logger.info("Gunzip file: {}", fastaPath); try { EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaPath.toString()), null); - } catch (IOException | InterruptedException e) { + } catch (IOException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); } } From 3566e011aaddff926f4a8320648559b386b822e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 14:01:21 +0200 Subject: [PATCH 067/107] app: improve log/exception messages in DownloadCommandExecutor, #TASK-5564 --- .../app/cli/admin/executors/DownloadCommandExecutor.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index f8d3e04eb9..8a763ae3c9 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -17,7 +17,6 @@ package org.opencb.cellbase.app.cli.admin.executors; import 
org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.exception.CellBaseException; @@ -25,7 +24,6 @@ import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.cellbase.lib.download.Downloader; -import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -109,12 +107,12 @@ public void execute() throws CellBaseException { } } AbstractDownloadManager.writeDownloadLogFile(outputDirectory, downloadFiles); - } catch (IOException | NoSuchMethodException | FileFormatException e) { - throw new CellBaseException("Error executing command line 'download'", e); } catch (InterruptedException e) { // Restore interrupted state... Thread.currentThread().interrupt(); - throw new CellBaseException("Error executing command line 'download'", e); + throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); + } catch (Exception e) { + throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } } From cd94452ba578e658c6cbd2cce158fdaae8215bca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 14:02:11 +0200 Subject: [PATCH 068/107] app: update repeats builder, and improve log/exception messages, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 184 ++++++++++-------- 1 file changed, 107 insertions(+), 77 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 1f99975a4b..6f9f531e47 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -19,7 +19,7 @@ import com.beust.jcommander.ParameterException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.config.SpeciesConfiguration; @@ -64,6 +64,10 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean flexibleGTFParsing; private SpeciesConfiguration speciesConfiguration; + private static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, REFSEQ_DATA, + VARIATION_FUNCTIONAL_SCORE_DATA, MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, + CLINICAL_VARIANTS_DATA, REPEATS_DATA, ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); + public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -82,6 +86,9 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma public void execute() throws CellBaseException { String buildOption = null; try { + // Check data sources + List dataList = checkDataSources(); + // Output directory need to be created if it doesn't exist if (!Files.exists(output)) { Files.createDirectories(output); @@ -118,92 +125,84 @@ public void execute() throws CellBaseException { makeDir(buildFolder); } - if (buildCommandOptions.data != null) { - String[] buildOptions; - if (buildCommandOptions.data.equals("all")) { - buildOptions = speciesConfiguration.getData().toArray(new String[0]); - } else { - buildOptions = buildCommandOptions.data.split(","); + for (String data : dataList) { + CellBaseBuilder parser; + 
switch (data) { + case GENOME_DATA: + parser = buildGenomeSequence(); + break; + case GENE_DATA: + parser = buildGene(); + break; + case REFSEQ_DATA: + parser = buildRefSeq(); + break; + case VARIATION_FUNCTIONAL_SCORE_DATA: + parser = buildCadd(); + break; + case MISSENSE_VARIATION_SCORE_DATA: + parser = buildRevel(); + break; + case REGULATION_DATA: + parser = buildRegulation(); + break; + case PROTEIN_DATA: + parser = buildProtein(); + break; + case CONSERVATION_DATA: + parser = buildConservation(); + break; + case CLINICAL_VARIANTS_DATA: + parser = buildClinicalVariants(); + break; + case REPEATS_DATA: + parser = buildRepeats(); + break; + case ONTOLOGY_DATA: + parser = buildObo(); + break; + case SPLICE_SCORE_DATA: + parser = buildSplice(); + break; + case PUBMED_DATA: + parser = buildPubMed(); + break; + case PHARMACOGENOMICS_DATA: + parser = buildPharmacogenomics(); + break; + default: + throw new IllegalArgumentException("Value '" + buildOption + "' is not allowed for the data parameter." 
+ + " Valid values are: " + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build" + + " everything"); } - for (int i = 0; i < buildOptions.length; i++) { - buildOption = buildOptions[i]; - - CellBaseBuilder parser = null; - switch (buildOption) { - case EtlCommons.GENOME_DATA: - parser = buildGenomeSequence(); - break; - case EtlCommons.GENE_DATA: - parser = buildGene(); - break; - case EtlCommons.REFSEQ_DATA: - parser = buildRefSeq(); - break; - case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: - parser = buildCadd(); - break; - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: - parser = buildRevel(); - break; - case EtlCommons.REGULATION_DATA: - parser = buildRegulation(); - break; - case EtlCommons.PROTEIN_DATA: - parser = buildProtein(); - break; - case EtlCommons.CONSERVATION_DATA: - parser = buildConservation(); - break; - case EtlCommons.CLINICAL_VARIANTS_DATA: - parser = buildClinicalVariants(); - break; - case EtlCommons.REPEATS_DATA: - parser = buildRepeats(); - break; - case ONTOLOGY_DATA: - parser = buildObo(); - break; - case EtlCommons.SPLICE_SCORE_DATA: - parser = buildSplice(); - break; - case EtlCommons.PUBMED_DATA: - parser = buildPubMed(); - break; - case EtlCommons.PHARMACOGENOMICS_DATA: - parser = buildPharmacogenomics(); - break; - default: - logger.error("Build option '{}' is not valid", buildCommandOptions.data); - break; - } - - if (parser != null) { - logger.info("Building '{}' data ...", buildOption); - parser.parse(); - logger.info("Building '{}' data. Done.", buildOption); - parser.disconnect(); - } + if (parser != null) { + logger.info("Building '{}' data ...", buildOption); + parser.parse(); + logger.info("Building '{}' data. 
Done.", buildOption); + parser.disconnect(); } } - } catch (ParameterException e) { - logger.error("Error parsing build command line parameters: " + e.getMessage(), e); } catch (Exception e) { - String msg = "Error executing the command 'build'."; + String msg = "Error executing the command 'build'"; if (StringUtils.isNotEmpty(buildOption)) { - msg += " It was building the data '" + buildOption + "'"; + msg += ". The last data being built was '" + buildOption + "'"; } - throw new CellBaseException(msg, e); + throw new CellBaseException(msg + ": " + e.getMessage(), e); } } - private CellBaseBuilder buildRepeats() { - Path repeatsFilesDir = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILENAME))); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILENAME))); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILENAME))); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.REPEATS_JSON); - return new RepeatsBuilder(repeatsFilesDir, serializer); + private CellBaseBuilder buildRepeats() throws CellBaseException { + // Sanity check + Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_SUBDIRECTORY); + List versionPaths = Arrays.asList(repeatsDownloadPath.resolve(TRF_VERSION_FILENAME), + repeatsDownloadPath.resolve(GSD_VERSION_FILENAME), + repeatsDownloadPath.resolve(WM_VERSION_FILENAME)); + copyVersionFiles(versionPaths, buildFolder.resolve(REPEATS_SUBDIRECTORY)); + + // Create serializer and return the repeats builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_SUBDIRECTORY), REPEATS_DATA); + return new RepeatsBuilder(repeatsDownloadPath, serializer); } private CellBaseBuilder buildObo() { @@ -448,4 +447,35 @@ private void copyVersionFiles(List versionPaths, Path targetPath) throws C } } } + + private List 
checkDataSources() { + if (StringUtils.isEmpty(buildCommandOptions.data)) { + throw new IllegalArgumentException("Missing data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to download everything"); + } + List dataList = Arrays.asList(buildCommandOptions.data.split(",")); + for (String data : dataList) { + switch (data) { + case GENOME_DATA: + case GENE_DATA: + case REFSEQ_DATA: + case VARIATION_FUNCTIONAL_SCORE_DATA: + case MISSENSE_VARIATION_SCORE_DATA: + case REGULATION_DATA: + case PROTEIN_DATA: + case CONSERVATION_DATA: + case CLINICAL_VARIANTS_DATA: + case REPEATS_DATA: + case ONTOLOGY_DATA: + case SPLICE_SCORE_DATA: + case PUBMED_DATA: + case PHARMACOGENOMICS_DATA: + break; + default: + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build everything"); + } + } + return dataList; + } } From 148814fdc5ac2a25f80cccd47e38e24d712e7631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 15:35:13 +0200 Subject: [PATCH 069/107] lib: update the repeats builder by removing the hardcoded filenames and taking them from the configuration file; update JUnit test and improve log messages, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 19 ++--- .../org/opencb/cellbase/lib/EtlCommons.java | 13 +--- .../lib/builders/CellBaseBuilder.java | 4 + .../cellbase/lib/builders/RepeatsBuilder.java | 77 ++++++++++++------- .../lib/builders/RepeatsBuilderTest.java | 4 +- .../test/resources/configuration.test.yaml | 17 +++- 6 files changed, 84 insertions(+), 50 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 6f9f531e47..5b03fd510e 100644 --- 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -84,7 +84,7 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma * @throws CellBaseException Exception */ public void execute() throws CellBaseException { - String buildOption = null; + String data = null; try { // Check data sources List dataList = checkDataSources(); @@ -125,8 +125,9 @@ public void execute() throws CellBaseException { makeDir(buildFolder); } - for (String data : dataList) { - CellBaseBuilder parser; + CellBaseBuilder parser; + for (int i = 0; i < dataList.size(); i++) { + data = dataList.get(i); switch (data) { case GENOME_DATA: parser = buildGenomeSequence(); @@ -171,22 +172,22 @@ public void execute() throws CellBaseException { parser = buildPharmacogenomics(); break; default: - throw new IllegalArgumentException("Value '" + buildOption + "' is not allowed for the data parameter." + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter." + " Valid values are: " + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build" + " everything"); } if (parser != null) { - logger.info("Building '{}' data ...", buildOption); + logger.info(CellBaseBuilder.BUILDING_LOG_MESSAGE, data); parser.parse(); - logger.info("Building '{}' data. Done.", buildOption); + logger.info(CellBaseBuilder.BUILDING_DONE_LOG_MESSAGE, data); parser.disconnect(); } } } catch (Exception e) { String msg = "Error executing the command 'build'"; - if (StringUtils.isNotEmpty(buildOption)) { - msg += ". The last data being built was '" + buildOption + "'"; + if (StringUtils.isNotEmpty(data)) { + msg += ". 
The last data being built was '" + data + "'"; } throw new CellBaseException(msg + ": " + e.getMessage(), e); } @@ -202,7 +203,7 @@ private CellBaseBuilder buildRepeats() throws CellBaseException { // Create serializer and return the repeats builder CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_SUBDIRECTORY), REPEATS_DATA); - return new RepeatsBuilder(repeatsDownloadPath, serializer); + return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration); } private CellBaseBuilder buildObo() { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 3a98939a23..4370d0f203 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -65,8 +65,8 @@ public class EtlCommons { // Genome (Ensembl) public static final String GENOME_NAME = "Genome"; public static final String GENOME_DATA = "genome"; - public static final String GENOME_SUBDIRECTORY = "genome"; - public static final String GENOME_VERSION_FILENAME = "genome" + SUFFIX_VERSION_FILENAME; + public static final String GENOME_SUBDIRECTORY = GENOME_DATA; + public static final String GENOME_VERSION_FILENAME = GENOME_DATA + SUFFIX_VERSION_FILENAME; // Gene (Ensembl) public static final String GENE_DATA = "gene"; @@ -209,24 +209,19 @@ public class EtlCommons { // Repeats public static final String REPEATS_NAME = "Repeats"; public static final String REPEATS_DATA = "repeats"; - public static final String REPEATS_SUBDIRECTORY = "genome"; + public static final String REPEATS_SUBDIRECTORY = GENOME_SUBDIRECTORY; + @Deprecated public static final String REPEATS_JSON = "repeats"; // Simple repeats public static final String TRF_NAME = "Tandem Repeats Finder"; - @Deprecated - public static final String TRF_FILE = "simpleRepeat.txt.gz"; public static final String TRF_VERSION_FILENAME = 
"simpleRepeat" + SUFFIX_VERSION_FILENAME; public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS"; // Genomic super duplications public static final String GSD_NAME = "Genomic Super Duplications"; - @Deprecated - public static final String GSD_FILE = "genomicSuperDups.txt.gz"; public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS"; // Window masker public static final String WM_NAME = "Window Masker"; - @Deprecated - public static final String WM_FILE = "windowmaskerSdust.txt.gz"; public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 79e5b7e58b..9dc95f8d83 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -29,6 +29,10 @@ public abstract class CellBaseBuilder { protected Logger logger; + public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; + public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done!"; + + public CellBaseBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java index d37765e0b6..6cefc0266f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java @@ -18,6 +18,8 @@ import org.opencb.biodata.models.core.Region; import 
org.opencb.biodata.models.variant.avro.Repeat; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.commons.ProgressLogger; @@ -27,55 +29,74 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; + +import static org.opencb.cellbase.lib.EtlCommons.*; /** * Created by fjlopez on 05/05/17. */ public class RepeatsBuilder extends CellBaseBuilder { - private static final String TRF = "trf"; - private static final String GSD = "genomicSuperDup"; - private static final String WM = "windowMasker"; + + private CellBaseConfiguration configuration; + private final Path filesDir; - public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer) { + public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) { super(serializer); this.filesDir = filesDir; + this.configuration = configuration; } @Override public void parse() throws Exception { + logger.info(BUILDING_LOG_MESSAGE, EtlCommons.REPEATS_NAME); - logger.info("Parsing repeats..."); - if (Files.exists(filesDir.resolve(EtlCommons.TRF_FILE))) { - parseTrfFile(filesDir.resolve(EtlCommons.TRF_FILE)); - } else { - logger.warn("No TRF file found {}", EtlCommons.TRF_FILE); - logger.warn("Skipping TRF file parsing. 
TRF data models will not be built."); + // Check Simple Repeats (TRF) filename + String trfFilename = Paths.get(configuration.getDownload().getSimpleRepeats().getFiles().get(SIMPLE_REPEATS_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(trfFilename))) { + throw new CellBaseException(TRF_NAME + " file " + trfFilename + " does not exist at " + filesDir); } - if (Files.exists(filesDir.resolve(EtlCommons.GSD_FILE))) { - parseGsdFile(filesDir.resolve(EtlCommons.GSD_FILE)); - } else { - logger.warn("No Genomic Super Duplications file found {}", EtlCommons.GSD_FILE); - logger.warn("Skipping Genomic Super Duplications file parsing. " - + "Genomic Super Duplications data models will not be built."); + // Check Genomic Super Duplications (GSD) file + String gsdFilename = Paths.get(configuration.getDownload().getGenomicSuperDups().getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID)) + .getFileName().toString(); + if (!Files.exists(filesDir.resolve(gsdFilename))) { + throw new CellBaseException(GSD_NAME + " file " + gsdFilename + " does not exist at " + filesDir); } - if (Files.exists(filesDir.resolve(EtlCommons.WM_FILE))) { - parseWmFile(filesDir.resolve(EtlCommons.WM_FILE)); - } else { - logger.warn("No WindowMasker file found {}", EtlCommons.WM_FILE); - logger.warn("Skipping WindowMasker file parsing. 
WindowMasker data models will not be built."); + // Check Window Masker (WM) file + String wmFilename = Paths.get(configuration.getDownload().getWindowMasker().getFiles().get(WINDOW_MASKER_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(wmFilename))) { + throw new CellBaseException(WM_NAME + " file " + wmFilename + " does not exist at " + filesDir); } - logger.info("Done."); + + // Parse TRF file + logger.info(BUILDING_LOG_MESSAGE, TRF_NAME); + parseTrfFile(filesDir.resolve(trfFilename)); + logger.info(BUILDING_DONE_LOG_MESSAGE, TRF_NAME); + + // Parse GSD file + logger.info(BUILDING_LOG_MESSAGE, GSD_NAME); + parseGsdFile(filesDir.resolve(gsdFilename)); + logger.info(BUILDING_DONE_LOG_MESSAGE, GSD_NAME); + + // Parse WM file + logger.info(BUILDING_LOG_MESSAGE, WM_NAME); + parseWmFile(filesDir.resolve(wmFilename)); + logger.info(BUILDING_DONE_LOG_MESSAGE, WM_NAME); + + logger.info(BUILDING_DONE_LOG_MESSAGE, EtlCommons.REPEATS_NAME); } private void parseTrfFile(Path filePath) throws IOException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed TRF lines:", + ProgressLogger progressLogger = new ProgressLogger("Parsed " + TRF_NAME + " lines:", () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseTrfLine(line)); @@ -90,14 +111,14 @@ private Repeat parseTrfLine(String line) { return new Repeat(null, Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), Integer.valueOf(parts[5]), Integer.valueOf(parts[7]), - Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF); + Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF_NAME); } private void parseGsdFile(Path filePath) throws IOException { try (BufferedReader bufferedReader = 
FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed GSD lines:", + ProgressLogger progressLogger = new ProgressLogger("Parsed " + GSD_NAME + " lines:", () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseGSDLine(line)); @@ -112,7 +133,7 @@ private Repeat parseGSDLine(String line) { return new Repeat(parts[11], Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, 2f, Float.valueOf(parts[26]), null, - null, GSD); + null, GSD_NAME); } @@ -120,7 +141,7 @@ private void parseWmFile(Path filePath) throws IOException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed WM lines:", + ProgressLogger progressLogger = new ProgressLogger("Parsed " + WM_NAME + " lines:", () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseWmLine(line)); @@ -134,6 +155,6 @@ private Repeat parseWmLine(String line) { String[] parts = line.split("\t"); return new Repeat(parts[4].replace("\t", ""), Region.normalizeChromosome(parts[1]), - Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM); + Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM_NAME); } } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java index 9c69a1e602..acce1fa92b 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java @@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import 
org.eclipse.jetty.util.ajax.JSON; import org.opencb.biodata.models.variant.avro.Repeat; +import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.commons.utils.FileUtils; @@ -46,9 +47,10 @@ public RepeatsBuilderTest() { @Test public void testParse() throws Exception { + CellBaseConfiguration configuration = CellBaseConfiguration.load(getClass().getResourceAsStream("configuration.test.yaml")); Path repeatsFilesDir = Paths.get(getClass().getResource("/repeats").getPath()); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "repeats.test"); - (new RepeatsBuilder(repeatsFilesDir, serializer)).parse(); + (new RepeatsBuilder(repeatsFilesDir, serializer, configuration)).parse(); serializer.close(); assertEquals(loadRepeatSet(Paths.get(getClass().getResource("/repeats/repeats.test.json.gz").getFile())), loadRepeatSet(Paths.get("/tmp/repeats.test.json.gz"))); diff --git a/cellbase-lib/src/test/resources/configuration.test.yaml b/cellbase-lib/src/test/resources/configuration.test.yaml index 1322d2fa52..fd7a1498f8 100644 --- a/cellbase-lib/src/test/resources/configuration.test.yaml +++ b/cellbase-lib/src/test/resources/configuration.test.yaml @@ -85,12 +85,23 @@ download: host: http://docm.genome.wustl.edu dgv: host: http://dgv.tcag.ca/v106/docs + simpleRepeats: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + SIMPLE_REPEATS: goldenPath/put_assembly_here/database/simpleRepeat.txt.gz windowMasker: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38 + WINDOW_MASKER: goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz genomicSuperDups: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + GENOMIC_SUPER_DUPS: goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz + gwasCatalog: host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2016/09/28/gwas-catalog-associations.tsv hpo: From 30a4c87dfbbb6f84518757a75284dc9ac1d49aaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 22 Apr 2024 17:30:47 +0200 Subject: [PATCH 070/107] lib: update conservation builder by removing the hardcoded filenames and taking them from the version files (i.e., URLs of the DataSource), improve log/exception messages, and fix sonnar issues, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 18 +- .../org/opencb/cellbase/lib/EtlCommons.java | 14 +- .../lib/builders/CellBaseBuilder.java | 34 +- .../lib/builders/ConservationBuilder.java | 441 ++++++++---------- .../lib/builders/ConservationBuilderTest.java | 3 + 5 files changed, 245 insertions(+), 265 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 5b03fd510e..6e44db2a75 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -178,9 +178,7 @@ public void execute() throws CellBaseException { } if (parser != null) { - logger.info(CellBaseBuilder.BUILDING_LOG_MESSAGE, data); parser.parse(); - logger.info(CellBaseBuilder.BUILDING_DONE_LOG_MESSAGE, data); parser.disconnect(); } } @@ -285,14 +283,16 @@ private CellBaseBuilder buildProtein() { 
.resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); } - private CellBaseBuilder buildConservation() { - Path conservationFilesDir = downloadFolder.resolve("conservation"); - copyVersionFiles(Arrays.asList(conservationFilesDir.resolve("gerpVersion.json"), - conservationFilesDir.resolve("phastConsVersion.json"), - conservationFilesDir.resolve("phyloPVersion.json"))); + private CellBaseBuilder buildConservation() throws CellBaseException { + // Sanity check + Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); + copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(GERP_VERSION_FILENAME), + conservationDownloadPath.resolve(PHASTCONS_VERSION_FILENAME), conservationDownloadPath.resolve(PHYLOP_VERSION_FILENAME)), + buildFolder.resolve(CONSERVATION_SUBDIRECTORY)); + int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder); - return new ConservationBuilder(conservationFilesDir, conservationChunkSize, serializer); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(CONSERVATION_SUBDIRECTORY)); + return new ConservationBuilder(conservationDownloadPath, conservationChunkSize, serializer); } private CellBaseBuilder buildClinicalVariants() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 4370d0f203..11a01b7a8b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -333,13 +333,15 @@ public class EtlCommons { public static final String GERP_FILE_ID = "GERP"; // PHASTCONS public static final String PHASTCONS_NAME = "PhastCons"; - public static final String PHASTCONS_SUBDIRECTORY = "phastCons"; - public static final String 
PHASTCONS_VERSION_FILENAME = "phastCons" + SUFFIX_VERSION_FILENAME; + public static final String PHASTCONS_DATA = "phastCons"; + public static final String PHASTCONS_SUBDIRECTORY = PHASTCONS_DATA; + public static final String PHASTCONS_VERSION_FILENAME = PHASTCONS_DATA + SUFFIX_VERSION_FILENAME; public static final String PHASTCONS_FILE_ID = "PHASTCONS"; // PHYLOP public static final String PHYLOP_NAME = "PhyloP"; - public static final String PHYLOP_SUBDIRECTORY = "phylop"; - public static final String PHYLOP_VERSION_FILENAME = "phylop" + SUFFIX_VERSION_FILENAME; + public static final String PHYLOP_DATA = "phylop"; + public static final String PHYLOP_SUBDIRECTORY = PHYLOP_DATA; + public static final String PHYLOP_VERSION_FILENAME = PHYLOP_DATA + SUFFIX_VERSION_FILENAME; public static final String PHYLOP_FILE_ID = "PHYLOP"; // Splice scores @@ -502,4 +504,8 @@ public static String getUrl(DownloadProperties.URLProperties props, String fileI } return url; } + + public static String getFilename(String prefix, String chromosome) { + return prefix + "_" + chromosome; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 9dc95f8d83..3efe5d1388 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -16,10 +16,19 @@ package org.opencb.cellbase.lib.builders; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.File; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + /** * Created by imedina on 30/08/14. 
*/ @@ -30,7 +39,10 @@ public abstract class CellBaseBuilder { protected Logger logger; public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; - public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done!"; + public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done."; + + public static final String PARSING_LOG_MESSAGE = "Parsing file {} ..."; + public static final String PARSING_DONE_LOG_MESSAGE = "Parsing file {} done."; public CellBaseBuilder(CellBaseSerializer serializer) { @@ -50,4 +62,24 @@ public void disconnect() { } } + protected List checkFiles(DataSource dataSource, Path targetPath, String name) throws CellBaseException { + logger.info("Checking {} folder and files", name); + if (!targetPath.toFile().exists()) { + throw new CellBaseException(name + " folder does not exist " + targetPath); + } + + List files = new ArrayList<>(); + + List filenames = dataSource.getUrls().stream().map(u -> Paths.get(u).getFileName().toString()).collect(Collectors.toList()); + for (String filename : filenames) { + File file = targetPath.resolve(filename).toFile(); + if (!file.exists()) { + throw new CellBaseException("File " + file + " does not exits"); + } else { + files.add(file); + } + } + + return files; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 9247b78faa..3aa9e2bb91 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -16,25 +16,26 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.cellbase.core.exception.CellBaseException; +import 
org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.MongoDBCollectionConfiguration; import org.opencb.commons.utils.FileUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.BufferedReader; +import java.io.File; import java.io.IOException; -import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class ConservationBuilder extends CellBaseBuilder { - private Logger logger; private Path conservedRegionPath; private int chunkSize; @@ -50,326 +51,259 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile fileSerializer = serializer; this.conservedRegionPath = conservedRegionPath; this.chunkSize = chunkSize; - logger = LoggerFactory.getLogger(ConservationBuilder.class); outputFileNames = new HashMap<>(); } @Override public void parse() throws IOException, CellBaseException { - System.out.println("conservedRegionPath = " + conservedRegionPath.toString()); if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) { - throw new IOException("Conservation directory does not exist, is not a directory or cannot be read"); + throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot" + + " be read"); } - /* - * GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse - * this file correctly, so we transform the file into a bedGraph format which is human readable. 
- */ - Path gerpFolderPath = conservedRegionPath.resolve(EtlCommons.GERP_SUBDIRECTORY); - if (gerpFolderPath.toFile().exists()) { - logger.debug("Parsing GERP data ..."); - gerpParser(gerpFolderPath); - } else { - logger.debug("GERP data not found: " + gerpFolderPath.toString()); + ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); + + // Check GERP folder and files + Path gerpPath = conservedRegionPath.resolve(GERP_SUBDIRECTORY); + List gerpFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(GERP_VERSION_FILENAME).toFile()), gerpPath, + GERP_NAME); + + // Check PhastCons folder and files + Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_SUBDIRECTORY); + List phastConsFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(PHASTCONS_VERSION_FILENAME).toFile()), + phastConsPath, PHASTCONS_NAME); + + // Check PhyloP folder and files + Path phylopPath = conservedRegionPath.resolve(PHYLOP_SUBDIRECTORY); + List phylopFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(PHYLOP_VERSION_FILENAME).toFile()), + phylopPath, PHYLOP_NAME); + + // GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse + // this file correctly, so we transform the file into a bedGraph format which is human-readable. + if (gerpFiles.size() != 1) { + throw new CellBaseException("Only one " + GERP_NAME + " file is expected, but currently there are " + gerpFiles.size() + + " files"); } + gerpParser(gerpFiles.get(0).toPath()); - /* - * UCSC phastCons and phylop are stored in the same format. They are processed together. - */ + // UCSC phastCons and phylop are stored in the same format. They are processed together. 
Map files = new HashMap<>(); String chromosome; Set chromosomes = new HashSet<>(); - // Reading all files in phastCons folder - DirectoryStream directoryStream = Files.newDirectoryStream(conservedRegionPath.resolve("phastCons"), "*.wigFix.gz"); - for (Path path : directoryStream) { - chromosome = path.getFileName().toString().split("\\.")[0].replace("chr", ""); + // Process PhastCons filenames + for (File file : phastConsFiles) { + chromosome = file.getName().split("\\.")[0].replace("chr", ""); chromosomes.add(chromosome); - files.put(chromosome + "phastCons", path); + files.put(chromosome + PHASTCONS_DATA, file.toPath()); } - // Reading all files in phylop folder - directoryStream = Files.newDirectoryStream(conservedRegionPath.resolve("phylop"), "*.wigFix.gz"); - for (Path path : directoryStream) { - chromosome = path.getFileName().toString().split("\\.")[0].replace("chr", ""); + // Process PhyloP filenames + for (File file : phylopFiles) { + chromosome = file.getName().split("\\.")[0].replace("chr", ""); chromosomes.add(chromosome); - files.put(chromosome + "phylop", path); + files.put(chromosome + PHYLOP_DATA, file.toPath()); } - /* - * Now we can iterate over all the chromosomes found and process the files - */ - logger.debug("Chromosomes found '{}'", chromosomes.toString()); + // Now we can iterate over all the chromosomes found and process the files + logger.debug("Chromosomes found '{}'", chromosomes); for (String chr : chromosomes) { - logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + "phastCons")); - processWigFixFile(files.get(chr + "phastCons"), "phastCons"); + logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHASTCONS_DATA)); + processWigFixFile(files.get(chr + PHASTCONS_DATA), PHASTCONS_NAME); - logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + "phylop")); - processWigFixFile(files.get(chr + "phylop"), "phylop"); + logger.debug("Processing chromosome '{}', file '{}'", chr, 
files.get(chr + PHYLOP_DATA)); + processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_NAME); } } - private void gerpParser(Path gerpFolderPath) throws IOException, CellBaseException { - Path gerpProcessFilePath = gerpFolderPath.resolve(EtlCommons.GERP_PROCESSED_FILE); - logger.info("parsing {}", gerpProcessFilePath); - BufferedReader bufferedReader = FileUtils.newBufferedReader(gerpProcessFilePath); - - String line; - int startOfBatch = 0; - int previousEndValue = 0; - String chromosome = null; - String previousChromosomeValue = null; - - List conservationScores = new ArrayList<>(chunkSize); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - - // file is wrong. throw an exception instead? - if (fields.length != 4) { - logger.error("skipping invalid line: " + line.length()); - continue; - } + private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, gerpProcessFilePath); - chromosome = fields[0]; + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(gerpProcessFilePath)) { + String line; + int startOfBatch = 0; + int previousEndValue = 0; + String chromosome = null; + String previousChromosomeValue = null; - // new chromosome, store batch - if (previousChromosomeValue != null && !previousChromosomeValue.equals(chromosome)) { - storeScores(startOfBatch, previousChromosomeValue, conservationScores); + List conservationScores = new ArrayList<>(chunkSize); + while ((line = bufferedReader.readLine()) != null) { + String[] fields = line.split("\t"); - // reset values for current batch - startOfBatch = 0; - } + // Checking line + if (fields.length != 4) { + throw new CellBaseException("Invalid " + GERP_NAME + " line (expecting 4 columns): " + line); + } - // reset chromosome for next entry - previousChromosomeValue = chromosome; + chromosome = fields[0]; - // file is american! 
starts at zero, add one - int start = Integer.parseInt(fields[1]) + 1; - // inclusive - int end = Integer.parseInt(fields[2]) + 1; + // New chromosome, store batch + if (previousChromosomeValue != null && !previousChromosomeValue.equals(chromosome)) { + storeScores(startOfBatch, previousChromosomeValue, conservationScores); - // start coordinate for this batch of 2,000 - if (startOfBatch == 0) { - startOfBatch = start; - previousEndValue = 0; - } + // Reset values for current batch + startOfBatch = 0; + } - // if there is a gap between the last entry and this one. - if (previousEndValue != 0 && (start - previousEndValue) != 0) { - // gap is too big! store what we already have before processing more - if (start - previousEndValue >= chunkSize) { - // we have a full batch, store - storeScores(startOfBatch, chromosome, conservationScores); + // Reset chromosome for next entry + previousChromosomeValue = chromosome; - // reset batch to start at this record + // File is american! starts at zero, add one + int start = Integer.parseInt(fields[1]) + 1; + // Inclusive + int end = Integer.parseInt(fields[2]) + 1; + + // sSart coordinate for this batch of 2,000 + if (startOfBatch == 0) { startOfBatch = start; - } else { - // fill in the gap with zeroes - // don't overfill the batch - while (previousEndValue < start && conservationScores.size() < chunkSize) { - conservationScores.add((float) 0); - previousEndValue++; + previousEndValue = 0; + } + + // If there is a gap between the last entry and this one + if (previousEndValue != 0 && (start - previousEndValue) != 0) { + // Gap is too big! 
store what we already have before processing more + if (start - previousEndValue >= chunkSize) { + // We have a full batch, store + storeScores(startOfBatch, chromosome, conservationScores); + + // Reset batch to start at this record + startOfBatch = start; + } else { + // Fill in the gap with zeroes, don't overfill the batch + while (previousEndValue < start && conservationScores.size() < chunkSize) { + conservationScores.add((float) 0); + previousEndValue++; + } + + // We have a full batch, store + if (conservationScores.size() == chunkSize) { + storeScores(startOfBatch, chromosome, conservationScores); + + // Reset: start a new batch + startOfBatch = start; + } } + } + + // Reset value + previousEndValue = end; + + // Score for these coordinates + String score = fields[3]; - // we have a full batch, store + // Add the score for each coordinate included in the range start-end + while (start < end) { + // We have a full batch: store if (conservationScores.size() == chunkSize) { storeScores(startOfBatch, chromosome, conservationScores); - // reset. start a new batch + // Reset: start a new batch startOfBatch = start; } - } - } - // reset value - previousEndValue = end; + // Add score to batch + conservationScores.add(Float.valueOf(score)); - // score for these coordinates - String score = fields[3]; + // Increment coordinate + start++; + } - // add the score for each coordinate included in the range start-end - while (start < end) { - // we have a full batch, store + // We have a full batch: store if (conservationScores.size() == chunkSize) { storeScores(startOfBatch, chromosome, conservationScores); - // reset. 
start a new batch - startOfBatch = start; + // Reset: start a new batch + startOfBatch = 0; } - - // add score to batch - conservationScores.add(Float.valueOf(score)); - - // increment coordinate - start++; } - - // we have a full batch, store - if (conservationScores.size() == chunkSize) { + // We need to serialize the last chunk that might be incomplete + if (!conservationScores.isEmpty()) { storeScores(startOfBatch, chromosome, conservationScores); - - // reset, start a new batch - startOfBatch = 0; } } - // we need to serialize the last chunk that might be incomplete - if (!conservationScores.isEmpty()) { - storeScores(startOfBatch, chromosome, conservationScores); - } - bufferedReader.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gerpProcessFilePath); } private void storeScores(int startOfBatch, String chromosome, List conservationScores) throws CellBaseException { - // if this is a small batch, fill in the missing coordinates with 0 + // If this is a small batch, fill in the missing coordinates with 0 while (conservationScores.size() < chunkSize) { conservationScores.add((float) 0); } if (conservationScores.size() != chunkSize) { - throw new CellBaseException("invalid chunk size " + conservationScores.size() + " for " + chromosome + ":" + startOfBatch); + throw new CellBaseException("Invalid chunk size " + conservationScores.size() + " for " + chromosome + ":" + startOfBatch); } - GenomicScoreRegion conservationScoreRegion = new GenomicScoreRegion(chromosome, startOfBatch, - startOfBatch + conservationScores.size() - 1, "gerp", conservationScores); + GenomicScoreRegion conservationScoreRegion = new GenomicScoreRegion<>(chromosome, startOfBatch, + startOfBatch + conservationScores.size() - 1, GERP_NAME, conservationScores); fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome)); - // reset + // Reset conservationScores.clear(); } -// @Deprecated -// private void gerpParser(Path gerpFolderPath) throws IOException, 
InterruptedException { -// logger.info("Uncompressing {}", gerpFolderPath.resolve(EtlCommons.GERP_FILE)); -// List tarArgs = Arrays.asList("-xvzf", gerpFolderPath.resolve(EtlCommons.GERP_FILE).toString(), -// "--overwrite", "-C", gerpFolderPath.toString()); -// EtlCommons.runCommandLineProcess(null, "tar", tarArgs, null); -// -// DirectoryStream pathDirectoryStream = Files.newDirectoryStream(gerpFolderPath, "*.rates"); -// boolean filesFound = false; -// for (Path path : pathDirectoryStream) { -// filesFound = true; -// logger.info("Processing file '{}'", path.getFileName().toString()); -// String[] chromosome = path.getFileName().toString().replaceFirst("chr", "").split("\\."); -// BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(String.valueOf(path)))); -// String line; -// int start = 1; -// int end = 1999; -// int counter = 1; -// String[] fields; -// List val = new ArrayList<>(chunkSize); -// while ((line = bufferedReader.readLine()) != null) { -// fields = line.split("\t"); -// val.add(Float.valueOf(fields[1])); -// counter++; -// if (counter == chunkSize) { -//// ConservationScoreRegion conservationScoreRegion = new ConservationScoreRegion(chromosome[0], start, end, "gerp", -// val); -// GenomicScoreRegion conservationScoreRegion = -// new GenomicScoreRegion<>(chromosome[0], start, end, "gerp", val); -// fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome[0])); -// -// start = end + 1; -// end += chunkSize; -// -// counter = 0; -// val.clear(); -// } -// } -// -// // we need to serialize the last chunk that might be incomplete -//// ConservationScoreRegion conservationScoreRegion = -//// new ConservationScoreRegion(chromosome[0], start, start + val.size() - 1, "gerp", val); -// GenomicScoreRegion conservationScoreRegion = -// new GenomicScoreRegion<>(chromosome[0], start, start + val.size() - 1, "gerp", val); -// fileSerializer.serialize(conservationScoreRegion, 
getOutputFileName(chromosome[0])); -// -// bufferedReader.close(); -// } -// -// if (!filesFound) { -// logger.warn("No GERP++ files were found. Please check that the original file {} is there, that it was" -// + " properly decompressed and that the *.rates files are present", -// gerpFolderPath.resolve(EtlCommons.GERP_FILE)); -// } -// } - private void processWigFixFile(Path inGzPath, String conservationSource) throws IOException { - BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath); - - String line; - String chromosome = ""; -// int start = 0, end = 0; - int start = 0; - float value; - Map attributes = new HashMap<>(); -// ConservedRegion conservedRegion = null; - List values = new ArrayList<>(); -// ConservationScoreRegion conservedRegion = null; - GenomicScoreRegion conservedRegion = null; - - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("fixedStep")) { - //new group, save last - if (conservedRegion != null) { -// conservedRegion.setEnd(end); -// conservedRegion = new ConservationScoreRegion(chromosome, start, end, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, - conservationSource, values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - } + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath)) { + + String line; + String chromosome = ""; + int start = 0; + float value; + Map attributes = new HashMap<>(); + List values = new ArrayList<>(); + GenomicScoreRegion conservedRegion = null; + + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("fixedStep")) { + // New group, save last + if (conservedRegion != null) { + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, + conservationSource, values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + } -// offset = 0; - attributes.clear(); - String[] 
attrFields = line.split(" "); - String[] attrKeyValue; - for (String attrField : attrFields) { - if (!attrField.equalsIgnoreCase("fixedStep")) { - attrKeyValue = attrField.split("="); - attributes.put(attrKeyValue[0].toLowerCase(), attrKeyValue[1]); + attributes.clear(); + String[] attrFields = line.split(" "); + String[] attrKeyValue; + for (String attrField : attrFields) { + if (!attrField.equalsIgnoreCase("fixedStep")) { + attrKeyValue = attrField.split("="); + attributes.put(attrKeyValue[0].toLowerCase(), attrKeyValue[1]); + } } - } - chromosome = formatChromosome(attributes); - start = Integer.parseInt(attributes.get("start")); -// end = Integer.parseInt(attributes.get("start")); - - values = new ArrayList<>(2000); - } else { - int startChunk = start / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; -// end++; - int endChunk = (start + values.size()) / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - // This is the endChunk if current read score is - // appended to the array (otherwise it would be - // start + values.size() - 1). If this endChunk is - // different from the startChunk means that current - // conserved region must be dumped and current - // score must be associated to next chunk. Main - // difference to what there was before is that if - // the fixedStep starts on the last position of a - // chunk e.g. 
1999, the chunk must be created with - // just that score - the chunk was left empty with - // the old code - if (startChunk != endChunk) { -// conservedRegion = new ConservationScoreRegion(chromosome, start, end - 1, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, - conservationSource, values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - start = start + values.size(); - values.clear(); - } + chromosome = formatChromosome(attributes); + start = Integer.parseInt(attributes.get("start")); - value = Float.parseFloat(line.trim()); - values.add(value); + values = new ArrayList<>(2000); + } else { + int startChunk = start / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; + int endChunk = (start + values.size()) / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; + // This is the endChunk if current read score is appended to the array (otherwise it would be start + values.size() + // - 1). If this endChunk is different from the startChunk means that current conserved region must be dumped and + // current score must be associated to next chunk. Main difference to what there was before is that if the fixedStep + // starts on the last position of a chunk e.g. 
1999, the chunk must be created with just that score - the chunk was + // left empty with the old code + if (startChunk != endChunk) { + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, + values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + start = start + values.size(); + values.clear(); + } + + value = Float.parseFloat(line.trim()); + values.add(value); + } } + + // Write last + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); } - //write last -// conservedRegion = new ConservationScoreRegion(chromosome, start, end, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, - values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - bufferedReader.close(); } private String getOutputFileName(String chromosome) { @@ -379,13 +313,18 @@ private String getOutputFileName(String chromosome) { } String outputFileName = outputFileNames.get(chromosome); if (outputFileName == null) { - outputFileName = "conservation_" + chromosome; + outputFileName = getFilename(CONSERVATION_DATA, chromosome); outputFileNames.put(chromosome, outputFileName); } return outputFileName; } - // phylop and phastcons list the chromosome as M instead of the standard MT. replace. + /** + * Remove chr from the chromosome name; and phylop and phastcons list the chromosome as M instead of the standard MT, replace it. 
+ * + * @param attributes Attributes map with the chromosome name + * @return The new chromosome name + */ private String formatChromosome(Map attributes) { String chromosome = attributes.get("chrom").replace("chr", ""); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java index 5af6cbd7e9..6a21908c13 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/ConservationBuilderTest.java @@ -23,6 +23,7 @@ import org.eclipse.jetty.util.ajax.JSON; import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.biodata.models.variant.avro.Repeat; +import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.commons.utils.FileUtils; @@ -41,6 +42,8 @@ public class ConservationBuilderTest { @Test public void testParse() throws Exception { + CellBaseConfiguration configuration = CellBaseConfiguration.load(getClass().getResourceAsStream("configuration.test.yaml")); + Path conservationDir = Paths.get(ConservationBuilderTest.class.getResource("/conservation").toURI()); CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "gerp.test"); (new ConservationBuilder(conservationDir, BATCH_SIZE, serializer)).parse(); From 85e17db92499b4325958330d53831a0177277fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Apr 2024 13:49:25 +0200 Subject: [PATCH 071/107] lib: call bigWigToBedGraph to convert the GERP bigwig to bed graph file, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 22 +++++++++++- .../lib/builders/ConservationBuilder.java | 34 +++++++++++++++++-- 2 files changed, 
53 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 11a01b7a8b..e7f53fb687 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -28,6 +28,7 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; @@ -393,7 +394,7 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); - logger.debug("Executing command: " + StringUtils.join(builder.command(), " ")); + logger.info("Executing command: " + StringUtils.join(builder.command(), " ")); Process process = builder.start(); process.waitFor(); @@ -508,4 +509,23 @@ public static String getUrl(DownloadProperties.URLProperties props, String fileI public static String getFilename(String prefix, String chromosome) { return prefix + "_" + chromosome; } + + public static boolean isExecutableAvailable(String executable) throws IOException, InterruptedException { + ProcessBuilder processBuilder = new ProcessBuilder("which", executable); + Process process = processBuilder.start(); + + try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) { + String line; + StringBuilder output = new StringBuilder(); + while ((line = reader.readLine()) != null) { + output.append(line).append("\n"); + } + } + + int exitCode = process.waitFor(); + + // if exitCode is 0 then the executable is installed at + output.toString().trim()), + // otherwise, it's not + return (exitCode == 0); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 3aa9e2bb91..4014fdccdb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -22,6 +22,7 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.MongoDBCollectionConfiguration; import org.opencb.commons.utils.FileUtils; @@ -30,6 +31,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -56,6 +58,8 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile @Override public void parse() throws IOException, CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, CONSERVATION_NAME); + if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) { throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot" + " be read"); @@ -84,7 +88,30 @@ public void parse() throws IOException, CellBaseException { throw new CellBaseException("Only one " + GERP_NAME + " file is expected, but currently there are " + gerpFiles.size() + " files"); } - gerpParser(gerpFiles.get(0).toPath()); + File bigwigFile = gerpFiles.get(0); + File bedgraphFile = Paths.get(gerpFiles.get(0).getAbsolutePath() + ".bedgraph").toFile(); + String exec = "bigWigToBedGraph"; + if (!bedgraphFile.exists()) { + try { + if (isExecutableAvailable(exec)) { + EtlCommons.runCommandLineProcess(null, exec, Arrays.asList(bigwigFile.toString(), bedgraphFile.toString()), null); + } else { + throw new CellBaseException(exec 
+ " not found in your system, install it to build " + GERP_NAME + ". It is available" + + " at http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/"); + } + } catch (IOException e) { + throw new CellBaseException("Error executing " + exec + " in BIGWIG file " + bigwigFile, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("" + e.getMessage(), e); + } + if (!bedgraphFile.exists()) { + throw new CellBaseException("Something happened when executing " + exec + " in BIGWIG file " + bigwigFile + "; the BED" + + " graph file was not generated. Please, check " + exec); + } + } + gerpParser(bedgraphFile.toPath()); // UCSC phastCons and phylop are stored in the same format. They are processed together. Map files = new HashMap<>(); @@ -114,6 +141,8 @@ public void parse() throws IOException, CellBaseException { logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHYLOP_DATA)); processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_NAME); } + + logger.info(BUILDING_DONE_LOG_MESSAGE, CONSERVATION_NAME); } private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException { @@ -132,7 +161,8 @@ private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseEx // Checking line if (fields.length != 4) { - throw new CellBaseException("Invalid " + GERP_NAME + " line (expecting 4 columns): " + line); + throw new CellBaseException("Invalid " + GERP_NAME + " line (expecting 4 columns): " + fields.length + " items: " + + line); } chromosome = fields[0]; From 0223cb5ca43bfbde6f0ac71e2220cd75dd3f524e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Apr 2024 17:57:40 +0200 Subject: [PATCH 072/107] lib: include log messages, #TASK-5564 --- .../org/opencb/cellbase/lib/builders/ConservationBuilder.java | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 4014fdccdb..ca34cfd2d7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -276,6 +276,7 @@ private void storeScores(int startOfBatch, String chromosome, List conser } private void processWigFixFile(Path inGzPath, String conservationSource) throws IOException { + logger.info(PARSING_LOG_MESSAGE, inGzPath); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath)) { String line; @@ -334,6 +335,7 @@ private void processWigFixFile(Path inGzPath, String conservationSource) throws conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, values); fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); } + logger.info(PARSING_DONE_LOG_MESSAGE, inGzPath); } private String getOutputFileName(String chromosome) { From 833c3371fc29e24deb98e8dcb7e20309251f3a5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 23 Apr 2024 19:09:01 +0200 Subject: [PATCH 073/107] lib: improve ProteinBuilder by removing hardcoded file names, adding checks and log messages, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 21 ++++--- .../org/opencb/cellbase/lib/EtlCommons.java | 13 +++++ .../cellbase/lib/builders/ProteinBuilder.java | 57 +++++++++++-------- .../lib/download/AbstractDownloadManager.java | 14 +---- 4 files changed, 62 insertions(+), 43 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 6e44db2a75..04e5b928a3 100644 --- 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -210,6 +210,7 @@ private CellBaseBuilder buildObo() { return new OntologyBuilder(oboDir, serializer); } + @Deprecated private void copyVersionFiles(List pathList) { for (Path path : pathList) { try { @@ -274,13 +275,19 @@ private CellBaseBuilder buildRegulation() { return new RegulatoryFeatureBuilder(regulatoryRegionFilesDir, serializer); } - private CellBaseBuilder buildProtein() { - Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); - copyVersionFiles(Arrays.asList(proteinFolder.resolve("uniprotVersion.json"), - proteinFolder.resolve("interproVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, PROTEIN_DATA); - return new ProteinBuilder(proteinFolder.resolve("uniprot_chunks"), downloadFolder.resolve(PROTEIN_SUBDIRECTORY) - .resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); + private CellBaseBuilder buildProtein() throws CellBaseException { + // Sanity check + Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); + Path proteinBuildPath = buildFolder.resolve(PROTEIN_SUBDIRECTORY); + copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(UNIPROT_VERSION_FILENAME), + proteinDownloadPath.resolve(INTERPRO_VERSION_FILENAME)), proteinBuildPath); + + // Create the file serializer and the protein builder + Path chunksPath = proteinDownloadPath.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); + String uniprotFilename = getFilenameFromUrl(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(proteinBuildPath, PROTEIN_DATA); + return new ProteinBuilder(chunksPath, proteinDownloadPath.resolve(uniprotFilename), speciesConfiguration.getScientificName(), + serializer); } private 
CellBaseBuilder buildConservation() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index e7f53fb687..f0fbcd1702 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -30,6 +30,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; @@ -528,4 +529,16 @@ public static boolean isExecutableAvailable(String executable) throws IOExceptio // otherwise, it's not return (exitCode == 0); } + + public static String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" + + " configuration file"); + } + return getFilenameFromUrl(props.getFiles().get(fileId)); + } + + public static String getFilenameFromUrl(String url) { + return Paths.get(url).getFileName().toString(); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index 0369a0e6aa..3dc6f04212 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectWriter; import org.opencb.biodata.formats.protein.uniprot.UniProtParser; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.*; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; import 
org.rocksdb.Options; @@ -42,14 +43,14 @@ import java.util.Map; import java.util.Set; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class ProteinBuilder extends CellBaseBuilder { private Path uniprotFilesDir; private Path interproFilePath; private String species; - private Map proteinMap; - protected Logger logger = LoggerFactory.getLogger(this.getClass()); public ProteinBuilder(Path uniprotFilesDir, String species, CellBaseSerializer serializer) { @@ -65,23 +66,33 @@ public ProteinBuilder(Path uniprotFilesDir, Path interproFilePath, String specie } @Override - public void parse() throws IOException { + public void parse() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, PROTEIN_NAME); + // Check UniProt if (uniprotFilesDir == null || !Files.exists(uniprotFilesDir)) { - throw new IOException("File '" + uniprotFilesDir + "' not valid"); + throw new CellBaseException("Could not build " + UNIPROT_NAME + ": folder " + uniprotFilesDir + " does not exist"); + } + + // Check InterPro + if (interproFilePath != null && Files.exists(interproFilePath)) { + throw new CellBaseException("Could not build " + INTERPRO_NAME + ": file " + interproFilePath + " does not exist"); } + // Prepare RocksDB RocksDB rocksDb = getDBConnection(); ObjectMapper mapper = new ObjectMapper(); mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); ObjectWriter jsonObjectWriter = mapper.writerFor(Entry.class); - proteinMap = new HashMap<>(30000); -// UniProtParser up = new UniProtParser(); + Map proteinMap = new HashMap<>(30000); + + // Parsing files try { File[] files = uniprotFilesDir.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); for (File file : files) { + logger.info(PARSING_LOG_MESSAGE, file); Uniprot uniprot = (Uniprot) UniProtParser.loadXMLInfo(file.toString(), UniProtParser.UNIPROT_CONTEXT); for (Entry entry : uniprot.getEntry()) { @@ -89,16 +100,16 @@ public void parse() throws IOException { for 
(OrganismNameType organismNameType : entry.getOrganism().getName()) { entryOrganism = organismNameType.getValue(); if (entryOrganism.equals(species)) { -// proteinMap.put(entry.getAccession().get(0), entry); rocksDb.put(entry.getAccession().get(0).getBytes(), jsonObjectWriter.writeValueAsBytes(entry)); } } } + logger.info(PARSING_DONE_LOG_MESSAGE, file); } logger.debug("Number of proteins stored in map: '{}'", proteinMap.size()); - if (interproFilePath != null && Files.exists(interproFilePath)) { - BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interproFilePath); + logger.info(PARSING_LOG_MESSAGE, interproFilePath); + try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interproFilePath)) { Set hashSet = new HashSet<>(proteinMap.keySet()); Set visited = new HashSet<>(30000); @@ -114,7 +125,6 @@ public void parse() throws IOException { iprAdded = false; BigInteger start = BigInteger.valueOf(Integer.parseInt(fields[4])); BigInteger end = BigInteger.valueOf(Integer.parseInt(fields[5])); -// for (FeatureType featureType : proteinMap.get(fields[0]).getFeature()) { byte[] bytes = rocksDb.get(fields[0].getBytes()); Entry entry = mapper.readValue(bytes, Entry.class); for (FeatureType featureType : entry.getFeature()) { @@ -145,7 +155,6 @@ public void parse() throws IOException { locationType.setEnd(positionType2); featureType.setLocation(locationType); -// proteinMap.get(fields[0]).getFeature().add(featureType); bytes = rocksDb.get(fields[0].getBytes()); entry = mapper.readValue(bytes, Entry.class); entry.getFeature().add(featureType); @@ -158,11 +167,13 @@ public void parse() throws IOException { } if (++numInterProLinesProcessed % 10000000 == 0) { - logger.debug("{} InterPro lines processed. {} unique proteins processed", - numInterProLinesProcessed, numUniqueProteinsProcessed); + logger.debug("{} {} lines processed. 
{} unique proteins processed", numInterProLinesProcessed, INTERPRO_NAME, + numUniqueProteinsProcessed); } } - interproBuffereReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, interproFilePath); + } catch (IOException e) { + throw new CellBaseException("Error parsing " + INTERPRO_NAME + " file: " + interproFilePath, e); } // Serialize and save results @@ -173,24 +184,22 @@ public void parse() throws IOException { } rocksDb.close(); - } catch (JAXBException | RocksDBException e) { - e.printStackTrace(); + } catch (JAXBException | RocksDBException | IOException e) { + throw new CellBaseException("Error parsing " + PROTEIN_NAME + " files", e); } + + logger.info(BUILDING_DONE_LOG_MESSAGE, PROTEIN_NAME); } - private RocksDB getDBConnection() { - // a static method that loads the RocksDB C++ library. + private RocksDB getDBConnection() throws CellBaseException { + // A static method that loads the RocksDB C++ library RocksDB.loadLibrary(); - // the Options class contains a set of configurable DB options - // that determines the behavior of a database. 
+ // The Options class contains a set of configurable DB options that determines the behavior of a database Options options = new Options().setCreateIfMissing(true); try { return RocksDB.open(options, uniprotFilesDir.resolve("integration.idx").toString()); } catch (RocksDBException e) { - // do some error handling - e.printStackTrace(); - System.exit(1); + throw new CellBaseException("Error preparing RocksDB", e); } - return null; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index f3f01e7c30..7cf171e7dd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -47,6 +47,8 @@ import java.time.LocalDateTime; import java.util.*; +import static org.opencb.cellbase.lib.EtlCommons.getFilenameFromUrl; + public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; @@ -353,18 +355,6 @@ protected String getUrl(DownloadProperties.URLProperties props, String fileId) t return props.getHost() + filesValue; } } - - protected String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { - if (!props.getFiles().containsKey(fileId)) { - throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" - + " configuration file"); - } - return getFilenameFromUrl(props.getFiles().get(fileId)); - } - - protected String getFilenameFromUrl(String url) { - return Paths.get(url).getFileName().toString(); - } } From 01deb0c1bd08354c6c081e2d2dbd8a6d826dc2e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 09:43:30 +0200 Subject: [PATCH 074/107] lib: move DataSource reader from 
ConservationBuilder to the parent CellBaseBuilder to be used by other builders, #TASK-5564 --- .../org/opencb/cellbase/lib/builders/CellBaseBuilder.java | 3 +++ .../opencb/cellbase/lib/builders/ConservationBuilder.java | 5 ----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 3efe5d1388..49d847c033 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -16,6 +16,8 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -35,6 +37,7 @@ public abstract class CellBaseBuilder { protected CellBaseSerializer serializer; + protected ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); protected Logger logger; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index ca34cfd2d7..79099a4d93 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -16,11 +16,8 @@ package org.opencb.cellbase.lib.builders; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectReader; import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.models.DataSource; import 
org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.MongoDBCollectionConfiguration; @@ -65,8 +62,6 @@ public void parse() throws IOException, CellBaseException { + " be read"); } - ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); - // Check GERP folder and files Path gerpPath = conservedRegionPath.resolve(GERP_SUBDIRECTORY); List gerpFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(GERP_VERSION_FILENAME).toFile()), gerpPath, From 9416894717b9cff285fa6736822f990edee9d2da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 11:40:51 +0200 Subject: [PATCH 075/107] lib: move the function to split UniProt into chuncks from the protein downloader to the protein builder, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 5 +- .../cellbase/lib/builders/ProteinBuilder.java | 119 +++++++++++++----- .../lib/download/ProteinDownloadManager.java | 60 ++------- 3 files changed, 99 insertions(+), 85 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 04e5b928a3..3d3b9d9d37 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -283,11 +283,8 @@ private CellBaseBuilder buildProtein() throws CellBaseException { proteinDownloadPath.resolve(INTERPRO_VERSION_FILENAME)), proteinBuildPath); // Create the file serializer and the protein builder - Path chunksPath = proteinDownloadPath.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); - String uniprotFilename = getFilenameFromUrl(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); 
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(proteinBuildPath, PROTEIN_DATA); - return new ProteinBuilder(chunksPath, proteinDownloadPath.resolve(uniprotFilename), speciesConfiguration.getScientificName(), - serializer); + return new ProteinBuilder(proteinDownloadPath, speciesConfiguration.getScientificName(), serializer); } private CellBaseBuilder buildConservation() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index 3dc6f04212..eb4c04a909 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -35,52 +35,67 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.PrintWriter; import java.math.BigInteger; import java.nio.file.Files; import java.nio.file.Path; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.*; import static org.opencb.cellbase.lib.EtlCommons.*; public class ProteinBuilder extends CellBaseBuilder { - private Path uniprotFilesDir; - private Path interproFilePath; + private Path proteinPath; private String species; protected Logger logger = LoggerFactory.getLogger(this.getClass()); - public ProteinBuilder(Path uniprotFilesDir, String species, CellBaseSerializer serializer) { - this(uniprotFilesDir, null, species, serializer); - } - - public ProteinBuilder(Path uniprotFilesDir, Path interproFilePath, String species, CellBaseSerializer serializer) { + public ProteinBuilder(Path proteinPath, String species, CellBaseSerializer serializer) { super(serializer); - this.uniprotFilesDir = uniprotFilesDir; - this.interproFilePath = interproFilePath; + this.proteinPath = proteinPath; this.species = species; } @Override - public void parse() throws 
CellBaseException { + public void parse() throws CellBaseException, IOException { logger.info(BUILDING_LOG_MESSAGE, PROTEIN_NAME); - // Check UniProt - if (uniprotFilesDir == null || !Files.exists(uniprotFilesDir)) { - throw new CellBaseException("Could not build " + UNIPROT_NAME + ": folder " + uniprotFilesDir + " does not exist"); + // Sanity check + if (proteinPath == null) { + throw new CellBaseException(PROTEIN_NAME + " directory is missing (null)"); + } + if (!Files.exists(proteinPath)) { + throw new CellBaseException(PROTEIN_NAME + " directory " + proteinPath + " does not exist"); + } + if (!Files.isDirectory(proteinPath)) { + throw new CellBaseException(PROTEIN_NAME + " directory " + proteinPath + " is not a directory"); + } + + // Check UniProt file + List uniProtFiles = checkFiles(dataSourceReader.readValue(proteinPath.resolve(UNIPROT_VERSION_FILENAME).toFile()), + proteinPath, PROTEIN_NAME + "/" + UNIPROT_NAME); + if (uniProtFiles.size() != 1) { + throw new CellBaseException("Only one " + UNIPROT_NAME + " file is expected, but currently there are " + uniProtFiles.size() + + " files"); } - // Check InterPro - if (interproFilePath != null && Files.exists(interproFilePath)) { - throw new CellBaseException("Could not build " + INTERPRO_NAME + ": file " + interproFilePath + " does not exist"); + // Check InterPro file + List interProFiles = checkFiles(dataSourceReader.readValue(proteinPath.resolve(INTERPRO_VERSION_FILENAME).toFile()), + proteinPath, PROTEIN_NAME + "/" + INTERPRO_NAME); + if (interProFiles.size() != 1) { + throw new CellBaseException("Only one " + INTERPRO_NAME + " file is expected, but currently there are " + uniProtFiles.size() + + " files"); } + // Prepare UniProt data by splitting data in chunks + Path uniProtChunksPath = serializer.getOutdir().resolve(UNIPROT_CHUNKS_SUBDIRECTORY); + logger.info("Split {} file {} into chunks at {}", UNIPROT_NAME, uniProtFiles.get(0).getName(), uniProtChunksPath); + 
Files.createDirectories(uniProtChunksPath); + splitUniprot(proteinPath.resolve(uniProtFiles.get(0).getName()), uniProtChunksPath); + // Prepare RocksDB - RocksDB rocksDb = getDBConnection(); + RocksDB rocksDb = getDBConnection(uniProtChunksPath); ObjectMapper mapper = new ObjectMapper(); mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); ObjectWriter jsonObjectWriter = mapper.writerFor(Entry.class); @@ -89,7 +104,7 @@ public void parse() throws CellBaseException { // Parsing files try { - File[] files = uniprotFilesDir.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); + File[] files = uniProtChunksPath.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); for (File file : files) { logger.info(PARSING_LOG_MESSAGE, file); @@ -108,8 +123,8 @@ public void parse() throws CellBaseException { } logger.debug("Number of proteins stored in map: '{}'", proteinMap.size()); - logger.info(PARSING_LOG_MESSAGE, interproFilePath); - try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interproFilePath)) { + logger.info(PARSING_LOG_MESSAGE, interProFiles.get(0)); + try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interProFiles.get(0).toPath())) { Set hashSet = new HashSet<>(proteinMap.keySet()); Set visited = new HashSet<>(30000); @@ -171,9 +186,9 @@ public void parse() throws CellBaseException { numUniqueProteinsProcessed); } } - logger.info(PARSING_DONE_LOG_MESSAGE, interproFilePath); + logger.info(PARSING_DONE_LOG_MESSAGE, interProFiles.get(0)); } catch (IOException e) { - throw new CellBaseException("Error parsing " + INTERPRO_NAME + " file: " + interproFilePath, e); + throw new CellBaseException("Error parsing " + INTERPRO_NAME + " file: " + interProFiles.get(0), e); } // Serialize and save results @@ -191,15 +206,63 @@ public void parse() throws CellBaseException { logger.info(BUILDING_DONE_LOG_MESSAGE, PROTEIN_NAME); } - private RocksDB 
getDBConnection() throws CellBaseException { + private RocksDB getDBConnection(Path uniProtChunksPath) throws CellBaseException { // A static method that loads the RocksDB C++ library RocksDB.loadLibrary(); // The Options class contains a set of configurable DB options that determines the behavior of a database Options options = new Options().setCreateIfMissing(true); try { - return RocksDB.open(options, uniprotFilesDir.resolve("integration.idx").toString()); + return RocksDB.open(options, uniProtChunksPath.resolve("integration.idx").toString()); } catch (RocksDBException e) { throw new CellBaseException("Error preparing RocksDB", e); } } + + private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException { + PrintWriter pw = null; + try (BufferedReader br = FileUtils.newBufferedReader(uniprotFilePath)) { + StringBuilder header = new StringBuilder(); + boolean beforeEntry = true; + boolean inEntry = false; + int count = 0; + int chunk = 0; + String line; + while ((line = br.readLine()) != null) { + if (line.trim().startsWith("")) { + inEntry = false; + if (count % 10000 == 0) { + if (pw != null) { + pw.print(""); + pw.close(); + } + chunk++; + } + } + } + pw.print(""); + pw.close(); + } finally { + if (pw != null) { + pw.close(); + } + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 50255a3557..5cb8a4c1f0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -18,14 +18,12 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.commons.utils.FileUtils; -import java.io.BufferedReader; import java.io.IOException; -import java.io.PrintWriter; import 
java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -46,12 +44,13 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * @throws CellBaseException if there is an error in the CelllBase configuration file */ public List download() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, PROTEIN_NAME); if (!speciesHasInfoToDownload(speciesConfiguration, PROTEIN_DATA)) { - return null; + logger.info("{} not supported for the species {}", PROTEIN_NAME, speciesConfiguration.getScientificName()); + return Collections.emptyList(); } Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); Files.createDirectories(proteinFolder); - logger.info("Downloading {} information at {} ...", PROTEIN_NAME, proteinFolder); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); @@ -59,14 +58,9 @@ public List download() throws IOException, InterruptedException, C // Uniprot downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_NAME, PROTEIN_DATA, UNIPROT_VERSION_FILENAME, proteinFolder); - Path chunksPath = proteinFolder.resolve(UNIPROT_CHUNKS_SUBDIRECTORY); - String uniprotFilename = getFilenameFromUrl(configuration.getDownload().getUniprot().getFiles().get(UNIPROT_FILE_ID)); - logger.info("Split UniProt file {} into chunks at {}", uniprotFilename, chunksPath); - Files.createDirectories(chunksPath); - splitUniprot(proteinFolder.resolve(uniprotFilename), chunksPath); downloadFiles.add(downloadFile); - // Interpro + // InterPro downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_NAME, PROTEIN_DATA, INTERPRO_VERSION_FILENAME, proteinFolder); downloadFiles.add(downloadFile); @@ -76,48 +70,8 @@ public List download() throws IOException, 
InterruptedException, C INTACT_VERSION_FILENAME, proteinFolder); downloadFiles.add(downloadFile); - return downloadFiles; - } - - private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException { - BufferedReader br = FileUtils.newBufferedReader(uniprotFilePath); - PrintWriter pw = null; - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - pw.print(""); - pw.close(); - chunk++; - } - } - } - pw.print(""); - pw.close(); - br.close(); + return downloadFiles; } } From 909c0b2fbd1a1ab522489994044a49754ffe8ba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 12:59:30 +0200 Subject: [PATCH 076/107] core: fix regulation URLs in the configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 5022340bec..a2330cd00c 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -60,9 +60,9 @@ download: GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" - REGULATORY_BUILD: "regulation/put_species_here/put_species_here.put_capital_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" - MOTIF_FEATURES: 
"regulation/put_species_here/MotifFeatures/put_capital_species_here.put_assembly_here.motif_features.gff.gz" - MOTIF_FEATURES_INDEX: "regulation/put_species_here/MotifFeatures/put_capital_species_here.put_assembly_here.motif_features.gff.gz.tbi" + REGULATORY_BUILD: "release-put_release_here/regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 From 71d8056e0ef6b61a0534a10b6cdd25f620d4b054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 13:01:54 +0200 Subject: [PATCH 077/107] lib: launch a CellBase exception if executing a command (wget, gunzip,...) 
fails; and improve log messages, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 26 +++++++++++++------ .../lib/download/AbstractDownloadManager.java | 4 +-- .../lib/download/GenomeDownloadManager.java | 5 ++-- .../lib/download/PharmGKBDownloadManager.java | 2 +- .../lib/download/PubMedDownloadManager.java | 2 +- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index f0fbcd1702..3faea3b305 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -382,7 +382,7 @@ public class EtlCommons { public static final String PUBMED_REGEX_FILE_ID = "PUBMED"; public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) - throws IOException, InterruptedException { + throws IOException, InterruptedException, CellBaseException { // This small hack allow to configure the appropriate Logger level from the command line, this is done // by setting the DEFAULT_LOG_LEVEL_KEY before the logger object is created. // org.apache.log4j.Logger rootLogger = LogManager.getRootLogger(); @@ -395,18 +395,28 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); - logger.info("Executing command: " + StringUtils.join(builder.command(), " ")); + logger.debug("Executing command: " + StringUtils.join(builder.command(), " ")); Process process = builder.start(); process.waitFor(); // Check process output - boolean executedWithoutErrors = true; - int genomeInfoExitValue = process.exitValue(); - if (genomeInfoExitValue != 0) { - logger.warn("Error executing {}, error code: {}. 
More info in log file: {}", binPath, genomeInfoExitValue, logFilePath); - executedWithoutErrors = false; + if (process.exitValue() != 0) { + String msg = "Error executing command '" + binPath + "'; error code = " + process.exitValue() + ". More info in log file: " + + logFilePath; + logger.error(msg); + throw new CellBaseException(msg); } - return executedWithoutErrors; + + return true; +// +// +// boolean executedWithoutErrors = true; +// int genomeInfoExitValue = process.exitValue(); +// if (genomeInfoExitValue != 0) { +// logger.warn("Error executing {}, error code: {}. More info in log file: {}", binPath, genomeInfoExitValue, logFilePath); +// executedWithoutErrors = false; +// } +// return executedWithoutErrors; } private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List args, String logFilePath) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 7cf171e7dd..193f2e146d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -259,12 +259,12 @@ protected String getPhylo(SpeciesConfiguration sp) { } } - protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException { + protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException, CellBaseException { return downloadFile(url, outputFileName, null); } protected DownloadFile downloadFile(String url, String outputFileName, List wgetAdditionalArgs) - throws IOException, InterruptedException { + throws IOException, InterruptedException, CellBaseException { DownloadFile downloadFileInfo = new DownloadFile(url, outputFileName, Timestamp.valueOf(LocalDateTime.now()).toString()); Long startTime = 
System.currentTimeMillis(); if (Paths.get(outputFileName).toFile().exists()) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index f36f493e1f..210d5bc39f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -67,8 +67,9 @@ public List downloadReferenceGenome() throws IOException, Interrup * @return list of files downloaded * @throws IOException if there is an error writing to a file * @throws InterruptedException if there is an error downloading files + * @throws CellBaseException if there is an error executing the command line */ - public List downloadConservation() throws IOException, InterruptedException { + public List downloadConservation() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, CONSERVATION_DATA)) { return Collections.emptyList(); } @@ -138,7 +139,7 @@ public List downloadConservation() throws IOException, Interrupted return downloadFiles; } - public List downloadRepeats() throws IOException, InterruptedException { + public List downloadRepeats() throws IOException, InterruptedException, CellBaseException { if (!speciesHasInfoToDownload(speciesConfiguration, REPEATS_DATA)) { return Collections.emptyList(); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 04e72d3247..f52c3f8a23 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -42,7 +42,7 @@ public PharmGKBDownloadManager(String species, String assembly, Path 
targetDirec } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { DownloadProperties.URLProperties pharmGKB = configuration.getDownload().getPharmGKB(); Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_SUBDIRECTORY).resolve(PHARMGKB_SUBDIRECTORY); Files.createDirectories(pharmgkbDownloadFolder); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index e5a8c78f26..87e4ec8b98 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -35,7 +35,7 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto } @Override - public List download() throws IOException, InterruptedException { + public List download() throws IOException, InterruptedException, CellBaseException { Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_SUBDIRECTORY); Files.createDirectories(pubmedFolder); logger.info("Downloading {} files at {} ...", EtlCommons.PUBMED_DATA, pubmedFolder); From 15448249e65f9017d9746e2d8acdf575952f2164 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 13:25:15 +0200 Subject: [PATCH 078/107] lib: fix sonnar issues, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 88 ++++++++----------- 1 file changed, 35 insertions(+), 53 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 3faea3b305..fec7904b80 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -37,7 +37,7 @@ /** * Created by fjlopez on 03/06/16. */ -public class EtlCommons { +public final class EtlCommons { // Ensembl public static final String ENSEMBL_NAME = "Ensembl"; @@ -159,8 +159,8 @@ public class EtlCommons { // PharmGKB public static final String PHARMGKB_NAME = "PharmGKB"; public static final String PHARMGKB_DATA = "pharmgkb"; - public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; - public static final String PHARMGKB_VERSION_FILENAME = "pharmgkb" + SUFFIX_VERSION_FILENAME; + public static final String PHARMGKB_SUBDIRECTORY = PHARMGKB_DATA; + public static final String PHARMGKB_VERSION_FILENAME = PHARMGKB_DATA + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String PHARMGKB_GENES_FILE_ID = "GENES"; public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS"; @@ -212,6 +212,9 @@ public class EtlCommons { public static final String REPEATS_NAME = "Repeats"; public static final String REPEATS_DATA = "repeats"; public static final String REPEATS_SUBDIRECTORY = GENOME_SUBDIRECTORY; + /** + * @deprecated (when refactoring downloaders, builders and loaders) + */ @Deprecated public static final String REPEATS_JSON = "repeats"; // Simple repeats @@ -290,15 +293,6 @@ public class EtlCommons { public static final String CADD_DATA = "cadd"; public static final String PPI_DATA = "ppi"; public static final String DRUG_DATA = "drug"; -// public static final String CLINVAR_DATA = "clinvar"; -// public static final String DOCM_DATA = "docm"; -// public static final String COSMIC_DATA = "cosmic"; -// public static final String GWAS_DATA = "gwas"; -// public static final String IARCTP53_GERMLINE_FILE = "germlineMutationDataIARC TP53 Database, R20.txt"; -// public static final String IARCTP53_GERMLINE_REFERENCES_FILE = "germlineMutationReferenceIARC TP53 Database, R20.txt"; -// public static final String IARCTP53_SOMATIC_FILE = 
"somaticMutationDataIARC TP53 Database, R20.txt"; -// public static final String IARCTP53_SOMATIC_REFERENCES_FILE = "somaticMutationReferenceIARC TP53 Database, R20.txt"; -// public static final String HGMD_DATA = "hgmd"; // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; @@ -348,23 +342,18 @@ public class EtlCommons { // Splice scores public static final String MMSPLICE_SUBDIRECTORY = "mmsplice"; - public static final String MMSPLICE_VERSION_FILENAME = "mmsplice" + SUFFIX_VERSION_FILENAME; + public static final String MMSPLICE_VERSION_FILENAME = MMSPLICE_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; public static final String SPLICEAI_SUBDIRECTORY = "spliceai"; - public static final String SPLICEAI_VERSION_FILENAME = "spliceai" + SUFFIX_VERSION_FILENAME; + public static final String SPLICEAI_VERSION_FILENAME = SPLICEAI_SUBDIRECTORY + SUFFIX_VERSION_FILENAME; - // binary bigwig file + /** + * @deprecated (when refactoring downloaders, builders and loaders) + */ @Deprecated public static final String GERP_FILE = "gerp_conservation_scores.homo_sapiens.GRCh38.bw"; - // bigwig file manually transformed to bedGraph file - public static final String GERP_PROCESSED_FILE = "gerp.bedGraph.gz"; //"gerp_conservation_scores.homo_sapiens.GRCh38.bedGraph.gz"; public static final String CLINICAL_VARIANTS_JSON_FILE = "clinical_variants.json.gz"; public static final String CLINICAL_VARIANTS_ANNOTATED_JSON_FILE = "clinical_variants.full.json.gz"; - public static final String DOCM_FILE = "docm.json.gz"; public static final String DOCM_NAME = "DOCM"; - public static final String STRUCTURAL_VARIANTS_FOLDER = "structuralVariants"; - public static final String DGV_FILE = "dgv.txt"; - public static final String DGV_VERSION_FILE = "dgvVersion.json"; - public static final String STRUCTURAL_VARIANTS_JSON = "structuralVariants"; public static final String OBO_JSON = "ontology"; public static final String HPO_VERSION_FILE = 
"hpo" + SUFFIX_VERSION_FILENAME; @@ -377,17 +366,16 @@ public class EtlCommons { // PubMed public static final String PUBMED_NAME = "PubMed"; public static final String PUBMED_DATA = "pubmed"; - public static final String PUBMED_SUBDIRECTORY = "pubmed"; - public static final String PUBMED_VERSION_FILENAME = "pubmed" + SUFFIX_VERSION_FILENAME; + public static final String PUBMED_SUBDIRECTORY = PUBMED_DATA; + public static final String PUBMED_VERSION_FILENAME = PUBMED_DATA + SUFFIX_VERSION_FILENAME; public static final String PUBMED_REGEX_FILE_ID = "PUBMED"; + private EtlCommons() { + throw new IllegalStateException("Utility class"); + } + public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) throws IOException, InterruptedException, CellBaseException { - // This small hack allow to configure the appropriate Logger level from the command line, this is done - // by setting the DEFAULT_LOG_LEVEL_KEY before the logger object is created. -// org.apache.log4j.Logger rootLogger = LogManager.getRootLogger(); -// ConsoleAppender stderr = (ConsoleAppender) rootLogger.getAppender("stdout"); -// stderr.setThreshold(Level.toLevel("debug")); Configurator.setRootLevel(Level.INFO); @@ -395,7 +383,9 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); - logger.debug("Executing command: " + StringUtils.join(builder.command(), " ")); + if (logger.isDebugEnabled()) { + logger.debug("Executing command: {}", StringUtils.join(builder.command(), " ")); + } Process process = builder.start(); process.waitFor(); @@ -408,15 +398,6 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat } return true; -// -// -// boolean executedWithoutErrors = true; -// int genomeInfoExitValue = process.exitValue(); -// if (genomeInfoExitValue != 0) { -// logger.warn("Error executing {}, error code: {}. 
More info in log file: {}", binPath, genomeInfoExitValue, logFilePath); -// executedWithoutErrors = false; -// } -// return executedWithoutErrors; } private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List args, String logFilePath) { @@ -466,24 +447,23 @@ public static Long countFileLines(Path filePath) throws IOException { public static String getEnsemblUrl(DownloadProperties.EnsemblProperties props, String ensemblRelease, String fileId, String species, String assembly, String chromosome) throws CellBaseException { if (!props.getUrl().getFiles().containsKey(fileId)) { - throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.EnsemblProperties within the CellBase" - + " configuration file"); + throw new CellBaseException(getMissingFileIdMessage(fileId)); } String url = props.getUrl().getHost() + props.getUrl().getFiles().get(fileId); // Change release, species, assembly, chromosome if necessary if (StringUtils.isNotEmpty(ensemblRelease)) { - url = url.replaceAll(PUT_RELEASE_HERE_MARK, ensemblRelease.split("-")[1]); + url = url.replace(PUT_RELEASE_HERE_MARK, ensemblRelease.split("-")[1]); } if (StringUtils.isNotEmpty(species)) { - url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); - url = url.replaceAll(PUT_CAPITAL_SPECIES_HERE_MARK, Character.toUpperCase(species.charAt(0)) + species.substring(1)); + url = url.replace(PUT_SPECIES_HERE_MARK, species); + url = url.replace(PUT_CAPITAL_SPECIES_HERE_MARK, Character.toUpperCase(species.charAt(0)) + species.substring(1)); } if (StringUtils.isNotEmpty(assembly)) { - url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); + url = url.replace(PUT_ASSEMBLY_HERE_MARK, assembly); } if (StringUtils.isNotEmpty(chromosome)) { - url = url.replaceAll(PUT_CHROMOSOME_HERE_MARK, chromosome); + url = url.replace(PUT_CHROMOSOME_HERE_MARK, chromosome); } return url; } @@ -495,8 +475,7 @@ public static String getUrl(DownloadProperties.URLProperties props, String fileI 
public static String getUrl(DownloadProperties.URLProperties props, String fileId, String species, String assembly, String chromosome) throws CellBaseException { if (!props.getFiles().containsKey(fileId)) { - throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" - + " configuration file"); + throw new CellBaseException(getMissingFileIdMessage(fileId)); } String url; String filesValue = props.getFiles().get(fileId); @@ -506,13 +485,13 @@ public static String getUrl(DownloadProperties.URLProperties props, String fileI url = props.getHost() + filesValue; } if (StringUtils.isNotEmpty(species)) { - url = url.replaceAll(PUT_SPECIES_HERE_MARK, species); + url = url.replace(PUT_SPECIES_HERE_MARK, species); } if (StringUtils.isNotEmpty(assembly)) { - url = url.replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly); + url = url.replace(PUT_ASSEMBLY_HERE_MARK, assembly); } if (StringUtils.isNotEmpty(chromosome)) { - url = url.replaceAll(PUT_CHROMOSOME_HERE_MARK, chromosome); + url = url.replace(PUT_CHROMOSOME_HERE_MARK, chromosome); } return url; } @@ -542,8 +521,7 @@ public static boolean isExecutableAvailable(String executable) throws IOExceptio public static String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException { if (!props.getFiles().containsKey(fileId)) { - throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase" - + " configuration file"); + throw new CellBaseException(getMissingFileIdMessage(fileId)); } return getFilenameFromUrl(props.getFiles().get(fileId)); } @@ -551,4 +529,8 @@ public static String getFilenameFromProps(DownloadProperties.URLProperties props public static String getFilenameFromUrl(String url) { return Paths.get(url).getFileName().toString(); } + + private static String getMissingFileIdMessage(String fileId) { + return "File ID " + fileId + " is missing in the 
DownloadProperties.URLProperties within the CellBase configuration file"; + } } From 3e438746890db156393ca5b1f98c44a53510bcc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Apr 2024 18:03:53 +0200 Subject: [PATCH 079/107] lib: move the function to parse and build PFMs from the regulation downloader to the regulation builder; and improve regulation builder by adding checks, log messages and fixing sonnar issues, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 18 ++- .../org/opencb/cellbase/lib/EtlCommons.java | 18 ++- .../builders/RegulatoryFeatureBuilder.java | 124 +++++++++++++++--- .../download/RegulationDownloadManager.java | 50 +------ .../RegulatoryFeatureBuilderTest.java | 2 +- 5 files changed, 137 insertions(+), 75 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 3d3b9d9d37..718595bfe5 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -210,6 +210,9 @@ private CellBaseBuilder buildObo() { return new OntologyBuilder(oboDir, serializer); } + /** + * @deprecated (when using the new copyVersionFiles) + */ @Deprecated private void copyVersionFiles(List pathList) { for (Path path : pathList) { @@ -268,11 +271,16 @@ private CellBaseBuilder buildRevel() { return new RevelScoreBuilder(missensePredictionScorePath, serializer); } - private CellBaseBuilder buildRegulation() { - Path regulatoryRegionFilesDir = downloadFolder.resolve("regulation"); - copyVersionFiles(Collections.singletonList(regulatoryRegionFilesDir.resolve("ensemblRegulationVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, 
"regulatory_region"); - return new RegulatoryFeatureBuilder(regulatoryRegionFilesDir, serializer); + private CellBaseBuilder buildRegulation() throws CellBaseException { + // Sanity check + Path regulationDownloadPath = downloadFolder.resolve(REGULATION_DATA); + Path regulationBuildPath = buildFolder.resolve(REGULATION_DATA); + copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(REGULATORY_BUILD_VERSION_FILENAME), + regulationDownloadPath.resolve(MOTIF_FEATURES_VERSION_FILENAME)), regulationBuildPath); + + // Create the file serializer and the regulatory feature builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(regulationBuildPath, REGULATORY_REGION_BASENAME); + return new RegulatoryFeatureBuilder(regulationDownloadPath, serializer); } private CellBaseBuilder buildProtein() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index fec7904b80..707ecf2714 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -29,6 +29,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStreamReader; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -267,8 +268,11 @@ public final class EtlCommons { public static final String CADD_FILE_ID = "CADD"; // Regulation + public static final String REGULATION_NAME = "Regulation"; public static final String REGULATION_DATA = "regulation"; - public static final String REGULATION_SUBDIRECTORY = "regulation"; + public static final String REGULATION_SUBDIRECTORY = REGULATION_DATA; + public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm"; + public static final String REGULATORY_REGION_BASENAME = "regulatory_region"; // Regulatory build and motif features (see Ensembl files: regulatory build and 
motif features files) public static final String REGULATORY_BUILD_NAME = "Regulatory Build"; public static final String REGULATORY_BUILD_VERSION_FILENAME = "regulatoryBuild" + SUFFIX_VERSION_FILENAME; @@ -530,6 +534,18 @@ public static String getFilenameFromUrl(String url) { return Paths.get(url).getFileName().toString(); } + public static void checkDirectory(Path path, String name) throws CellBaseException { + if (path == null) { + throw new CellBaseException(name + " directory is null"); + } + if (!Files.exists(path)) { + throw new CellBaseException(name + " directory " + path + " does not exist"); + } + if (!Files.isDirectory(path)) { + throw new CellBaseException(name + " directory " + path + " is not a directory"); + } + } + private static String getMissingFileIdMessage(String fileId) { return "File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase configuration file"; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index d1ae5fb205..c8067661dc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -16,63 +16,149 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.feature.gff.Gff2; import org.opencb.biodata.formats.feature.gff.io.Gff2Reader; import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.models.core.RegulatoryFeature; +import org.opencb.biodata.models.core.RegulatoryPfm; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import 
java.io.File; import java.io.IOException; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashSet; +import java.util.List; import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.opencb.cellbase.lib.EtlCommons.*; public class RegulatoryFeatureBuilder extends CellBaseBuilder { - private final Path gffFile; - protected Set regulatoryFeatureSet; + private Path regulationPath; + + private Set regulatoryFeatureSet; - public RegulatoryFeatureBuilder(Path regulatoryDirectoryPath, CellBaseSerializer serializer) { + public RegulatoryFeatureBuilder(Path regulationPath, CellBaseSerializer serializer) { super(serializer); - // TODO: fix it ! - gffFile = null; -// gffFile = regulatoryDirectoryPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); + this.regulationPath = regulationPath; } @Override public void parse() throws Exception { - logger.info("Parsing regulatory features..."); - if (Files.exists(gffFile)) { - parseGffFile(gffFile); - } else { - // TODO: fix it -// logger.warn("No regulatory features GFF file found {}", EtlCommons.REGULATORY_FEATURES_FILE); - logger.warn("Skipping regulatory features GFF file parsing. 
Regulatory feature data models will not be built."); + logger.info(BUILDING_LOG_MESSAGE, REGULATION_NAME); + + // Sanity check + checkDirectory(regulationPath, REGULATION_NAME); + + // Check build regulatory files + List regulatoryFiles = checkFiles(dataSourceReader.readValue(regulationPath.resolve(REGULATORY_BUILD_VERSION_FILENAME) + .toFile()), regulationPath, REGULATION_NAME + "/" + REGULATORY_BUILD_NAME); + if (regulatoryFiles.size() != 1) { + throw new CellBaseException("One " + REGULATORY_BUILD_NAME + " file is expected, but currently there are " + + regulatoryFiles.size() + " files"); } + + // Check motif features files + List motifFeaturesFiles = checkFiles(dataSourceReader.readValue(regulationPath.resolve(MOTIF_FEATURES_VERSION_FILENAME) + .toFile()), regulationPath, REGULATION_NAME + "/" + MOTIF_FEATURES_NAME); + if (motifFeaturesFiles.size() != 2) { + throw new CellBaseException("Two " + MOTIF_FEATURES_NAME + " files are expected, but currently there are " + + motifFeaturesFiles.size() + " files"); + } + + // Downloading and building pfm matrices + File motifFile = motifFeaturesFiles.get(0).getName().endsWith("tbi") ? 
motifFeaturesFiles.get(1) : motifFeaturesFiles.get(0); + loadPfmMatrices(motifFile.toPath(), serializer.getOutdir()); + + // Parse regulatory build features + parseGffFile(regulatoryFiles.get(0).toPath()); + + logger.info(BUILDING_DONE_LOG_MESSAGE, REGULATION_NAME); } protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSuchMethodException, FileFormatException { + logger.info(PARSING_LOG_MESSAGE, regulatoryFeatureFile); + + // Create and populate regulatory feature set regulatoryFeatureSet = new HashSet<>(); - if (regulatoryFeatureFile != null && Files.exists(regulatoryFeatureFile) && !Files.isDirectory(regulatoryFeatureFile) - && Files.size(regulatoryFeatureFile) > 0) { - Gff2Reader regulatoryFeatureReader = new Gff2Reader(regulatoryFeatureFile); + try (Gff2Reader regulatoryFeatureReader = new Gff2Reader(regulatoryFeatureFile)) { Gff2 feature; while ((feature = regulatoryFeatureReader.read()) != null) { regulatoryFeatureSet.add(feature); } - regulatoryFeatureReader.close(); } - int i = 0; // Serialize and save results for (Gff2 feature : regulatoryFeatureSet) { - // ID=TF_binding_site:ENSR00000243312; + // In order to get the ID we split the attribute format: ID=TF_binding_site:ENSR00000243312; .... 
String id = feature.getAttribute().split(";")[0].split(":")[1]; RegulatoryFeature regulatoryFeature = new RegulatoryFeature(id, feature.getSequenceName(), feature.getFeature(), feature.getStart(), feature.getEnd()); serializer.serialize(regulatoryFeature); } serializer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, regulatoryFeatureFile); + } + + private void loadPfmMatrices(Path motifGffFile, Path buildFolder) throws IOException, NoSuchMethodException, FileFormatException, + InterruptedException { + Path regulatoryPfmPath = buildFolder.resolve(REGULATORY_PFM_BASENAME + ".json.gz"); + logger.info("Downloading and building PFM matrices in {} from {} ...", regulatoryPfmPath, motifGffFile); + if (Files.exists(regulatoryPfmPath)) { + logger.info("{} is already built", regulatoryPfmPath); + return; + } + + Set motifIds = new HashSet<>(); + try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { + Gff2 tfbsMotifFeature; + Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); + while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { + String pfmId = getMatrixId(filePattern, tfbsMotifFeature); + if (StringUtils.isNotEmpty(pfmId)) { + motifIds.add(pfmId); + } + } + } + + ObjectMapper mapper = new ObjectMapper(); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, REGULATORY_PFM_BASENAME, true); + if (logger.isInfoEnabled()) { + logger.info("Looking up {} PFMs", motifIds.size()); + } + for (String pfmId : motifIds) { + String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId + + "?unit=frequencies;content-type=application/json"; + URL url = new URL(urlString); + RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); + serializer.serialize(regulatoryPfm); + // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits + TimeUnit.MILLISECONDS.sleep(250); + } + serializer.close(); + + logger.info("Downloading and building PFM matrices at {} done.", regulatoryPfmPath); + } + 
+ private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { + Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); + if (matcher.find()) { + return matcher.group(0); + } + return null; + } + + public Set getRegulatoryFeatureSet() { + return regulatoryFeatureSet; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index d11e907aa0..56d15bf844 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -89,60 +89,12 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); // Save data source (name, category, version,...) - saveDataSource(MOTIF_FEATURES_NAME, REGULATION_DATA, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), urls, + saveDataSource(MOTIF_FEATURES_NAME, REGULATION_DATA, "(" + ENSEMBL_NAME + " " + ensemblVersion + ")", getTimeStamp(), urls, regulationFolder.resolve(MOTIF_FEATURES_VERSION_FILENAME)); - // TODO: This will be executed in the CellBase build -// loadPfmMatrices(); - return downloadFiles; } -// private void loadPfmMatrices() -// throws IOException, NoSuchMethodException, FileFormatException, InterruptedException, CellBaseException { -// logger.info("Downloading and building pfm matrices..."); -// if (Files.exists(buildFolder.resolve("regulatory_pfm.json.gz"))) { -// logger.info("regulatory_pfm.json.gz is already built"); -// return; -// } -// Set motifIds = new HashSet<>(); -// Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); -// try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { -// Gff2 tfbsMotifFeature; -// Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); -// while ((tfbsMotifFeature = 
motifsFeatureReader.read()) != null) { -// String pfmId = getMatrixId(filePattern, tfbsMotifFeature); -// if (StringUtils.isNotEmpty(pfmId)) { -// motifIds.add(pfmId); -// } -// } -// } -// -// ObjectMapper mapper = new ObjectMapper(); -// CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_pfm", true); -// if (logger.isInfoEnabled()) { -// logger.info("Looking up {} pfms", motifIds.size()); -// } -// for (String pfmId : motifIds) { -// String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId -// + "?unit=frequencies;content-type=application/json"; -// URL url = new URL(urlString); -// RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); -// serializer.serialize(regulatoryPfm); -// // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits -// TimeUnit.MILLISECONDS.sleep(250); -// } -// serializer.close(); -// } -// -// private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { -// Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); -// if (matcher.find()) { -// return matcher.group(0); -// } -// return null; -// } - private DownloadFile downloadMirna() throws IOException, InterruptedException, CellBaseException { logger.info("Downloading {} ...", MIRBASE_NAME); return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_NAME, REGULATION_DATA, diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilderTest.java index 1bd36998b6..cde955fb63 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilderTest.java @@ -33,7 +33,7 @@ public void testParse() throws Exception { CellBaseSerializer serializer = new 
CellBaseJsonFileSerializer(Paths.get("/tmp/"), "regulatory_feature", true); RegulatoryFeatureBuilder parser = new RegulatoryFeatureBuilder(regulationDirectoryPath, serializer); parser.parse(); - Set features = parser.regulatoryFeatureSet; + Set features = parser.getRegulatoryFeatureSet(); assertEquals(1, features.size()); Gff2 feature = features.iterator().next(); From 959e42365fe80db418377e140fdbe464eb233f52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 12:11:06 +0200 Subject: [PATCH 080/107] core: update ontology section of the CellBase configuration since ontology versions will be taken from the OBO files content, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index a2330cd00c..af817b1844 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -247,22 +247,22 @@ download: ## OBO Ontologies hpoObo: host: http://purl.obolibrary.org/obo/ - version: "2024-03-01" + ## The version is retrieved from the OBO file files: HPO: hp.obo goObo: - host: http://purl.obolibrary.org/obo/go/ - version: "2024-03-01" + host: http://purl.obolibrary.org/obo/ + ## The version is retrieved from the OBO file files: GO: go/go-basic.obo doidObo: host: http://purl.obolibrary.org/obo/ - version: "2024-03-01" + ## The version is retrieved from the OBO file files: DOID: doid.obo mondoObo: host: http://purl.obolibrary.org/obo/ - version: "2024-03-01" + ## The version is retrieved from the OBO file files: MONDO: mondo.obo From 158c259a25fdd3f898401a6eea551d52e2c14180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 12:12:07 +0200 Subject: [PATCH 081/107] lib: update ontology download since ontology 
versions will be taken from the OBO files content; and improve log messages, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 3 +- .../lib/download/OntologyDownloadManager.java | 50 +++++++++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 707ecf2714..fafd01a0f3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -232,8 +232,9 @@ public final class EtlCommons { public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER"; // Ontology + public static final String ONTOLOGY_NAME = "Ontology"; public static final String ONTOLOGY_DATA = "ontology"; - public static final String ONTOLOGY_SUBDIRECTORY = "ontology"; + public static final String ONTOLOGY_SUBDIRECTORY = ONTOLOGY_DATA; // HPO public static final String HPO_OBO_NAME = "HPO"; public static final String HPO_OBO_VERSION_FILENAME = "hpoObo" + SUFFIX_VERSION_FILENAME; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index b09cf76f2f..4a91d84225 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -18,17 +18,22 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.commons.utils.FileUtils; +import java.io.BufferedReader; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; public class 
OntologyDownloadManager extends AbstractDownloadManager { + private static final String DATA_VERSION_FIELD = "data-version:"; + public OntologyDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, targetDirectory, configuration); @@ -37,31 +42,58 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec public List download() throws IOException, InterruptedException, CellBaseException { Path oboFolder = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); Files.createDirectories(oboFolder); - logger.info("Downloading {} files {} ...", ONTOLOGY_DATA, oboFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, ONTOLOGY_NAME); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // HPO - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_NAME, ONTOLOGY_DATA, - HPO_OBO_FILE_ID, HPO_OBO_VERSION_FILENAME, oboFolder); + downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, oboFolder); + String version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(HPO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(HPO_OBO_VERSION_FILENAME)); downloadFiles.add(downloadFile); // GO - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoObo(), GO_OBO_NAME, ONTOLOGY_DATA, - GO_OBO_FILE_ID, GO_OBO_VERSION_FILENAME, oboFolder); + downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(GO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(GO_OBO_VERSION_FILENAME)); downloadFiles.add(downloadFile); // DOID - downloadFile = 
downloadAndSaveDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_NAME, ONTOLOGY_DATA, - DOID_OBO_FILE_ID, DOID_OBO_VERSION_FILENAME, oboFolder); + downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(DOID_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(DOID_OBO_VERSION_FILENAME)); downloadFiles.add(downloadFile); // Mondo - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_NAME, ONTOLOGY_DATA, - MONDO_OBO_FILE_ID, MONDO_OBO_VERSION_FILENAME, oboFolder); + downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, oboFolder); + version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); + saveDataSource(MONDO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(MONDO_OBO_VERSION_FILENAME)); downloadFiles.add(downloadFile); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, ONTOLOGY_NAME); + return downloadFiles; } + + private String getVersionFromOboFile(Path oboPath) throws CellBaseException, IOException { + String version = null; + if (!oboPath.toFile().exists()) { + throw new CellBaseException("OBO file " + oboPath + " does not exit"); + } + try (BufferedReader reader = FileUtils.newBufferedReader(oboPath)) { + String line; + while ((line = reader.readLine()) != null) { + if (line.startsWith(DATA_VERSION_FIELD)) { + version = line.split(DATA_VERSION_FIELD)[1].trim(); + break; + } + } + } + return version; + } } From 0b83831ce59e8d98f553af7adadc317b6cc1830e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 13:21:38 +0200 Subject: [PATCH 082/107] app: update the build command executor to check/copy the ontology 
version files before creating the ontology builder, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 718595bfe5..42392f523b 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -204,10 +204,18 @@ private CellBaseBuilder buildRepeats() throws CellBaseException { return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration); } - private CellBaseBuilder buildObo() { - Path oboDir = downloadFolder.resolve(ONTOLOGY_DATA); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.OBO_JSON); - return new OntologyBuilder(oboDir, serializer); + private CellBaseBuilder buildObo() throws CellBaseException { + Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); + Path oboBuildPath = buildFolder.resolve(ONTOLOGY_SUBDIRECTORY); + List versionPaths = Arrays.asList(oboDownloadPath.resolve(HPO_OBO_VERSION_FILENAME), + oboDownloadPath.resolve(GO_OBO_VERSION_FILENAME), + oboDownloadPath.resolve(DOID_OBO_VERSION_FILENAME), + oboDownloadPath.resolve(MONDO_OBO_VERSION_FILENAME)); + copyVersionFiles(versionPaths, oboBuildPath); + + // Create serializer and return the ontology builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(oboBuildPath, OBO_BASENAME); + return new OntologyBuilder(oboDownloadPath, serializer); } /** From 39f0f4148000cfc881fb6ac7cf9732688d0582cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 13:23:18 +0200 Subject: [PATCH 083/107] lib: improve the ontology 
builder by removing hardcoded filenames, adding log messages and refactoring code, #TASK-5576, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 3 +- .../lib/builders/OntologyBuilder.java | 79 +++++++++++-------- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index fafd01a0f3..83b6d2d562 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -235,6 +235,7 @@ public final class EtlCommons { public static final String ONTOLOGY_NAME = "Ontology"; public static final String ONTOLOGY_DATA = "ontology"; public static final String ONTOLOGY_SUBDIRECTORY = ONTOLOGY_DATA; + public static final String OBO_BASENAME = "ontology"; // HPO public static final String HPO_OBO_NAME = "HPO"; public static final String HPO_OBO_VERSION_FILENAME = "hpoObo" + SUFFIX_VERSION_FILENAME; @@ -359,8 +360,6 @@ public final class EtlCommons { public static final String CLINICAL_VARIANTS_JSON_FILE = "clinical_variants.json.gz"; public static final String CLINICAL_VARIANTS_ANNOTATED_JSON_FILE = "clinical_variants.full.json.gz"; public static final String DOCM_NAME = "DOCM"; - - public static final String OBO_JSON = "ontology"; public static final String HPO_VERSION_FILE = "hpo" + SUFFIX_VERSION_FILENAME; public static final String GO_VERSION_FILE = "go" + SUFFIX_VERSION_FILENAME; public static final String DO_VERSION_FILE = "do" + SUFFIX_VERSION_FILENAME; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index cbe7c56952..679e0d30f8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ 
-19,60 +19,69 @@ import org.opencb.biodata.formats.obo.OboParser; import org.opencb.biodata.models.core.OntologyTerm; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; import java.nio.file.Path; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class OntologyBuilder extends CellBaseBuilder { - private Path hpoFile; - private Path goFile; - private Path doidFile; - private Path mondoFile; + private Path oboDownloadPath; - public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) { + public OntologyBuilder(Path oboDownloadPath, CellBaseSerializer serializer) { super(serializer); - // TODO: fix it !! -// hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); -// goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); -// doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); -// mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); + this.oboDownloadPath = oboDownloadPath; } @Override public void parse() throws Exception { - BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFile); - OboParser parser = new OboParser(); - List terms = parser.parseOBO(bufferedReader, "Human Phenotype Ontology"); - for (OntologyTerm term : terms) { - term.setSource("HP"); - serializer.serialize(term); - } + logger.info(BUILDING_LOG_MESSAGE, ONTOLOGY_NAME); - bufferedReader = FileUtils.newBufferedReader(goFile); - terms = parser.parseOBO(bufferedReader, "Gene Ontology"); - for (OntologyTerm term : terms) { - term.setSource("GO"); - serializer.serialize(term); - } + // Sanity check + checkDirectory(oboDownloadPath, REGULATION_NAME); - bufferedReader = FileUtils.newBufferedReader(doidFile); - terms = parser.parseOBO(bufferedReader, "Human Disease Ontology"); - for (OntologyTerm term : terms) { - 
term.setSource("DOID"); - serializer.serialize(term); - } + // Check ontology files + List hpoFiles = checkOboFiles(oboDownloadPath.resolve(HPO_OBO_VERSION_FILENAME), HPO_OBO_NAME); + List goFiles = checkOboFiles(oboDownloadPath.resolve(GO_OBO_VERSION_FILENAME), GO_OBO_NAME); + List doidFiles = checkOboFiles(oboDownloadPath.resolve(DOID_OBO_VERSION_FILENAME), DOID_OBO_NAME); + List mondoFiles = checkOboFiles(oboDownloadPath.resolve(MONDO_OBO_VERSION_FILENAME), MONDO_OBO_NAME); - bufferedReader = FileUtils.newBufferedReader(mondoFile); - terms = parser.parseOBO(bufferedReader, "Mondo Ontology"); - for (OntologyTerm term : terms) { - term.setSource("MONDO"); - serializer.serialize(term); - } + // Parse OBO files and build + parseOboFile(hpoFiles.get(0), HPO_OBO_NAME); + parseOboFile(goFiles.get(0), GO_OBO_NAME); + parseOboFile(doidFiles.get(0), DOID_OBO_NAME); + parseOboFile(mondoFiles.get(0), MONDO_OBO_NAME); + // Close serializer serializer.close(); + + logger.info(BUILDING_DONE_LOG_MESSAGE, ONTOLOGY_NAME); + } + + private void parseOboFile(File oboFile, String name) throws IOException { + logger.info(PARSING_LOG_MESSAGE, oboFile); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(oboFile.toPath())) { + OboParser parser = new OboParser(); + List terms = parser.parseOBO(bufferedReader, name); + for (OntologyTerm term : terms) { + serializer.serialize(term); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, oboFile); + } + + private List checkOboFiles(Path versionFilePath, String name) throws IOException, CellBaseException { + List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath, ONTOLOGY_NAME + "/" + name); + if (files.size() != 1) { + throw new CellBaseException("One " + name + " file is expected, but currently there are " + files.size() + " files"); + } + return files; } } From 5c3dae0c6ddd23345f1e03bf9ec0873a833d0788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: 
Thu, 25 Apr 2024 16:28:58 +0200 Subject: [PATCH 084/107] lib: improve the PharmGKB downloader by moving the function to unzip PharmGKB files from the downloader to the PharmGKB builder and adding log messages, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/download/PharmGKBDownloadManager.java | 44 +++++-------------- 2 files changed, 12 insertions(+), 33 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 83b6d2d562..56ad6dae8b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -155,6 +155,7 @@ public final class EtlCommons { public static final String SPLICE_SCORE_DATA = "splice_score"; // Pharmacogenomics + public static final String PHARMACOGENOMICS_NAME = "Pharmacogenomics"; public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; public static final String PHARMACOGENOMICS_SUBDIRECTORY = "pharmacogenomics"; // PharmGKB diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index f52c3f8a23..873387f94b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -19,16 +19,11 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.commons.exec.Command; -import org.opencb.commons.utils.FileUtils; import java.io.IOException; -import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.ArrayList; -import 
java.util.Collections; import java.util.List; import java.util.Map; @@ -43,49 +38,32 @@ public PharmGKBDownloadManager(String species, String assembly, Path targetDirec @Override public List download() throws IOException, InterruptedException, CellBaseException { - DownloadProperties.URLProperties pharmGKB = configuration.getDownload().getPharmGKB(); + logger.info(DOWNLOADING_LOG_MESSAGE, PHARMACOGENOMICS_NAME); + Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_SUBDIRECTORY).resolve(PHARMGKB_SUBDIRECTORY); Files.createDirectories(pharmgkbDownloadFolder); - logger.info("Downloading {} files at {} ...", PHARMGKB_DATA, pharmgkbDownloadFolder); + + DownloadProperties.URLProperties pharmGKBProps = configuration.getDownload().getPharmGKB(); List urls = new ArrayList<>(); List downloadFiles = new ArrayList<>(); - String host = pharmGKB.getHost(); - for (Map.Entry entry : pharmGKB.getFiles().entrySet()) { + String host = pharmGKBProps.getHost(); + for (Map.Entry entry : pharmGKBProps.getFiles().entrySet()) { String url = host + entry.getValue(); urls.add(url); - Path downloadedFileName = Paths.get(new URL(url).getPath()).getFileName(); - Path downloadedFilePath = pharmgkbDownloadFolder.resolve(downloadedFileName); - logger.info("Downloading file {} to {}", url, downloadedFilePath); + Path downloadedFilePath = pharmgkbDownloadFolder.resolve(getFilenameFromUrl(url)); + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, downloadedFilePath); DownloadFile downloadFile = downloadFile(url, downloadedFilePath.toString()); downloadFiles.add(downloadFile); - - // Unzip downloaded file - unzip(downloadedFilePath.getParent(), downloadedFileName.toString(), Collections.emptyList(), - pharmgkbDownloadFolder.resolve(downloadedFileName.toString().split("\\.")[0])); } // Save versions - saveDataSource(PHARMGKB_NAME, PHARMACOGENOMICS_DATA, pharmGKB.getVersion(), getTimeStamp(), urls, + saveDataSource(PHARMGKB_NAME, PHARMACOGENOMICS_NAME, pharmGKBProps.getVersion(), 
getTimeStamp(), urls, pharmgkbDownloadFolder.resolve(PHARMGKB_VERSION_FILENAME)); - return downloadFiles; - } - - private void unzip(Path inPath, String zipFilename, List outFilenames, Path outPath) throws IOException { - // Check zip file exists - FileUtils.checkFile(inPath.resolve(zipFilename)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PHARMACOGENOMICS_NAME); - // Unzip files if output dir does NOT exist - if (!outPath.toFile().exists()) { - logger.info("Unzipping {} into {}", zipFilename, outPath); - Command cmd = new Command("unzip -d " + outPath + " " + inPath.resolve(zipFilename)); - cmd.run(); - // Check if expected files exist - for (String outFilename : outFilenames) { - FileUtils.checkFile(outPath.resolve(outFilename)); - } - } + return downloadFiles; } } From 971235e7bff2dbe1ce27efa6898ebf520757b8b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 18:47:04 +0200 Subject: [PATCH 085/107] lib: improve the PharmGKB builder by adding checks and log messages; and move the function to unzip PharmGKB files from the downloader to the builder, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 23 ++-- .../org/opencb/cellbase/lib/EtlCommons.java | 8 +- .../lib/builders/PharmGKBBuilder.java | 127 +++++++++++++----- 3 files changed, 104 insertions(+), 54 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 42392f523b..8225648820 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -409,22 +409,15 @@ private CellBaseBuilder buildPubMed() throws IOException { return new PubMedBuilder(pubmedInputFolder, serializer); } - private CellBaseBuilder 
buildPharmacogenomics() throws IOException { - Path inFolder = downloadFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); - Path outFolder = buildFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); - if (!outFolder.toFile().exists()) { - outFolder.toFile().mkdirs(); - } - - logger.info("Copying PharmGKB version file..."); - if (inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME).toFile().exists()) { - Files.copy(inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), - outFolder.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), - StandardCopyOption.REPLACE_EXISTING); - } + private CellBaseBuilder buildPharmacogenomics() throws CellBaseException { + // Sanity check + Path pharmGkbDownloadPath = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + Path pharmGkbBuildPath = buildFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + copyVersionFiles(Arrays.asList(pharmGkbDownloadPath.resolve(PHARMGKB_VERSION_FILENAME)), pharmGkbBuildPath); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder); - return new PharmGKBBuilder(inFolder, serializer); + // Create the file serializer and the PharmGKB feature builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pharmGkbBuildPath); + return new PharmGKBBuilder(pharmGkbDownloadPath, serializer); } private void checkVersionFiles(List versionPaths) throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 56ad6dae8b..8b56fc9d0f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -161,8 +161,8 @@ public final class EtlCommons { // PharmGKB public static final String PHARMGKB_NAME = "PharmGKB"; public static final String PHARMGKB_DATA = "pharmgkb"; - public static final String 
PHARMGKB_SUBDIRECTORY = PHARMGKB_DATA; - public static final String PHARMGKB_VERSION_FILENAME = PHARMGKB_DATA + SUFFIX_VERSION_FILENAME; + public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; + public static final String PHARMGKB_VERSION_FILENAME = "pharmGKB" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String PHARMGKB_GENES_FILE_ID = "GENES"; public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS"; @@ -396,8 +396,8 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat // Check process output if (process.exitValue() != 0) { - String msg = "Error executing command '" + binPath + "'; error code = " + process.exitValue() + ". More info in log file: " - + logFilePath; + String msg = "Error executing command '" + binPath + "'; args = " + args + ", error code = " + process.exitValue() + + ". More info in log file: " + logFilePath; logger.error(msg); throw new CellBaseException(msg); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java index 1f7a4836ca..0e6017fc01 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java @@ -23,13 +23,16 @@ import org.opencb.biodata.models.core.Xref; import org.opencb.biodata.models.pharma.*; import org.opencb.biodata.models.pharma.guideline.BasicObject; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; @@ -37,8 +40,7 @@ public class PharmGKBBuilder 
extends CellBaseBuilder { - private final Path inputDir; - private final Path pharmGKBDir; + private final Path pharmGkbDownloadPath; private static final String CHEMICALS_BASENAME = "chemicals"; private static final String CHEMICALS_TSV_FILENAME = "chemicals.tsv"; @@ -88,21 +90,24 @@ public class PharmGKBBuilder extends CellBaseBuilder { private static final String PHARMGKB_LAST_UPDATE_DATE_KEY = "PHARMGKB_LAST_UPDATE_DATE"; private static final String PHARMGKB_IS_VIP_KEY = "PHARMGKB_IS_VIP"; - public PharmGKBBuilder(Path inputDir, CellBaseFileSerializer serializer) { + public PharmGKBBuilder(Path parmGkbDownloadPath, CellBaseFileSerializer serializer) { super(serializer); - - this.inputDir = inputDir; - this.pharmGKBDir = inputDir.resolve(PHARMGKB_DATA); + this.pharmGkbDownloadPath = parmGkbDownloadPath; } @Override public void parse() throws Exception { - // Check input folder - FileUtils.checkDirectory(inputDir); + logger.info(BUILDING_LOG_MESSAGE, PHARMGKB_NAME); + + // Sanity check + checkDirectory(pharmGkbDownloadPath, PHARMGKB_NAME); - // PharmGKB - FileUtils.checkDirectory(pharmGKBDir); - logger.info("Parsing {} files and building the data models...", PHARMGKB_NAME); + // Check PharmGKB files + List pharmGkbFiles = checkFiles(dataSourceReader.readValue(pharmGkbDownloadPath.resolve(PHARMGKB_VERSION_FILENAME).toFile()), + pharmGkbDownloadPath, PHARMACOGENOMICS_NAME + "/" + PHARMGKB_NAME); + + // Unzip downloaded file + unzipDownloadedFiles(pharmGkbFiles); // Parse chemical file Map chemicalsMap = parseChemicalFile(); @@ -113,8 +118,6 @@ public void parse() throws Exception { // Parse gene file parseGeneFile(chemicalsMap); - logger.info("Parsing {} files finished.", PHARMGKB_NAME); - // Generation the pharmacogenomics JSON file logger.info("Writing {} JSON file to {} ...", PHARMACOGENOMICS_DATA, serializer.getOutdir()); int counter = 0; @@ -125,11 +128,14 @@ public void parse() throws Exception { } } serializer.close(); - logger.info("Writing {} JSON file 
done!", PHARMACOGENOMICS_DATA); + + logger.info(BUILDING_DONE_LOG_MESSAGE, PHARMGKB_NAME); } private Map parseChemicalFile() throws IOException { - Path chemicalsFile = pharmGKBDir.resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME); + Path chemicalsFile = serializer.getOutdir().resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, chemicalsFile); + Map chemicalsMap = new HashMap<>(); try (BufferedReader br = FileUtils.newBufferedReader(chemicalsFile)) { // Skip first line, i.e. the header line @@ -177,6 +183,7 @@ private Map parseChemicalFile() throws IOException { } logger.info("Number of Chemical items read {}", chemicalsMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, chemicalsFile); return chemicalsMap; } @@ -192,8 +199,9 @@ private void parseClinicalAnnotationFiles(Map chemicalsM Map> variantMap = parseVariantFile(); // clinical_annotations.tsv - try (BufferedReader br = FileUtils.newBufferedReader(pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME) - .resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME))) { + Path clinAnnotPath = serializer.getOutdir().resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, clinAnnotPath); + try (BufferedReader br = FileUtils.newBufferedReader(clinAnnotPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); while ((line = br.readLine()) != null) { @@ -278,6 +286,7 @@ private void parseClinicalAnnotationFiles(Map chemicalsM } } } + logger.info(PARSING_DONE_LOG_MESSAGE, clinAnnotPath); // Update the clinical annotation map by parsing the clinical annotation evidences parseClinicalAnnotationEvidenceFile(variantAnnotationMap); @@ -300,7 +309,9 @@ private void parseClinicalAnnotationFiles(Map chemicalsM private Map> parseVariantFile() throws IOException { Map> variantMap = new HashMap<>(); // Parse the variant file (i.e., variants.tsv) - Path varPath = pharmGKBDir.resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME); + Path varPath = serializer.getOutdir().resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varPath); + try (BufferedReader br = FileUtils.newBufferedReader(varPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -367,6 +378,7 @@ private Map> parseVariantFile() throws IOException { } logger.info("Number of variants = {}", variantMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, varPath); return variantMap; } @@ -385,7 +397,8 @@ private void parseClinicalAnnotationEvidenceFile(Map variantAnnotationMap) throws IOException { // Parse the clinical annotation alleles file (i.e., clinical_ann_alleles.tsv) - Path allelesPath = pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME); + Path allelesPath = serializer.getOutdir().resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, allelesPath); try (BufferedReader br = FileUtils.newBufferedReader(allelesPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); @@ -502,12 +520,14 @@ private void parseClinicalAnnotationAlleleFile(Map variantAssociationMap) throws IOException { // For CellBase, variant association corresponds to PharmGKB variant annotation // Parse the variant annotation file (i.e., var_drug_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. the header line @@ -562,6 +582,7 @@ private void parseVariantAnnotationFile(Map va } } logger.info("Number of variant annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private Map parseGuidelineAnnotationFiles() throws IOException { @@ -571,7 +592,7 @@ private Map parseGuidelineAnnotationFiles() t ObjectReader objectReader = mapper.readerFor(PharmaGuidelineAnnotation.class); // Parse the guideline annotations JSON files - Path guidelinesPath = pharmGKBDir.resolve(GUIDELINE_ANNOTATIONS_BASENAME); + Path guidelinesPath = serializer.getOutdir().resolve(GUIDELINE_ANNOTATIONS_BASENAME); FileUtils.checkDirectory(guidelinesPath); for (File file : Objects.requireNonNull(guidelinesPath.toFile().listFiles())) { if (file.getName().endsWith("json")) { @@ -593,7 +614,8 @@ private Map parseGuidelineAnnotationFiles() t private Map parseDrugLabelAnnotationFile() throws IOException { Map drugLabelAnnotationMap = new HashMap<>(); // Parse the drug labels annotations file (i.e., drugLabels.tsv) - Path drugLabelPath = pharmGKBDir.resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME); + Path drugLabelPath = serializer.getOutdir().resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, drugLabelPath); try 
(BufferedReader br = FileUtils.newBufferedReader(drugLabelPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -631,12 +653,15 @@ private Map parseDrugLabelAnnotationFile() th } logger.info("Number of drug label annotations = {}", drugLabelAnnotationMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, drugLabelPath); return drugLabelAnnotationMap; } private void parsePhenotypeAnnotationFile(Map variantAssociationMap) throws IOException { // Parse the variant annotation file (i.e., var_pheno_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); + int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. the header line @@ -691,11 +716,13 @@ private void parsePhenotypeAnnotationFile(Map } } logger.info("Number of phenotype annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private void parseFunctionalAnnotationFile(Map variantAssociationMap) throws IOException { // Parse the variant annotation file (i.e., var_fa_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. 
the header line @@ -751,12 +778,14 @@ private void parseFunctionalAnnotationFile(Map } } logger.info("Number of variant annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private void parseStudyParameterFile(Map variantAssociationMap) throws IOException { Map> studyParametersMap = new HashMap<>(); // Parse the study parameters file (i.e., study_parameters.tsv) - Path studyParamsPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME); + Path studyParamsPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, studyParamsPath); try (BufferedReader br = FileUtils.newBufferedReader(studyParamsPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -807,6 +836,7 @@ private void parseStudyParameterFile(Map varia } } logger.info("Number of study parameters lines = {}", studyParametersMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, studyParamsPath); for (Map.Entry> entry : studyParametersMap.entrySet()) { if (variantAssociationMap.containsKey(entry.getKey())) { @@ -861,7 +891,8 @@ private void parseGeneFile(Map chemicalsMap) throws IOEx // Parse the genes file (i.e., genes.tsv) Map geneAnnotationMapByPgkbGeneId = new HashMap<>(); - Path genesPath = pharmGKBDir.resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME); + Path genesPath = serializer.getOutdir().resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, genesPath); try (BufferedReader br = FileUtils.newBufferedReader(genesPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); @@ -940,13 +971,15 @@ private void parseGeneFile(Map chemicalsMap) throws IOEx } logger.info("Number of parsed genes = {}", geneAnnotationMapByPgkbGeneId.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, genesPath); } private void parseChemicalGeneRelationships(Map> pgkbGeneIdMapByChemicalName, Map geneAnnotationMapByPgkbGeneId) throws IOException { int counter = 0; // Parse the genes file (i.e., relationships.tsv) - Path relationshipsPath = pharmGKBDir.resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME); + Path relationshipsPath = serializer.getOutdir().resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, relationshipsPath); try (BufferedReader br = FileUtils.newBufferedReader(relationshipsPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -986,6 +1019,7 @@ private void parseChemicalGeneRelationships(Map> pgkbGeneIdM } } logger.info("Number of parsed {}-{} relationships = {}", GENE_ENTITY, CHEMICAL_ENTITY, counter); + logger.info(PARSING_DONE_LOG_MESSAGE, relationshipsPath); } private List stringFieldToList(String field) { @@ -1011,6 +1045,29 @@ private boolean isHaplotype(String value) { } private List getHaplotypeList(String value) { - return Arrays.stream(value.split(",")).map(s -> s.trim()).collect(Collectors.toList()); + return Arrays.stream(value.split(",")).map(String::trim).collect(Collectors.toList()); + } + + private void unzipDownloadedFiles(List pharmGkbFiles) throws CellBaseException { + // Unzip + for (File pharmGgkFile : pharmGkbFiles) { + logger.info("Unzip file: {}", pharmGgkFile); + try { + String outPath = serializer.getOutdir().resolve(pharmGgkFile.getName().split("\\.")[0]).toString(); + List params = Arrays.asList("-d", outPath, "-o", pharmGgkFile.toString()); + EtlCommons.runCommandLineProcess(null, "unzip", params, Paths.get(outPath + ".log").toString()); + } catch (CellBaseException e) { + if 
(pharmGgkFile.getName().contains(GUIDELINE_ANNOTATIONS_BASENAME)) { + // It fails because of long filenames, so it does not raise any exception + logger.warn(e.getMessage()); + } + } catch (IOException e) { + throw new CellBaseException("Error executing unzip in file " + pharmGgkFile, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing unzip in file " + pharmGgkFile, e); + } + } } } From cd444b0a037c3754f22ad0d25c04fc1c6c7f8d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 25 Apr 2024 19:05:22 +0200 Subject: [PATCH 086/107] lib: improve the PubMed downloader by adding log messages and fixing Sonar issues, #TASK-5775, #TASK-5564 --- .../executors/DownloadCommandExecutor.java | 1 + .../org/opencb/cellbase/lib/EtlCommons.java | 7 ++--- .../lib/download/PubMedDownloadManager.java | 26 ++++++++++++------- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index 8a763ae3c9..8da49800df 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -112,6 +112,7 @@ public void execute() throws CellBaseException { Thread.currentThread().interrupt(); throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } catch (Exception e) { + e.printStackTrace(); throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index
8b56fc9d0f..3733f6fb59 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -371,9 +371,10 @@ public final class EtlCommons { // PubMed public static final String PUBMED_NAME = "PubMed"; public static final String PUBMED_DATA = "pubmed"; - public static final String PUBMED_SUBDIRECTORY = PUBMED_DATA; - public static final String PUBMED_VERSION_FILENAME = PUBMED_DATA + SUFFIX_VERSION_FILENAME; - public static final String PUBMED_REGEX_FILE_ID = "PUBMED"; + public static final String PUBMED_SUBDIRECTORY = "pubmed"; + public static final String PUBMED_VERSION_FILENAME = "pubMed" + SUFFIX_VERSION_FILENAME; + // Must match the configuration file + public static final String PUBMED_REGEX_FILE_ID = "PUBMED_REGEX"; private EtlCommons() { throw new IllegalStateException("Utility class"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index 87e4ec8b98..106e3be709 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -27,6 +27,8 @@ import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class PubMedDownloadManager extends AbstractDownloadManager { public PubMedDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) @@ -36,13 +38,14 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto @Override public List download() throws IOException, InterruptedException, CellBaseException { - Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, PUBMED_NAME); + + Path pubmedFolder = 
downloadFolder.resolve(PUBMED_SUBDIRECTORY); Files.createDirectories(pubmedFolder); - logger.info("Downloading {} files at {} ...", EtlCommons.PUBMED_DATA, pubmedFolder); // Downloads PubMed XML files - String url = configuration.getDownload().getPubmed().getHost(); - String regexp = configuration.getDownload().getPubmed().getFiles().get(EtlCommons.PUBMED_REGEX_FILE_ID); + String host = configuration.getDownload().getPubmed().getHost(); + String regexp = configuration.getDownload().getPubmed().getFiles().get(PUBMED_REGEX_FILE_ID); String[] name = regexp.split("[\\[\\]]"); String[] split = name[1].split("\\.\\."); int start = Integer.parseInt(split[0]); @@ -51,13 +54,18 @@ public List download() throws IOException, InterruptedException, C List downloadFiles = new ArrayList<>(); for (int i = start; i <= end; i++) { - String filename = name[0] + String.format("%0" + padding + "d", i) + name[2]; - logger.info("\tDownloading from {} to {} ", url + "/" + filename, pubmedFolder.resolve(filename)); - downloadFiles.add(downloadFile(url + "/" + filename, pubmedFolder.resolve(filename).toString())); + String padString = "%0" + padding + "d"; + String filename = name[0] + String.format(padString, i) + name[2]; + String url = host + filename; + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, pubmedFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, pubmedFolder.resolve(filename).toString())); } - saveDataSource(EtlCommons.PUBMED_NAME, EtlCommons.PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), - Collections.singletonList(url), pubmedFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME)); + // Save data source + saveDataSource(EtlCommons.PUBMED_NAME, PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), + Collections.singletonList(host), pubmedFolder.resolve(PUBMED_VERSION_FILENAME)); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PUBMED_NAME); return downloadFiles; } From 
e19fe73dd77a1225bd088c368c890bf63071ef67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 11:13:48 +0200 Subject: [PATCH 087/107] lib: create maps to get the names, categories and version filenames from a given data, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 3733f6fb59..833767a1dc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -33,7 +33,9 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** * Created by fjlopez on 03/06/16. @@ -369,13 +371,27 @@ public final class EtlCommons { public static final String HGMD_FILE = "hgmd.vcf"; // PubMed - public static final String PUBMED_NAME = "PubMed"; public static final String PUBMED_DATA = "pubmed"; - public static final String PUBMED_SUBDIRECTORY = "pubmed"; - public static final String PUBMED_VERSION_FILENAME = "pubMed" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String PUBMED_REGEX_FILE_ID = "PUBMED_REGEX"; + // Utilities maps + private static Map dataNamesMap = new HashMap<>(); + private static Map dataCategoriesMap = new HashMap<>(); + private static Map dataVersionFilenamesMap = new HashMap<>(); + + static { + + // Populate data names map + dataNamesMap.put(PUBMED_DATA, "PubMed"); + + // Populate data categories map + dataCategoriesMap.put(PUBMED_DATA, "Publication"); + + // Populate data version filenames Map + dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME); + } + private EtlCommons() { throw new 
IllegalStateException("Utility class"); } @@ -551,4 +567,25 @@ public static void checkDirectory(Path path, String name) throws CellBaseExcepti private static String getMissingFileIdMessage(String fileId) { return "File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase configuration file"; } + + public static String getDataName(String data) throws CellBaseException { + if (!dataNamesMap.containsKey(data)) { + throw new CellBaseException("Name not found for data " + data); + } + return dataNamesMap.get(data); + } + + public static String getDataCategory(String data) throws CellBaseException { + if (!dataCategoriesMap.containsKey(data)) { + throw new CellBaseException("Category not found for data " + data); + } + return dataCategoriesMap.get(data); + } + + public static String getDataVersionFilename(String data) throws CellBaseException { + if (!dataVersionFilenamesMap.containsKey(data)) { + throw new CellBaseException("Version filename not found for data " + data); + } + return dataVersionFilenamesMap.get(data); + } } From a29afe3ac11fb660ce9fa256b254cbe0df2953c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 11:15:10 +0200 Subject: [PATCH 088/107] lib: update according to the EtlCommons changes, #TASK-5775, #TASK-5564 --- .../admin/executors/LoadCommandExecutor.java | 16 ++++--- .../lib/download/AbstractDownloadManager.java | 17 +++++++- .../lib/download/PubMedDownloadManager.java | 43 +++++++++++-------- 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index ca1a4a9a71..c750beb6aa 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -44,6 +44,8 @@ import java.util.List; import java.util.concurrent.ExecutionException; +import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA; + /** * Created by imedina on 03/02/15. */ @@ -81,7 +83,7 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, EtlCommons.ONTOLOGY_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, - EtlCommons.PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; + PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { loadOptions = loadCommandOptions.data.split(","); } @@ -289,7 +291,7 @@ public void execute() throws CellBaseException { loadSpliceScores(); break; } - case EtlCommons.PUBMED_DATA: { + case PUBMED_DATA: { // Load data, create index and update release loadPubMed(); break; @@ -536,7 +538,7 @@ private void loadSpliceScores(Path spliceFolder) throws IOException, ExecutionEx } private void loadPubMed() throws CellBaseException { - Path pubmedPath = input.resolve(EtlCommons.PUBMED_DATA); + Path pubmedPath = input.resolve(PUBMED_DATA); if (Files.exists(pubmedPath)) { // Load data @@ -544,7 +546,7 @@ private void loadPubMed() throws CellBaseException { if (file.isFile() && (file.getName().endsWith("gz"))) { logger.info("Loading file '{}'", file.getName()); try { - loadRunner.load(file.toPath(), EtlCommons.PUBMED_DATA, dataRelease); + loadRunner.load(file.toPath(), PUBMED_DATA, dataRelease); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException | IllegalAccessException | ExecutionException | IOException | InterruptedException | LoaderException e) { logger.error("Error loading file '{}': {}", file.getName(), e.toString()); @@ -552,11 +554,11 @@ 
private void loadPubMed() throws CellBaseException { } } // Create index - createIndex(EtlCommons.PUBMED_DATA); + createIndex(PUBMED_DATA); // Update release (collection and sources) - List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.PUBMED_VERSION_FILENAME)); - dataReleaseManager.update(dataRelease, EtlCommons.PUBMED_DATA, EtlCommons.PUBMED_DATA, sources); + List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.getDataVersionFilename(PUBMED_DATA))); + dataReleaseManager.update(dataRelease, PUBMED_DATA, PUBMED_DATA, sources); } else { logger.warn("PubMed folder {} not found", pubmedPath); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 193f2e146d..35fcc5a470 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -47,7 +47,7 @@ import java.time.LocalDateTime; import java.util.*; -import static org.opencb.cellbase.lib.EtlCommons.getFilenameFromUrl; +import static org.opencb.cellbase.lib.EtlCommons.*; public abstract class AbstractDownloadManager { @@ -201,6 +201,21 @@ protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblPrope return downloadFile(url, outFile.toString()); } + protected void saveDataSource(String data, String version, String date, List urls, Path versionFilePath) + throws IOException, CellBaseException { + String name = getDataName(data); + String category = getDataCategory(data); + DataSource dataSource = new DataSource(name, category, version, date, urls); + + if (StringUtils.isEmpty(version)) { + logger.warn("Version missing for data source {}/{}, using the date as version: {}", category, name, date); + dataSource.setVersion(date); + } + + dataSourceWriter.writeValue(versionFilePath.toFile(), 
dataSource); + } + + @Deprecated protected void saveDataSource(String name, String category, String version, String date, List urls, Path versionFilePath) throws IOException { DataSource dataSource = new DataSource(name, category, version, date, urls); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index 106e3be709..6451fd76aa 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -17,8 +17,8 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; @@ -38,35 +38,44 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, PUBMED_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(PUBMED_DATA)); - Path pubmedFolder = downloadFolder.resolve(PUBMED_SUBDIRECTORY); - Files.createDirectories(pubmedFolder); + Path pubmedDownloadFolder = downloadFolder.resolve(PUBMED_DATA); + Files.createDirectories(pubmedDownloadFolder); // Downloads PubMed XML files String host = configuration.getDownload().getPubmed().getHost(); - String regexp = configuration.getDownload().getPubmed().getFiles().get(PUBMED_REGEX_FILE_ID); + List filenames = getPubMedFilenames(configuration.getDownload().getPubmed()); + List downloadFiles = new ArrayList<>(); + for (String filename : filenames) { + String url = host + filename; + logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, 
pubmedDownloadFolder.resolve(filename)); + downloadFiles.add(downloadFile(url, pubmedDownloadFolder.resolve(filename).toString())); + } + + // Save data source + saveDataSource(PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), Collections.singletonList(host), + pubmedDownloadFolder.resolve(getDataVersionFilename(PUBMED_DATA))); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(PUBMED_DATA)); + + return downloadFiles; + } + + public static List getPubMedFilenames(DownloadProperties.URLProperties pubMedProps) { + String regexp = pubMedProps.getFiles().get(PUBMED_REGEX_FILE_ID); String[] name = regexp.split("[\\[\\]]"); String[] split = name[1].split("\\.\\."); int start = Integer.parseInt(split[0]); int end = Integer.parseInt(split[1]); int padding = Integer.parseInt(split[2]); - List downloadFiles = new ArrayList<>(); + List filenames = new ArrayList<>(); for (int i = start; i <= end; i++) { String padString = "%0" + padding + "d"; String filename = name[0] + String.format(padString, i) + name[2]; - String url = host + filename; - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, pubmedFolder.resolve(filename)); - downloadFiles.add(downloadFile(url, pubmedFolder.resolve(filename).toString())); + filenames.add(filename); } - - // Save data source - saveDataSource(EtlCommons.PUBMED_NAME, PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(), - Collections.singletonList(host), pubmedFolder.resolve(PUBMED_VERSION_FILENAME)); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PUBMED_NAME); - - return downloadFiles; + return filenames; } } From 377ee9c8292cd1c619c7eff039d292be5e32dc53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 11:16:17 +0200 Subject: [PATCH 089/107] lib: improve PubMed builder by adding checks, log messages and fixing Sonar issues, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 23 +++---
.../cellbase/lib/builders/PubMedBuilder.java | 72 ++++++++++--------- 2 files changed, 48 insertions(+), 47 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 8225648820..620f1973b2 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -391,22 +391,15 @@ private CellBaseBuilder buildSplice() throws IOException { return new SpliceBuilder(spliceInputFolder, serializer); } - private CellBaseBuilder buildPubMed() throws IOException { - Path pubmedInputFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); - Path pubmedOutputFolder = buildFolder.resolve(EtlCommons.PUBMED_DATA); - if (!pubmedOutputFolder.toFile().exists()) { - pubmedOutputFolder.toFile().mkdirs(); - } - - logger.info("Copying PubMed version file..."); - if (pubmedInputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME).toFile().exists()) { - Files.copy(pubmedInputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME), - pubmedOutputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME), - StandardCopyOption.REPLACE_EXISTING); - } + private CellBaseBuilder buildPubMed() throws IOException, CellBaseException { + // Sanity check + Path pubMedDownloadPath = downloadFolder.resolve(PUBMED_DATA); + Path pubMedBuildPath = buildFolder.resolve(PUBMED_DATA); + copyVersionFiles(Collections.singletonList(pubMedDownloadPath.resolve(getDataVersionFilename(PUBMED_DATA))), pubMedBuildPath); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubmedOutputFolder); - return new PubMedBuilder(pubmedInputFolder, serializer); + // Create the file serializer and the PubMed builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubMedBuildPath); + return new 
PubMedBuilder(pubMedDownloadPath, serializer, configuration); } private CellBaseBuilder buildPharmacogenomics() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java index 8aba7c9dda..348d22a07d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java @@ -16,63 +16,71 @@ package org.opencb.cellbase.lib.builders; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectWriter; import org.opencb.biodata.formats.pubmed.PubMedParser; import org.opencb.biodata.formats.pubmed.v233jaxb.PubmedArticle; import org.opencb.biodata.formats.pubmed.v233jaxb.PubmedArticleSet; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.download.PubMedDownloadManager; import org.opencb.commons.utils.FileUtils; -import org.slf4j.LoggerFactory; -import java.io.File; +import java.nio.file.Files; import java.nio.file.Path; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA; +import static org.opencb.cellbase.lib.EtlCommons.getDataName; + public class PubMedBuilder extends CellBaseBuilder { - private Path pubmedDir; - private CellBaseFileSerializer fileSerializer; + private Path pubMedDownloadPath; + private CellBaseConfiguration configuration; - public PubMedBuilder(Path pubmedDir, CellBaseFileSerializer serializer) { + public PubMedBuilder(Path pubMedDownloadPath, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) { super(serializer); - - this.fileSerializer = serializer; - this.pubmedDir = pubmedDir; - - logger = LoggerFactory.getLogger(PubMedBuilder.class); + 
this.pubMedDownloadPath = pubMedDownloadPath; + this.configuration = configuration; } @Override public void parse() throws Exception { - // Check input folder - FileUtils.checkPath(pubmedDir); + logger.info(BUILDING_LOG_MESSAGE, getDataName(PUBMED_DATA)); - logger.info("Parsing PubMed files..."); + // Check input folder + FileUtils.checkPath(pubMedDownloadPath); - for (File file : pubmedDir.toFile().listFiles()) { - if (file.isFile() && (file.getName().endsWith("gz") || file.getName().endsWith("xml"))) { - String name = file.getName().split("\\.")[0]; + // Check PubMed files before parsing them + List pubMedFilenames = PubMedDownloadManager.getPubMedFilenames(configuration.getDownload().getPubmed()); + for (String pubMedFilename : pubMedFilenames) { + Path pubMedPath = pubMedDownloadPath.resolve(pubMedFilename); + if (!Files.exists(pubMedPath)) { + throw new CellBaseException("Expected PubMed file " + pubMedFilename + ", but it was not found at " + pubMedDownloadPath); + } + } + for (String pubMedFilename : pubMedFilenames) { + Path pubMedPath = pubMedDownloadPath.resolve(pubMedFilename); + String basename = pubMedFilename.split("\\.")[0]; - ObjectWriter objectWriter = new ObjectMapper().writerFor(PubmedArticle.class); - PubmedArticleSet pubmedArticleSet = (PubmedArticleSet) PubMedParser.loadXMLInfo(file.getAbsolutePath()); + PubmedArticleSet pubmedArticleSet = (PubmedArticleSet) PubMedParser.loadXMLInfo(pubMedPath.toAbsolutePath().toString()); - List objects = pubmedArticleSet.getPubmedArticleOrPubmedBookArticle(); - logger.info("Parsing PubMed file {} of {} articles ...", file.getName(), objects.size()); - int counter = 0; - for (Object object : objects) { - PubmedArticle pubmedArticle = (PubmedArticle) object; - fileSerializer.serialize(pubmedArticle, name); - if (++counter % 2000 == 0) { - logger.info("\t\t" + counter + " articles"); - } + List objects = pubmedArticleSet.getPubmedArticleOrPubmedBookArticle(); + logger.info(PARSING_LOG_MESSAGE, pubMedPath); + 
int counter = 0; + for (Object object : objects) { + PubmedArticle pubmedArticle = (PubmedArticle) object; + ((CellBaseFileSerializer) serializer).serialize(pubmedArticle, basename); + if (++counter % 2000 == 0) { + logger.info("{} articles", counter); } - fileSerializer.close(); - logger.info("\t\tDone: " + counter + " articles."); } + serializer.close(); + + String logMsg = pubMedPath + " (" + counter + " articles)"; + logger.info(PARSING_DONE_LOG_MESSAGE, logMsg); } - logger.info("Parsing PubMed files finished."); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PUBMED_DATA)); } } From 997c8ec62d1e4fa3463a78a10483f9cddcaa668c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 15:38:44 +0200 Subject: [PATCH 090/107] lib: update CADD downloader according to last changes, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 15 +++++----- .../lib/download/AbstractDownloadManager.java | 25 +++++++++++++++- .../lib/download/CaddDownloadManager.java | 29 ++++++++++--------- 3 files changed, 48 insertions(+), 21 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 833767a1dc..da209c66fc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -265,10 +265,8 @@ public final class EtlCommons { // Variation functional score public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score"; - public static final String VARIATION_FUNCTIONAL_SCORE_SUBDIRECTORY = "variation_functional_score"; // CADD scores - public static final String CADD_NAME = "CADD"; - public static final String CADD_VERSION_FILENAME = "cadd" + SUFFIX_VERSION_FILENAME; + public static final String CADD_DATA = "cadd"; // Must match the configuration file public static final String 
CADD_FILE_ID = "CADD"; @@ -299,7 +297,6 @@ public final class EtlCommons { public static final String GENOME_INFO_DATA = "genome_info"; public static final String DISGENET_DATA = "disgenet"; public static final String HPO_DATA = "hpo"; - public static final String CADD_DATA = "cadd"; public static final String PPI_DATA = "ppi"; public static final String DRUG_DATA = "drug"; @@ -384,12 +381,16 @@ public final class EtlCommons { // Populate data names map dataNamesMap.put(PUBMED_DATA, "PubMed"); + dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Scores"); + dataNamesMap.put(CADD_DATA, "CADD"); // Populate data categories map dataCategoriesMap.put(PUBMED_DATA, "Publication"); + dataCategoriesMap.put(CADD_DATA, dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA)); // Populate data version filenames Map dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME); } private EtlCommons() { @@ -570,21 +571,21 @@ private static String getMissingFileIdMessage(String fileId) { public static String getDataName(String data) throws CellBaseException { if (!dataNamesMap.containsKey(data)) { - throw new CellBaseException("Name not found for data " + data); + throw new CellBaseException("Name not found for data '" + data + "'"); } return dataNamesMap.get(data); } public static String getDataCategory(String data) throws CellBaseException { if (!dataCategoriesMap.containsKey(data)) { - throw new CellBaseException("Category not found for data " + data); + throw new CellBaseException("Category not found for data '" + data + "'"); } return dataCategoriesMap.get(data); } public static String getDataVersionFilename(String data) throws CellBaseException { if (!dataVersionFilenamesMap.containsKey(data)) { - throw new CellBaseException("Version filename not found for data " + data); + throw new CellBaseException("Version filename not found for data '" + data + "'"); } return 
dataVersionFilenamesMap.get(data); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 35fcc5a470..a05760f686 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -52,8 +52,10 @@ public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; - protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {} done!"; + protected static final String CATEGORY_DOWNLOADING_LOG_MESSAGE = "Downloading {}/{} ..."; + protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {}/{} done!"; + protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; protected String species; protected String assembly; @@ -141,12 +143,33 @@ protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) return hasInfo; } + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String data, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveDataSource(props, fileId, data, null, outPath); + } + + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String data, String chromosome, + Path outPath) throws IOException, InterruptedException, CellBaseException { + String versionFilename = getDataVersionFilename(data); + + // Download file + DownloadFile downloadFile = downloadDataSource(props, fileId, chromosome, outPath); + + // Save data source + saveDataSource(data, props.getVersion(), getTimeStamp(), 
Collections.singletonList(downloadFile.getUrl()), + outPath.resolve(versionFilename)); + + return downloadFile; + } + + @Deprecated protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, String versionFilename, Path outPath) throws IOException, InterruptedException, CellBaseException { return downloadAndSaveDataSource(props, fileId, name, category, null, versionFilename, outPath); } + @Deprecated protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, String chromosome, String versionFilename, Path outPath) throws IOException, InterruptedException, CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java index 738c66f3f1..0b0d09f412 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java @@ -36,22 +36,25 @@ public CaddDownloadManager(String species, String assembly, Path targetDirectory @Override public List download() throws IOException, InterruptedException, CellBaseException { - if (!speciesHasInfoToDownload(speciesConfiguration, VARIATION_FUNCTIONAL_SCORE_DATA)) { - return null; + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); + + if (!speciesHasInfoToDownload(speciesConfiguration, VARIATION_FUNCTIONAL_SCORE_DATA) + || !speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info("{}/{} not supported for species {}", getDataCategory(CADD_DATA), getDataName(CADD_DATA), + speciesConfiguration.getScientificName()); + return Collections.emptyList(); } - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - Path 
variationFunctionalScoreFolder = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_SUBDIRECTORY); - Files.createDirectories(variationFunctionalScoreFolder); - logger.info("Downloading {} files at {} ...", CADD_NAME, variationFunctionalScoreFolder); + // Create the CADD download path + Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + Files.createDirectories(caddDownloadPath); - // Download CADD and save data source - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_NAME, - VARIATION_FUNCTIONAL_SCORE_DATA, CADD_VERSION_FILENAME, variationFunctionalScoreFolder); + // Download CADD and save data source + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_DATA, + caddDownloadPath); - return Collections.singletonList(downloadFile); - } - logger.warn("CADD scores are not supported for {}", speciesConfiguration.getScientificName()); - return Collections.emptyList(); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA)); + + return Collections.singletonList(downloadFile); } } From 96078b7e22d63ee03c4c458ef22c8ef90f23c43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 16:13:57 +0200 Subject: [PATCH 091/107] lib: improve the CADD builder by adding checks, log messages, cleaning code and fixing sonnar issues, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 17 +- .../org/opencb/cellbase/lib/EtlCommons.java | 2 + .../lib/builders/CaddScoreBuilder.java | 233 ++++++++---------- .../lib/builders/CellBaseBuilder.java | 7 +- 4 files changed, 126 insertions(+), 133 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java 
b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 620f1973b2..355e218600 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -264,12 +264,15 @@ private CellBaseBuilder buildRefSeq() { return new RefSeqGeneBuilder(refseqFolderPath, speciesConfiguration, serializer); } - private CellBaseBuilder buildCadd() { - Path variationFunctionalScorePath = downloadFolder.resolve("variation_functional_score"); - copyVersionFiles(Arrays.asList(variationFunctionalScorePath.resolve("caddVersion.json"))); - Path caddFilePath = variationFunctionalScorePath.resolve("whole_genome_SNVs.tsv.gz"); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "cadd"); - return new CaddScoreBuilder(caddFilePath, serializer); + private CellBaseBuilder buildCadd() throws CellBaseException { + // Sanity check + Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + Path caddBuildPath = buildFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + copyVersionFiles(Collections.singletonList(caddDownloadPath.resolve(getDataVersionFilename(CADD_DATA))), caddBuildPath); + + // Create the file serializer and the protein builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(caddBuildPath, CADD_DATA); + return new CaddScoreBuilder(caddDownloadPath, serializer); } private CellBaseBuilder buildRevel() { @@ -391,7 +394,7 @@ private CellBaseBuilder buildSplice() throws IOException { return new SpliceBuilder(spliceInputFolder, serializer); } - private CellBaseBuilder buildPubMed() throws IOException, CellBaseException { + private CellBaseBuilder buildPubMed() throws CellBaseException { // Sanity check Path pubMedDownloadPath = downloadFolder.resolve(PUBMED_DATA); Path pubMedBuildPath = 
buildFolder.resolve(PUBMED_DATA); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index da209c66fc..7f0e97d900 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -267,6 +267,8 @@ public final class EtlCommons { public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score"; // CADD scores public static final String CADD_DATA = "cadd"; + public static final String CADD_RAW_DATA = "cadd_raw"; + public static final String CADD_SCALED_DATA = "cadd_scaled"; // Must match the configuration file public static final String CADD_FILE_ID = "CADD"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java index b593f44901..75b35e8a73 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java @@ -17,32 +17,33 @@ package org.opencb.cellbase.lib.builders; import org.opencb.biodata.models.core.GenomicScoreRegion; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; -import org.slf4j.LoggerFactory; import java.io.BufferedReader; +import java.io.File; import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 06/11/15. 
*/ public class CaddScoreBuilder extends CellBaseBuilder { - private Path caddFilePath; + private Path caddDownloadPath; private static final int CHUNK_SIZE = 1000; private static final int DECIMAL_RESOLUTION = 100; - public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) { + public CaddScoreBuilder(Path caddDownloadPath, CellBaseSerializer serializer) { super(serializer); - this.caddFilePath = caddFilePath; - - logger = LoggerFactory.getLogger(ConservationBuilder.class); + this.caddDownloadPath = caddDownloadPath; } /* Example: @@ -57,14 +58,25 @@ public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) { */ @Override public void parse() throws Exception { - FileUtils.checkPath(caddFilePath); + String dataName = getDataName(CADD_DATA); + String dataCategory = getDataCategory(CADD_DATA); + + logger.info(CATEGORY_BUILDING_LOG_MESSAGE, dataCategory, dataName); + + // Sanity check + checkDirectory(caddDownloadPath, dataName); + + // Check ontology files + List caddFiles = checkFiles(dataSourceReader.readValue(caddDownloadPath.resolve(getDataVersionFilename(CADD_DATA)).toFile()), + caddDownloadPath, dataName); + if (caddFiles.size() != 1) { + throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + caddFiles.size() + " files"); + } - BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFilePath); List rawValues = new ArrayList<>(CHUNK_SIZE); List scaledValues = new ArrayList<>(CHUNK_SIZE); int start = 1; -// int end = 1999; int end = CHUNK_SIZE - 1; String line; String[] fields = new String[0]; @@ -72,8 +84,8 @@ public void parse() throws Exception { int lineCount = 0; int counter = 1; int serializedChunks = 0; - int previousPosition = 0; - int newPosition = 0; + int prevPos = 0; + int newPos = 0; String chromosome = null; String[] nucleotides = new String[]{"A", "C", "G", "T"}; @@ -81,127 +93,100 @@ public void parse() throws Exception { long scaledLongValue = 0; Map 
rawScoreValuesMap = new HashMap<>(); Map scaledScoreValuesMap = new HashMap<>(); - while ((line = bufferedReader.readLine()) != null) { - if (!line.startsWith("#")) { - fields = line.split("\t"); - newPosition = Integer.parseInt(fields[1]); -// if (fields[0].equals("1") && fields[1].equals("249240621")) { -// if (fields[0].equals("1") && fields[1].equals("69100")) { -// if (fields[0].equals("1") && fields[1].equals("144854598")) { -// logger.debug("line {} reached", line); -// logger.debug("Associated chunk count {}", serializedChunks); -// logger.debug("start {}", start); -// logger.debug("end {}", end); -// logger.debug("chunk size {}", CHUNK_SIZE); -// } - // this only happens the first time, when we start reading the file - if (chromosome == null) { - logger.info("Parsing chr {} ", fields[0]); - chromosome = fields[0]; - - start = newPosition; - previousPosition = newPosition; - end = start + CHUNK_SIZE - 2; - } - if (!chromosome.equals(fields[0])) { - logger.info("Parsing chr {} ", fields[0]); - // both raw and scaled are serialized - GenomicScoreRegion genomicScoreRegion = - new GenomicScoreRegion<>(chromosome, start, previousPosition, "cadd_raw", rawValues); - serializer.serialize(genomicScoreRegion); - - genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, previousPosition, "cadd_scaled", scaledValues); - serializer.serialize(genomicScoreRegion); - - serializedChunks++; - chromosome = fields[0]; - start = newPosition; -// end = CHUNK_SIZE - 1; - end = start + CHUNK_SIZE - 2; - - counter = 0; - rawValues.clear(); - scaledValues.clear(); -// rawLongValue = 0; -// lineCount = 0; -// rawScoreValuesMap.clear(); -// scaledScoreValuesMap.clear(); - // The series of cadd scores is not continuous through the whole chromosome - } else if (end < newPosition || (newPosition - previousPosition) > 1) { - // both raw and scaled are serialized - GenomicScoreRegion genomicScoreRegion - = new GenomicScoreRegion<>(fields[0], start, previousPosition, "cadd_raw", 
rawValues); - serializer.serialize(genomicScoreRegion); - - genomicScoreRegion - = new GenomicScoreRegion<>(fields[0], start, previousPosition, "cadd_scaled", scaledValues); - serializer.serialize(genomicScoreRegion); - - serializedChunks++; - start = newPosition; -// start = end + 1; -// end += CHUNK_SIZE; - end = (start / CHUNK_SIZE) * CHUNK_SIZE + CHUNK_SIZE - 1; - - counter = 0; - rawValues.clear(); - scaledValues.clear(); - } + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFiles.get(0).toPath())) { + while ((line = bufferedReader.readLine()) != null) { + if (!line.startsWith("#")) { + fields = line.split("\t"); + newPos = Integer.parseInt(fields[1]); + String message = "chrom. " + fields[0]; + // This only happens the first time, when we start reading the file + if (chromosome == null) { + logger.info(PARSING_LOG_MESSAGE, message); + chromosome = fields[0]; + + start = newPos; + prevPos = newPos; + end = start + CHUNK_SIZE - 2; + } - rawScoreValuesMap.put(fields[3], Float.valueOf(fields[4])); - scaledScoreValuesMap.put(fields[3], Float.valueOf(fields[5])); - - if (++lineCount == 3) { -// if (fields[0].equals("1") && fields[1].equals("249240621")) { -// if (fields[0].equals("1") && fields[1].equals("69100")) { -// if (fields[0].equals("1") && fields[1].equals("144854598")) { -// logger.info("offset: {}", rawValues.size()); -// } - - for (String nucleotide : nucleotides) { - // raw CADD score values can be negative, we add 10 to make positive - float a = rawScoreValuesMap.getOrDefault(nucleotide, 10f) + 10.0f; - v = (short) (a * DECIMAL_RESOLUTION); - rawLongValue = (rawLongValue << 16) | v; - - // scaled CADD scores are always positive - a = scaledScoreValuesMap.getOrDefault(nucleotide, 0f); - v = (short) (a * DECIMAL_RESOLUTION); - scaledLongValue = (scaledLongValue << 16) | v; + if (!chromosome.equals(fields[0])) { + logger.info(PARSING_LOG_MESSAGE, message); + + // Both raw and scaled are serialized + GenomicScoreRegion 
genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, prevPos, CADD_RAW_DATA, + rawValues); + serializer.serialize(genomicScoreRegion); + + genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, prevPos, CADD_SCALED_DATA, scaledValues); + serializer.serialize(genomicScoreRegion); + + serializedChunks++; + chromosome = fields[0]; + start = newPos; + end = start + CHUNK_SIZE - 2; + + counter = 0; + rawValues.clear(); + scaledValues.clear(); + // The series of cadd scores is not continuous through the whole chromosome + } else if (end < newPos || (newPos - prevPos) > 1) { + // Both raw and scaled are serialized + GenomicScoreRegion genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, prevPos, CADD_RAW_DATA, + rawValues); + serializer.serialize(genomicScoreRegion); + + genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, prevPos, CADD_SCALED_DATA, scaledValues); + serializer.serialize(genomicScoreRegion); + + serializedChunks++; + start = newPos; + end = (start / CHUNK_SIZE) * CHUNK_SIZE + CHUNK_SIZE - 1; + + counter = 0; + rawValues.clear(); + scaledValues.clear(); } -// if (rawLongValue < 0 || scaledLongValue < 0) { -// logger.error("raw/scaled Long Values cannot be 0"); -// logger.error("Last read line {}", line); -// System.exit(1); -// } - rawValues.add(rawLongValue); - scaledValues.add(scaledLongValue); - - counter++; - rawLongValue = 0; - lineCount = 0; - rawScoreValuesMap.clear(); - scaledScoreValuesMap.clear(); + rawScoreValuesMap.put(fields[3], Float.valueOf(fields[4])); + scaledScoreValuesMap.put(fields[3], Float.valueOf(fields[5])); + + if (++lineCount == 3) { + for (String nucleotide : nucleotides) { + // Raw CADD score values can be negative, we add 10 to make positive + float a = rawScoreValuesMap.getOrDefault(nucleotide, 10f) + 10.0f; + v = (short) (a * DECIMAL_RESOLUTION); + rawLongValue = (rawLongValue << 16) | v; + + // Scaled CADD scores are always positive + a = 
scaledScoreValuesMap.getOrDefault(nucleotide, 0f); + v = (short) (a * DECIMAL_RESOLUTION); + scaledLongValue = (scaledLongValue << 16) | v; + } + + rawValues.add(rawLongValue); + scaledValues.add(scaledLongValue); + + counter++; + rawLongValue = 0; + lineCount = 0; + rawScoreValuesMap.clear(); + scaledScoreValuesMap.clear(); + } + prevPos = newPos; } - previousPosition = newPosition; } - } - // Last chunks can be incomplete for both raw and scaled are serialized -// GenomicScoreRegion genomicScoreRegion = -// new GenomicScoreRegion<>(fields[0], start, start + rawValues.size() - 1, "cadd_raw", rawValues); - GenomicScoreRegion genomicScoreRegion = - new GenomicScoreRegion<>(fields[0], start, newPosition, "cadd_raw", rawValues); - serializer.serialize(genomicScoreRegion); + // Last chunks can be incomplete for both raw and scaled are serialized + GenomicScoreRegion genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPos, CADD_RAW_DATA, rawValues); + serializer.serialize(genomicScoreRegion); + + genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPos, CADD_SCALED_DATA, scaledValues); + serializer.serialize(genomicScoreRegion); -// genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, start + scaledValues.size() - 1, "cadd_scaled", scaledValues); - genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPosition, "cadd_scaled", scaledValues); - serializer.serialize(genomicScoreRegion); + serializer.close(); + } - serializer.close(); - bufferedReader.close(); - logger.info("Parsing finished."); + logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 49d847c033..f5e79320d7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -44,8 +44,11 @@ public abstract class CellBaseBuilder { public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done."; - public static final String PARSING_LOG_MESSAGE = "Parsing file {} ..."; - public static final String PARSING_DONE_LOG_MESSAGE = "Parsing file {} done."; + public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ..."; + public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building {}/{} done."; + + public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; + public static final String PARSING_DONE_LOG_MESSAGE = "Parsing {} done."; public CellBaseBuilder(CellBaseSerializer serializer) { From 3163a90cfbfda44ec205279cbf21f2b947cc10a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Apr 2024 16:47:16 +0200 Subject: [PATCH 092/107] lib: update the REVEL downloader according to the last changes, and add log messages, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 8 +++-- .../MissenseScoresDownloadManager.java | 33 ++++++++++++++----- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 7f0e97d900..28a349e028 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -177,11 +177,9 @@ public final class EtlCommons { public static final String PHARMGKB_RELATIONSHIPS_FILE_ID = "RELATIONSHIPS"; // Missense variantion functional score - public static final String MISSENSE_VARIATION_SCORE_NAME = "Missense Variation Functional Scores"; public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score"; // 
Revel - public static final String REVEL_NAME = "Revel"; - public static final String REVEL_VERSION_FILENAME = "revel" + SUFFIX_VERSION_FILENAME; + public static final String REVEL_DATA = "revel"; // Must match the configuration file public static final String REVEL_FILE_ID = "REVEL"; @@ -385,14 +383,18 @@ public final class EtlCommons { dataNamesMap.put(PUBMED_DATA, "PubMed"); dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Scores"); dataNamesMap.put(CADD_DATA, "CADD"); + dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Scores"); + dataNamesMap.put(REVEL_DATA, "Revel"); // Populate data categories map dataCategoriesMap.put(PUBMED_DATA, "Publication"); dataCategoriesMap.put(CADD_DATA, dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA)); + dataCategoriesMap.put(REVEL_DATA, dataNamesMap.get(MISSENSE_VARIATION_SCORE_DATA)); // Populate data version filenames Map dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(REVEL_DATA, "revel" + SUFFIX_VERSION_FILENAME); } private EtlCommons() { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java index ca491a97fe..b2c102a10e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/MissenseScoresDownloadManager.java @@ -37,18 +37,33 @@ public MissenseScoresDownloadManager(String species, String assembly, Path targe @Override public List download() throws IOException, InterruptedException, CellBaseException { - return Collections.singletonList(downloadRevel()); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); + + DownloadFile downloadFile = downloadRevel(); + + 
logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MISSENSE_VARIATION_SCORE_DATA)); + + return Collections.singletonList(downloadFile); } public DownloadFile downloadRevel() throws IOException, InterruptedException, CellBaseException { - if (speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { - Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - Files.createDirectories(missensePredictionScorePath); - - logger.info("Downloading {}/{} ...", MISSENSE_VARIATION_SCORE_NAME, REVEL_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_FILE_ID, REVEL_NAME, - MISSENSE_VARIATION_SCORE_DATA, REVEL_VERSION_FILENAME, missensePredictionScorePath); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REVEL_DATA)); + if (!speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { + logger.info("{}/{} not supported for species {}", getDataCategory(REVEL_DATA), getDataName(REVEL_DATA), + speciesConfiguration.getScientificName()); + return null; } - return null; + + // Create the REVEL download path + Path revelDownloadPath = downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + Files.createDirectories(revelDownloadPath); + + // Download REVEL and save data source + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getRevel(), REVEL_FILE_ID, REVEL_DATA, + revelDownloadPath); + + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REVEL_DATA)); + + return downloadFile; } } From bc22fadd56a701b8ed5012dfa0485603323892a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 08:14:40 +0200 Subject: [PATCH 093/107] lib: add log messages, #TASK-5776, #TASK-5564 --- .../java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java index 75b35e8a73..d0597c4c2a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java @@ -94,6 +94,7 @@ public void parse() throws Exception { Map rawScoreValuesMap = new HashMap<>(); Map scaledScoreValuesMap = new HashMap<>(); + logger.info(PARSING_LOG_MESSAGE, caddFiles.get(0)); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFiles.get(0).toPath())) { while ((line = bufferedReader.readLine()) != null) { if (!line.startsWith("#")) { @@ -186,6 +187,7 @@ public void parse() throws Exception { serializer.close(); } + logger.info(PARSING_DONE_LOG_MESSAGE, caddFiles.get(0)); logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName); } From 0c9a29958198d13e925b9219a9b438171a814d59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 08:16:44 +0200 Subject: [PATCH 094/107] lib: improve the Revel builder by fixing sonnar issues and adding checks and log messages, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 14 +- .../lib/builders/RevelScoreBuilder.java | 126 ++++++++++-------- 2 files changed, 82 insertions(+), 58 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 355e218600..f1fdcbbb19 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -275,11 +275,15 @@ private CellBaseBuilder buildCadd() throws CellBaseException { return new 
CaddScoreBuilder(caddDownloadPath, serializer); } - private CellBaseBuilder buildRevel() { - Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - copyVersionFiles(Arrays.asList(missensePredictionScorePath.resolve("revelVersion.json"))); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - return new RevelScoreBuilder(missensePredictionScorePath, serializer); + private CellBaseBuilder buildRevel() throws CellBaseException { + // Sanity check + Path revelDownloadPath = downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + Path revelBuildPath = buildFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + copyVersionFiles(Collections.singletonList(revelDownloadPath.resolve(getDataVersionFilename(REVEL_DATA))), revelBuildPath); + + // Create the file serializer and the regulatory feature builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(revelBuildPath, REVEL_DATA); + return new RevelScoreBuilder(revelDownloadPath, serializer); } private CellBaseBuilder buildRegulation() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 2ccf0cb2a1..68c6128f25 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -19,8 +19,8 @@ import org.opencb.biodata.models.core.MissenseVariantFunctionalScore; import org.opencb.biodata.models.core.TranscriptMissenseVariantFunctionalScore; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.Path; @@ -30,75 +30,95 @@ import 
java.util.zip.ZipFile; import java.util.zip.ZipInputStream; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class RevelScoreBuilder extends CellBaseBuilder { - private Path revelFilePath = null; - private static final String SOURCE = "revel"; + private Path revelDownloadPath = null; - public RevelScoreBuilder(Path revelDirectoryPath, CellBaseSerializer serializer) { + public RevelScoreBuilder(Path revelDownloadPath, CellBaseSerializer serializer) { super(serializer); - this.revelFilePath = revelDirectoryPath.resolve("revel-v1.3_all_chromosomes.zip"); - logger = LoggerFactory.getLogger(ConservationBuilder.class); - + this.revelDownloadPath = revelDownloadPath; } @Override - public void parse() throws IOException { - logger.error("processing Revel file at " + revelFilePath.toAbsolutePath()); - ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelFilePath))); + public void parse() throws IOException, CellBaseException { + String dataName = getDataName(REVEL_DATA); + String dataCategory = getDataCategory(REVEL_DATA); + + logger.info(CATEGORY_BUILDING_LOG_MESSAGE, dataCategory, dataName); + + // Sanity check + checkDirectory(revelDownloadPath, dataName); + + // Check ontology files + List revelFiles = checkFiles(dataSourceReader.readValue(revelDownloadPath.resolve(getDataVersionFilename(REVEL_DATA)) + .toFile()), revelDownloadPath, dataName); + if (revelFiles.size() != 1) { + throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + revelFiles.size() + " files"); + } + + ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelDownloadPath))); ZipEntry zipEntry = zis.getNextEntry(); - ZipFile zipFile = new ZipFile(String.valueOf(revelFilePath)); - InputStream inputStream = zipFile.getInputStream(zipEntry); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); - - // skip header - String line = bufferedReader.readLine(); - String[] 
fields = null; - String lastEntry = null; - String currentEntry = null; - List scores = new ArrayList<>(); - MissenseVariantFunctionalScore predictions = null; - while ((line = bufferedReader.readLine()) != null) { - fields = line.split(","); - String chromosome = fields[0]; - if (".".equalsIgnoreCase(fields[2])) { - // 1,12855835,.,C,A,A,D,0.175 - // skip if invalid position - continue; - } - int position = Integer.parseInt(fields[2]); - String reference = fields[3]; - String alternate = fields[4]; - String aaReference = fields[5]; - String aaAlternate = fields[6]; - double score = Double.parseDouble(fields[7]); - - currentEntry = chromosome + position; - - // new chromosome + position, store previous entry - if (lastEntry != null && !currentEntry.equals(lastEntry)) { - serializer.serialize(predictions); - scores = new ArrayList<>(); - predictions = null; - } + logger.info(PARSING_LOG_MESSAGE, revelFiles.get(0)); - if (predictions == null) { - predictions = new MissenseVariantFunctionalScore(chromosome, position, reference, SOURCE, scores); + ZipFile zipFile = new ZipFile(revelFiles.get(0).toString()); + InputStream inputStream = zipFile.getInputStream(zipEntry); + try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) { + // Skip header + bufferedReader.readLine(); + String[] fields; + String lastEntry = null; + String currentEntry; + List scores = new ArrayList<>(); + MissenseVariantFunctionalScore predictions = null; + String line; + while ((line = bufferedReader.readLine()) != null) { + fields = line.split(","); + String chromosome = fields[0]; + if (".".equalsIgnoreCase(fields[2])) { + // 1,12855835,.,C,A,A,D,0.175 + // skip if invalid position + continue; + } + int position = Integer.parseInt(fields[2]); + String reference = fields[3]; + String alternate = fields[4]; + String aaReference = fields[5]; + String aaAlternate = fields[6]; + double score = Double.parseDouble(fields[7]); + + currentEntry = chromosome + 
position; + + // new chromosome + position, store previous entry + if (lastEntry != null && !currentEntry.equals(lastEntry)) { + serializer.serialize(predictions); + scores = new ArrayList<>(); + predictions = null; + } + + if (predictions == null) { + predictions = new MissenseVariantFunctionalScore(chromosome, position, reference, REVEL_DATA, scores); + } + + TranscriptMissenseVariantFunctionalScore predictedScore = new TranscriptMissenseVariantFunctionalScore("", alternate, + aaReference, aaAlternate, score); + scores.add(predictedScore); + lastEntry = chromosome + position; } - TranscriptMissenseVariantFunctionalScore predictedScore = new TranscriptMissenseVariantFunctionalScore("", - alternate, aaReference, aaAlternate, score); - scores.add(predictedScore); - lastEntry = chromosome + position; + // Serialise last entry + serializer.serialize(predictions); } - // serialise last entry - serializer.serialize(predictions); + logger.info(PARSING_DONE_LOG_MESSAGE, revelFiles.get(0)); + // Close zis.close(); zipFile.close(); inputStream.close(); - bufferedReader.close(); + + logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName); } } From 4f9e39a057b1d5ea42f8f6b36984731f4378857b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 11:51:43 +0200 Subject: [PATCH 095/107] lib: update CellBase downloaders according to the last changes, #TASK-5775, #TASK-5564 --- .../org/opencb/cellbase/lib/EtlCommons.java | 273 +++++++++++------- .../lib/download/AbstractDownloadManager.java | 19 ++ .../lib/download/GeneDownloadManager.java | 210 ++++++++------ .../lib/download/GenomeDownloadManager.java | 66 ++--- .../lib/download/OntologyDownloadManager.java | 24 +- .../lib/download/PharmGKBDownloadManager.java | 13 +- .../lib/download/ProteinDownloadManager.java | 18 +- .../download/RegulationDownloadManager.java | 39 +-- 8 files changed, 383 insertions(+), 279 deletions(-) diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 28a349e028..f2cc152005 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -21,6 +21,7 @@ import org.apache.logging.log4j.core.config.Configurator; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.commons.utils.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,6 +37,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; /** * Created by fjlopez on 03/06/16. @@ -43,7 +45,7 @@ public final class EtlCommons { // Ensembl - public static final String ENSEMBL_NAME = "Ensembl"; + public static final String ENSEMBL_DATA = "ensembl"; public static final String PUT_RELEASE_HERE_MARK = "put_release_here"; public static final String PUT_SPECIES_HERE_MARK = "put_species_here"; public static final String PUT_CAPITAL_SPECIES_HERE_MARK = "put_capital_species_here"; @@ -67,88 +69,65 @@ public final class EtlCommons { public static final String SUFFIX_VERSION_FILENAME = "Version.json"; - // Genome (Ensembl) - public static final String GENOME_NAME = "Genome"; + // Genome public static final String GENOME_DATA = "genome"; - public static final String GENOME_SUBDIRECTORY = GENOME_DATA; - public static final String GENOME_VERSION_FILENAME = GENOME_DATA + SUFFIX_VERSION_FILENAME; - // Gene (Ensembl) + // Gene public static final String GENE_DATA = "gene"; - public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME; + public static final String GENE_ANNOTATION_DATA = "gene_annotation"; + public static final String GENE_DISEASE_ANNOTATION_DATA = "gene_disease_annotation"; // RefSeq 
- public static final String REFSEQ_NAME = "RefSeq"; public static final String REFSEQ_DATA = "refseq"; - public static final String REFSEQ_VERSION_FILENAME = "refSeq" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; public static final String REFSEQ_PROTEIN_FAA_FILE_ID = "PROTEIN_FAA"; public static final String REFSEQ_RNA_FNA_FILE_ID = "RNA_FNA"; - // MANE Select - public static final String MANE_SELECT_NAME = "MANE Select"; - public static final String MANE_SELECT_VERSION_FILENAME = "maneSelect" + SUFFIX_VERSION_FILENAME; + // Gene annotation + // - MANE Select + public static final String MANE_SELECT_DATA = "MANE Select"; // Must match the configuration file public static final String MANE_SELECT_FILE_ID = "MANE_SELECT"; - - // LRG - public static final String LRG_NAME = "LRG"; - public static final String LRG_VERSION_FILENAME = "lrg" + SUFFIX_VERSION_FILENAME; + // - LRG + public static final String LRG_DATA = "lrg"; // Must match the configuration file public static final String LRG_FILE_ID = "LRG"; - - // HGNC - public static final String HGNC_NAME = "HGNC Gene"; - public static final String HGNC_VERSION_FILENAME = "hgnc" + SUFFIX_VERSION_FILENAME; + // - HGNC + public static final String HGNC_DATA = "hgnc"; // Must match the configuration file public static final String HGNC_FILE_ID = "HGNC"; - - // Cancer HotSpot - public static final String CANCER_HOTSPOT_NAME = "Cancer HotSpot"; - public static final String CANCER_HOTSPOT_VERSION_FILENAME = "cancerHotSpot" + SUFFIX_VERSION_FILENAME; + // - Cancer HotSpot + public static final String CANCER_HOTSPOT_DATA = "cancer_hotspot"; // Must match the configuration file public static final String CANCER_HOTSPOT_FILE_ID = "CANCER_HOTSPOT"; - - // DGID (drug) - public static final String DGIDB_NAME = "DGIdb"; - public static final String DGIDB_VERSION_FILENAME = 
"dgidb" + SUFFIX_VERSION_FILENAME; + // - DGID (drug) + public static final String DGIDB_DATA = "dgidb"; // Must match the configuration file public static final String DGIDB_FILE_ID = "DGIDB"; - - // UniProt Xref - public static final String UNIPROT_XREF_NAME = "UniProt Xref"; - public static final String UNIPROT_XREF_VERSION_FILENAME = "uniprotXref" + SUFFIX_VERSION_FILENAME; + // - UniProt Xref + public static final String UNIPROT_XREF_DATA = "uniprot_xref"; // Must match the configuration file public static final String UNIPROT_XREF_FILE_ID = "UNIPROT_XREF"; - - // Gene Expression Atlas - public static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas"; - public static final String GENE_EXPRESSION_ATLAS_VERSION_FILENAME = "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME; + // - Gene Expression Atlas + public static final String GENE_EXPRESSION_ATLAS_DATA = "gene_expression_atlas"; // Must match the configuration file public static final String GENE_EXPRESSION_ATLAS_FILE_ID = "GENE_EXPRESSION_ATLAS"; - - // Gene Disease Annotation + // - Gene Disease Annotation public static final String GENE_DISEASE_ANNOTATION_NAME = "Gene Disease Annotation"; - // HPO - public static final String HPO_NAME = "HPO"; - public static final String HPO_VERSION_FILENAME = "hpo" + SUFFIX_VERSION_FILENAME; - // DISGENET - public static final String DISGENET_NAME = "DisGeNet"; - public static final String DISGENET_VERSION_FILENAME = "disGeNet" + SUFFIX_VERSION_FILENAME; + // - HPO + public static final String HPO_DATA = "hpo"; + // - DISGENET + public static final String DISGENET_DATA = "disgenet"; // Must match the configuration file public static final String DISGENET_FILE_ID = "DISGENET"; - - // gnomAD Constraints - public static final String GNOMAD_CONSTRAINTS_NAME = "gnomAD Constraints"; - public static final String GNOMAD_CONSTRAINTS_VERSION_FILENAME = "gnomadConstraints" + SUFFIX_VERSION_FILENAME; + // - gnomAD Constraints + public static final String 
GNOMAD_CONSTRAINTS_DATA = "gnomad_constraints"; // Must match the configuration file public static final String GNOMAD_CONSTRAINTS_FILE_ID = "GNOMAD_CONSTRAINTS"; - - // GO Annotation - public static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation"; - public static final String GO_ANNOTATION_VERSION_FILENAME = "goAnnotation" + SUFFIX_VERSION_FILENAME; + // - GO Annotation + public static final String GO_ANNOTATION_DATA = "go_annotation"; // Must match the configuration file public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION"; @@ -157,14 +136,9 @@ public final class EtlCommons { public static final String SPLICE_SCORE_DATA = "splice_score"; // Pharmacogenomics - public static final String PHARMACOGENOMICS_NAME = "Pharmacogenomics"; public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; - public static final String PHARMACOGENOMICS_SUBDIRECTORY = "pharmacogenomics"; // PharmGKB - public static final String PHARMGKB_NAME = "PharmGKB"; public static final String PHARMGKB_DATA = "pharmgkb"; - public static final String PHARMGKB_SUBDIRECTORY = "pharmgkb"; - public static final String PHARMGKB_VERSION_FILENAME = "pharmGKB" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String PHARMGKB_GENES_FILE_ID = "GENES"; public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS"; @@ -211,50 +185,42 @@ public final class EtlCommons { public static final String GWAS_FILE_ID = "GWAS"; // Repeats - public static final String REPEATS_NAME = "Repeats"; public static final String REPEATS_DATA = "repeats"; - public static final String REPEATS_SUBDIRECTORY = GENOME_SUBDIRECTORY; /** * @deprecated (when refactoring downloaders, builders and loaders) */ @Deprecated public static final String REPEATS_JSON = "repeats"; // Simple repeats - public static final String TRF_NAME = "Tandem Repeats Finder"; - public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME; + public 
static final String TRF_DATA = "trf"; + // Must match the configuration file public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS"; // Genomic super duplications - public static final String GSD_NAME = "Genomic Super Duplications"; - public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME; + public static final String GSD_DATA = "gsd"; + // Must match the configuration file public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS"; // Window masker - public static final String WM_NAME = "Window Masker"; - public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME; + public static final String WM_DATA = "wm"; + // Must match the configuration file public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER"; // Ontology - public static final String ONTOLOGY_NAME = "Ontology"; public static final String ONTOLOGY_DATA = "ontology"; - public static final String ONTOLOGY_SUBDIRECTORY = ONTOLOGY_DATA; public static final String OBO_BASENAME = "ontology"; // HPO - public static final String HPO_OBO_NAME = "HPO"; - public static final String HPO_OBO_VERSION_FILENAME = "hpoObo" + SUFFIX_VERSION_FILENAME; + public static final String HPO_OBO_DATA = "hpo"; // Must match the configuration file public static final String HPO_OBO_FILE_ID = "HPO"; // GO - public static final String GO_OBO_NAME = "GO"; - public static final String GO_OBO_VERSION_FILENAME = "goObo" + SUFFIX_VERSION_FILENAME; + public static final String GO_OBO_DATA = "go"; // Must match the configuration file public static final String GO_OBO_FILE_ID = "GO"; // DOID - public static final String DOID_OBO_NAME = "DOID"; - public static final String DOID_OBO_VERSION_FILENAME = "doidObo" + SUFFIX_VERSION_FILENAME; + public static final String DOID_OBO_DATA = "doid"; // Must match the configuration file public static final String DOID_OBO_FILE_ID = "DOID"; // MONDO - public static final String MONDO_OBO_NAME = 
"Mondo"; - public static final String MONDO_OBO_VERSION_FILENAME = "mondoObo" + SUFFIX_VERSION_FILENAME; + public static final String MONDO_OBO_DATA = "mondo"; // Must match the configuration file public static final String MONDO_OBO_FILE_ID = "MONDO"; @@ -271,79 +237,54 @@ public final class EtlCommons { public static final String CADD_FILE_ID = "CADD"; // Regulation - public static final String REGULATION_NAME = "Regulation"; public static final String REGULATION_DATA = "regulation"; - public static final String REGULATION_SUBDIRECTORY = REGULATION_DATA; public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm"; public static final String REGULATORY_REGION_BASENAME = "regulatory_region"; // Regulatory build and motif features (see Ensembl files: regulatory build and motif features files) - public static final String REGULATORY_BUILD_NAME = "Regulatory Build"; - public static final String REGULATORY_BUILD_VERSION_FILENAME = "regulatoryBuild" + SUFFIX_VERSION_FILENAME; + public static final String REGULATORY_BUILD_DATA = "regulatory_build"; // Motif features (see Ensembl files) - public static final String MOTIF_FEATURES_NAME = "Motif Features"; - public static final String MOTIF_FEATURES_VERSION_FILENAME = "motifFeatures" + SUFFIX_VERSION_FILENAME; + public static final String MOTIF_FEATURES_DATA = "motif_features"; // miRBase - public static final String MIRBASE_NAME = "miRBase"; - public static final String MIRBASE_VERSION_FILENAME = "mirbase" + SUFFIX_VERSION_FILENAME; + public static final String MIRBASE_DATA = "mirbase"; // Must match the configuration file public static final String MIRBASE_FILE_ID = "MIRBASE"; // miRTarBase - public static final String MIRTARBASE_NAME = "miRTarBase"; - public static final String MIRTARBASE_VERSION_FILENAME = "mirTarBase" + SUFFIX_VERSION_FILENAME; + public static final String MIRTARBASE_DATA = "mirtarbase"; // Must match the configuration file public static final String MIRTARBASE_FILE_ID = "MIRTARBASE"; - // 
Build specific data options - public static final String GENOME_INFO_DATA = "genome_info"; - public static final String DISGENET_DATA = "disgenet"; - public static final String HPO_DATA = "hpo"; - public static final String PPI_DATA = "ppi"; - public static final String DRUG_DATA = "drug"; - // Load specific data options public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction"; // Protein - public static final String PROTEIN_NAME = "Protein"; public static final String PROTEIN_DATA = "protein"; - public static final String PROTEIN_SUBDIRECTORY = "protein"; // UniProt - public static final String UNIPROT_NAME = "UniProt"; + public static final String UNIPROT_DATA = "uniprot"; public static final String UNIPROT_CHUNKS_SUBDIRECTORY = "uniprot_chunks"; - public static final String UNIPROT_VERSION_FILENAME = "uniprot" + SUFFIX_VERSION_FILENAME; // Must match the configuration file public static final String UNIPROT_FILE_ID = "UNIPROT"; // InterPro - public static final String INTERPRO_NAME = "InterPro"; - public static final String INTERPRO_VERSION_FILENAME = "interpro" + SUFFIX_VERSION_FILENAME; + public static final String INTERPRO_DATA = "interpro"; // Must match the configuration file public static final String INTERPRO_FILE_ID = "INTERPRO"; // IntAct - public static final String INTACT_NAME = "IntAct"; - public static final String INTACT_VERSION_FILENAME = "intact" + SUFFIX_VERSION_FILENAME; + public static final String INTACT_DATA = "intact"; // Must match the configuration file public static final String INTACT_FILE_ID = "INTACT"; // Conservation scores - public static final String CONSERVATION_NAME = "Conservation"; public static final String CONSERVATION_DATA = "conservation"; - public static final String CONSERVATION_SUBDIRECTORY = "conservation"; // GERP - public static final String GERP_NAME = "GERP++"; - public static final String GERP_SUBDIRECTORY = "gerp"; - public static final String GERP_VERSION_FILENAME = "gerp" + 
SUFFIX_VERSION_FILENAME; + public static final String GERP_DATA = "gerp"; + // Must match the configuration file public static final String GERP_FILE_ID = "GERP"; // PHASTCONS - public static final String PHASTCONS_NAME = "PhastCons"; public static final String PHASTCONS_DATA = "phastCons"; - public static final String PHASTCONS_SUBDIRECTORY = PHASTCONS_DATA; - public static final String PHASTCONS_VERSION_FILENAME = PHASTCONS_DATA + SUFFIX_VERSION_FILENAME; + // Must match the configuration file public static final String PHASTCONS_FILE_ID = "PHASTCONS"; // PHYLOP - public static final String PHYLOP_NAME = "PhyloP"; public static final String PHYLOP_DATA = "phylop"; - public static final String PHYLOP_SUBDIRECTORY = PHYLOP_DATA; - public static final String PHYLOP_VERSION_FILENAME = PHYLOP_DATA + SUFFIX_VERSION_FILENAME; + // Must match the configuration file public static final String PHYLOP_FILE_ID = "PHYLOP"; // Splice scores @@ -380,19 +321,125 @@ public final class EtlCommons { static { // Populate data names map + dataNamesMap.put(ENSEMBL_DATA, "Ensembl"); + dataNamesMap.put(REFSEQ_DATA, "RefSeq"); + dataNamesMap.put(GENOME_DATA, "Genome"); + dataNamesMap.put(GENE_DATA, "Gene"); + dataNamesMap.put(GENE_ANNOTATION_DATA, "Gene Annotation"); + dataCategoriesMap.put(REFSEQ_DATA, "Gene"); + dataNamesMap.put(MANE_SELECT_DATA, "MANE Select"); + dataNamesMap.put(LRG_DATA, "LRG"); + dataNamesMap.put(HGNC_DATA, "HGNC Gene"); + dataNamesMap.put(CANCER_HOTSPOT_DATA, "Cancer HotSpot"); + dataNamesMap.put(DGIDB_DATA, "DGIdb"); + dataNamesMap.put(UNIPROT_XREF_DATA, "UniProt Xref"); + dataNamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "Gene Expression Atlas"); + dataNamesMap.put(GENE_DISEASE_ANNOTATION_DATA, "Gene Disease Annotation"); + dataNamesMap.put(HPO_DATA, "HPO"); + dataNamesMap.put(DISGENET_DATA, "DisGeNet"); + dataNamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomAD Constraints"); + dataNamesMap.put(GO_ANNOTATION_DATA, "EBI Gene Ontology Annotation"); + 
dataNamesMap.put(PROTEIN_DATA, "Protein"); + dataNamesMap.put(UNIPROT_DATA, "UniProt"); + dataNamesMap.put(INTERPRO_DATA, "InterPro"); + dataNamesMap.put(INTACT_DATA, "IntAct"); + dataNamesMap.put(CONSERVATION_DATA, "Conservation"); + dataNamesMap.put(GERP_DATA, "GERP++"); + dataNamesMap.put(PHASTCONS_DATA, "PhastCons"); + dataNamesMap.put(PHYLOP_DATA, "PhyloP"); + dataNamesMap.put(REPEATS_DATA, "Repeats"); + dataNamesMap.put(TRF_DATA, "Tandem Repeats Finder"); + dataNamesMap.put(WM_DATA, "Window Masker"); + dataNamesMap.put(GSD_DATA, "Genomic Super Duplications"); + dataNamesMap.put(REGULATION_DATA, "Regulation"); + dataNamesMap.put(REGULATORY_BUILD_DATA, "Regulatory Build"); + dataNamesMap.put(MOTIF_FEATURES_DATA, "Motif Features"); + dataNamesMap.put(MIRBASE_DATA, "miRBase"); + dataNamesMap.put(MIRTARBASE_DATA, "miRTarBase"); + dataNamesMap.put(ONTOLOGY_DATA, "Ontology"); + dataNamesMap.put(HPO_OBO_DATA, "HPO"); + dataNamesMap.put(GO_OBO_DATA, "GO"); + dataNamesMap.put(DOID_OBO_DATA, "DOID"); + dataNamesMap.put(MONDO_OBO_DATA, "Mondo"); dataNamesMap.put(PUBMED_DATA, "PubMed"); + dataNamesMap.put(PHARMACOGENOMICS_DATA, "Pharmacogenomics"); + dataNamesMap.put(PHARMGKB_DATA, "PharmGKB"); dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Scores"); dataNamesMap.put(CADD_DATA, "CADD"); dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Scores"); dataNamesMap.put(REVEL_DATA, "Revel"); // Populate data categories map + dataCategoriesMap.put(ENSEMBL_DATA, "Gene"); + dataCategoriesMap.put(REFSEQ_DATA, "Gene"); + dataCategoriesMap.put(GENOME_DATA, dataNamesMap.get(ENSEMBL_DATA)); + dataCategoriesMap.put(MANE_SELECT_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(LRG_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(HGNC_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(CANCER_HOTSPOT_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(DGIDB_DATA, 
dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(UNIPROT_XREF_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(GENE_EXPRESSION_ATLAS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(HPO_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(DISGENET_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(GNOMAD_CONSTRAINTS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(GO_ANNOTATION_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA)); + dataCategoriesMap.put(UNIPROT_DATA, dataNamesMap.get(PROTEIN_DATA)); + dataCategoriesMap.put(INTERPRO_DATA, dataNamesMap.get(PROTEIN_DATA)); + dataCategoriesMap.put(INTACT_DATA, dataNamesMap.get(PROTEIN_DATA)); + dataCategoriesMap.put(GERP_DATA, dataNamesMap.get(CONSERVATION_DATA)); + dataCategoriesMap.put(PHASTCONS_DATA, dataNamesMap.get(CONSERVATION_DATA)); + dataCategoriesMap.put(PHYLOP_DATA, dataNamesMap.get(CONSERVATION_DATA)); + dataCategoriesMap.put(TRF_DATA, dataNamesMap.get(REPEATS_DATA)); + dataCategoriesMap.put(WM_DATA, dataNamesMap.get(REPEATS_DATA)); + dataCategoriesMap.put(GSD_DATA, dataNamesMap.get(REPEATS_DATA)); + dataCategoriesMap.put(REGULATORY_BUILD_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(MOTIF_FEATURES_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(MIRBASE_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(MIRTARBASE_DATA, dataNamesMap.get(REGULATION_DATA)); + dataCategoriesMap.put(HPO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(GO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(DOID_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); + dataCategoriesMap.put(MONDO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA)); dataCategoriesMap.put(PUBMED_DATA, "Publication"); + dataCategoriesMap.put(PHARMGKB_DATA, dataNamesMap.get(PHARMACOGENOMICS_DATA)); dataCategoriesMap.put(CADD_DATA, 
dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA)); dataCategoriesMap.put(REVEL_DATA, dataNamesMap.get(MISSENSE_VARIATION_SCORE_DATA)); // Populate data version filenames Map + dataVersionFilenamesMap.put(ENSEMBL_DATA, "ensemblCore" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(REFSEQ_DATA, "refSeqCore" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GENOME_DATA, "genome" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MANE_SELECT_DATA, "maneSelect" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(LRG_DATA, "lrg" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HGNC_DATA, "hgnc" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CANCER_HOTSPOT_DATA, "cancerHotSpot" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DGIDB_DATA, "dgidb" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(UNIPROT_XREF_DATA, "uniProtXref" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HPO_DATA, "hpo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DISGENET_DATA, "disGeNet" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomadConstraints" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GO_ANNOTATION_DATA, "goAnnotation" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(UNIPROT_DATA, "uniProt" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(INTERPRO_DATA, "interPro" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(INTACT_DATA, "intAct" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GERP_DATA, "gerp" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PHASTCONS_DATA, "phastCons" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PHYLOP_DATA, "phyloP" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(TRF_DATA, "simpleRepeat" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(WM_DATA, 
"windowMasker" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GSD_DATA, "genomicSuperDups" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(REGULATORY_BUILD_DATA, "regulatoryBuild" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MOTIF_FEATURES_DATA, "motifFeatures" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MIRBASE_DATA, "mirBase" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MIRTARBASE_DATA, "mirTarBase" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HPO_OBO_DATA, "hpoObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GO_OBO_DATA, "goObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(DOID_OBO_DATA, "doidObo" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(MONDO_OBO_DATA, "mondoObo" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(PHARMGKB_DATA, "pharmGKB" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(REVEL_DATA, "revel" + SUFFIX_VERSION_FILENAME); } @@ -593,4 +640,8 @@ public static String getDataVersionFilename(String data) throws CellBaseExceptio } return dataVersionFilenamesMap.get(data); } + + public static List getUrls(List downloadFiles) { + return downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index a05760f686..7c4e331f18 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -183,6 +183,25 @@ protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLPropertie return downloadFile; } + protected DownloadFile 
downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String data, + Path outPath) throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveEnsemblDataSource(ensemblProps, fileId, data, null, outPath); + } + + protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String data, + String chromosome, Path outPath) + throws IOException, InterruptedException, CellBaseException { + // Download file + DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath); + + // Save data source + saveDataSource(data, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + outPath.resolve(getDataVersionFilename(data))); + + return downloadFile; + } + + @Deprecated protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String name, String category, String chromosome, String versionFilename, Path outPath) throws IOException, InterruptedException, CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 7ea434c24c..ee332dd8ea 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -17,14 +17,13 @@ package org.opencb.cellbase.lib.download; import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; -import java.util.stream.Collectors; import static 
org.opencb.cellbase.lib.EtlCommons.*; @@ -49,182 +48,223 @@ public GeneDownloadManager(String species, String assembly, Path targetDirectory @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading gene information ..."); - Path geneFolder = downloadFolder.resolve("gene"); - Files.createDirectories(geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DATA)); - Path refseqFolder = downloadFolder.resolve("refseq"); - Files.createDirectories(refseqFolder); + // Create gene folder + Path geneDownloadPath = downloadFolder.resolve(GENE_DATA); + + // Create Ensembl folder + Path ensemblDownloadPath = geneDownloadPath.resolve(ENSEMBL_DATA); + Files.createDirectories(ensemblDownloadPath); + + // Create RefSeq folder + Path refSeqDownloadPath = geneDownloadPath.resolve(REFSEQ_DATA); + Files.createDirectories(refSeqDownloadPath); List downloadFiles = new ArrayList<>(); - downloadFiles.addAll(downloadEnsemblData(geneFolder)); - downloadFiles.addAll(downloadRefSeq(refseqFolder)); - downloadFiles.add(downloadMane(geneFolder)); - downloadFiles.add(downloadLrg(geneFolder)); - downloadFiles.add(downloadHgnc(geneFolder)); - downloadFiles.add(downloadCancerHotspot(geneFolder)); - downloadFiles.add(downloadDrugData(geneFolder)); - downloadFiles.add(downloadGeneUniprotXref(geneFolder)); - downloadFiles.add(downloadGeneExpressionAtlas(geneFolder)); - downloadFiles.add(downloadGeneDiseaseAnnotation(geneFolder)); - downloadFiles.add(downloadGnomadConstraints(geneFolder)); - downloadFiles.add(downloadGO(geneFolder)); + // Ensembl + downloadFiles.addAll(downloadEnsemblData(ensemblDownloadPath)); + + // RefSeq + downloadFiles.addAll(downloadRefSeq(refSeqDownloadPath)); + + // Gene annotation + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); + downloadFiles.add(downloadMane(geneDownloadPath)); + downloadFiles.add(downloadLrg(geneDownloadPath)); + 
downloadFiles.add(downloadHgnc(geneDownloadPath)); + downloadFiles.add(downloadCancerHotspot(geneDownloadPath)); + downloadFiles.add(downloadDrugData(geneDownloadPath)); + downloadFiles.add(downloadGeneUniprotXref(geneDownloadPath)); + downloadFiles.add(downloadGeneExpressionAtlas(geneDownloadPath)); + downloadFiles.add(downloadGeneDiseaseAnnotation(geneDownloadPath)); + downloadFiles.add(downloadGnomadConstraints(geneDownloadPath)); + downloadFiles.add(downloadGO(geneDownloadPath)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); return downloadFiles; } - private List downloadEnsemblData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, ENSEMBL_NAME); + private List downloadEnsemblData(Path ensemblDownloadPath) throws IOException, InterruptedException, CellBaseException { + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); List downloadFiles = new ArrayList<>(); + DownloadProperties.EnsemblProperties ensemblProps = configuration.getDownload().getEnsembl(); // GTF - downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_GTF_FILE_ID, geneFolder)); + downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_GTF_FILE_ID, ensemblDownloadPath)); // PEP - downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_PEP_FA_FILE_ID, geneFolder)); + downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_PEP_FA_FILE_ID, ensemblDownloadPath)); // CDNA - downloadFiles.add(downloadEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_CDNA_FA_FILE_ID, geneFolder)); + downloadFiles.add(downloadEnsemblDataSource(ensemblProps, ENSEMBL_CDNA_FA_FILE_ID, ensemblDownloadPath)); // Save data source (i.e., metadata) - saveDataSource(EtlCommons.GENE_DATA, 
ENSEMBL_NAME, ensemblVersion, getTimeStamp(), - downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), - geneFolder.resolve(ENSEMBL_CORE_VERSION_FILENAME)); + saveDataSource(ENSEMBL_DATA, ensemblVersion, getTimeStamp(), getUrls(downloadFiles), + ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, ENSEMBL_NAME); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); return downloadFiles; } - private List downloadRefSeq(Path refSeqFolder) throws IOException, InterruptedException, CellBaseException { + private List downloadRefSeq(Path refSeqDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, REFSEQ_NAME); + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); List downloadFiles = new ArrayList<>(); + DownloadProperties.URLProperties refSeqProps = configuration.getDownload().getRefSeq(); // GTF - downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_GENOMIC_GTF_FILE_ID, refSeqFolder)); + downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_GENOMIC_GTF_FILE_ID, refSeqDownloadPath)); // Genomic FASTA - downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_GENOMIC_FNA_FILE_ID, refSeqFolder)); + downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_GENOMIC_FNA_FILE_ID, refSeqDownloadPath)); // Protein FASTA - downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_PROTEIN_FAA_FILE_ID, refSeqFolder)); + downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_PROTEIN_FAA_FILE_ID, refSeqDownloadPath)); // cDNA - downloadFiles.add(downloadDataSource(configuration.getDownload().getRefSeq(), REFSEQ_RNA_FNA_FILE_ID, refSeqFolder)); + 
downloadFiles.add(downloadDataSource(refSeqProps, REFSEQ_RNA_FNA_FILE_ID, refSeqDownloadPath)); // Save data source (i.e., metadata) - saveDataSource(REFSEQ_NAME, GENE_DATA, configuration.getDownload().getRefSeq().getVersion(), getTimeStamp(), - downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList()), - refSeqFolder.resolve(REFSEQ_VERSION_FILENAME)); + saveDataSource(REFSEQ_DATA, refSeqProps.getVersion(), getTimeStamp(), getUrls(downloadFiles), + refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, REFSEQ_NAME); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); return downloadFiles; } return Collections.emptyList(); } - private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, MANE_SELECT_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, - MANE_SELECT_NAME, GENE_DATA, MANE_SELECT_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, MANE_SELECT_NAME); + MANE_SELECT_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); return downloadFile; } return null; } - private DownloadFile downloadLrg(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadLrg(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, LRG_NAME); - DownloadFile 
downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_NAME, GENE_DATA, - LRG_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, LRG_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(LRG_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA, + geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(LRG_DATA)); return downloadFile; } return null; } - private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, HGNC_NAME); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_NAME, GENE_DATA, - HGNC_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, HGNC_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(HGNC_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, + geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(HGNC_DATA)); return downloadFile; } return null; } - private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadCancerHotspot(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, CANCER_HOTSPOT_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), 
CANCER_HOTSPOT_FILE_ID, - CANCER_HOTSPOT_NAME, GENE_DATA, CANCER_HOTSPOT_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, CANCER_HOTSPOT_NAME); + CANCER_HOTSPOT_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); return downloadFile; } return null; } - private DownloadFile downloadDrugData(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, DGIDB_NAME); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_NAME, - GENE_DATA, DGIDB_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, DGIDB_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, + geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(DGIDB_DATA)); return downloadFile; } return null; } - private DownloadFile downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadGeneUniprotXref(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) { - logger.info(DOWNLOADING_LOG_MESSAGE, UNIPROT_XREF_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), UNIPROT_XREF_FILE_ID, - UNIPROT_XREF_NAME, GENE_DATA, UNIPROT_XREF_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, UNIPROT_XREF_NAME); + 
UNIPROT_XREF_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); return downloadFile; } return null; } - private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, GENE_EXPRESSION_ATLAS_NAME); + private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), - GENE_EXPRESSION_ATLAS_FILE_ID, GENE_EXPRESSION_ATLAS_NAME, GENE_DATA, GENE_EXPRESSION_ATLAS_VERSION_FILENAME, geneFolder); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENE_EXPRESSION_ATLAS_NAME); + GENE_EXPRESSION_ATLAS_FILE_ID, GENE_EXPRESSION_ATLAS_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); return downloadFile; } - private DownloadFile downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, GENE_DISEASE_ANNOTATION_NAME); + private DownloadFile downloadGeneDiseaseAnnotation(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); + // HPO // IMPORTANT !!! 
logger.warn("{} must be downloaded manually from {} and then create the file {} with data ({}), name ({}) and the version", - HPO_NAME, configuration.getDownload().getHpo().getHost(), HPO_VERSION_FILENAME, GENE_DATA, HPO_NAME); - saveDataSource(HPO_NAME, GENE_DISEASE_ANNOTATION_NAME, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), - Collections.singletonList(configuration.getDownload().getHpo().getHost()), geneFolder.resolve(HPO_VERSION_FILENAME)); + getDataName(HPO_DATA), configuration.getDownload().getHpo().getHost(), getDataVersionFilename(HPO_DATA), + getDataCategory(HPO_DATA), getDataName(HPO_DATA)); + saveDataSource(HPO_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), + Collections.singletonList(configuration.getDownload().getHpo().getHost()), + geneDownloadPath.resolve(getDataVersionFilename(HPO_DATA))); - DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_NAME, - GENE_DISEASE_ANNOTATION_NAME, DISGENET_VERSION_FILENAME, geneFolder); + // DisGeNet + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), DISGENET_FILE_ID, DISGENET_DATA, + geneDownloadPath); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENE_DISEASE_ANNOTATION_NAME); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); return downloadFile; } - private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadGnomadConstraints(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, GNOMAD_CONSTRAINTS_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), GNOMAD_CONSTRAINTS_FILE_ID, - GNOMAD_CONSTRAINTS_NAME, GENE_DATA, 
GNOMAD_CONSTRAINTS_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), + GNOMAD_CONSTRAINTS_FILE_ID, GNOMAD_CONSTRAINTS_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); + return downloadFile; } return null; } - private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException, CellBaseException { + private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, GO_ANNOTATION_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), GO_ANNOTATION_FILE_ID, GO_ANNOTATION_NAME, - GENE_DATA, GO_ANNOTATION_VERSION_FILENAME, geneFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(), GO_ANNOTATION_FILE_ID, + GO_ANNOTATION_DATA, geneDownloadPath); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); + return downloadFile; } return null; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 210d5bc39f..289ec23258 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -19,7 +19,6 @@ import com.beust.jcommander.ParameterException; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import
java.io.IOException; import java.nio.file.Files; @@ -44,8 +43,8 @@ public List download() throws IOException, InterruptedException, C } public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, GENOME_NAME); - Path sequenceFolder = downloadFolder.resolve(GENOME_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_DATA)); + Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); Files.createDirectories(sequenceFolder); // Reference genome sequences are downloaded from Ensembl @@ -54,10 +53,10 @@ public List downloadReferenceGenome() throws IOException, Interrup sequenceFolder); // Save data source - saveDataSource(ENSEMBL_NAME, EtlCommons.GENOME_DATA, ensemblVersion, getTimeStamp(), - Collections.singletonList(downloadFile.getUrl()), sequenceFolder.resolve(GENOME_VERSION_FILENAME)); + saveDataSource(GENOME_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA))); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, GENOME_NAME); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_DATA)); return Collections.singletonList(downloadFile); } @@ -75,13 +74,13 @@ public List downloadConservation() throws IOException, Interrupted } List downloadFiles = new ArrayList<>(); if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, CONSERVATION_NAME); - Path conservationFolder = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); + Path conservationFolder = downloadFolder.resolve(CONSERVATION_DATA); Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve(GERP_SUBDIRECTORY)); - Files.createDirectories(conservationFolder.resolve(PHASTCONS_SUBDIRECTORY)); - 
Files.createDirectories(conservationFolder.resolve(PHYLOP_SUBDIRECTORY)); + Files.createDirectories(conservationFolder.resolve(GERP_DATA)); + Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); + Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M", }; @@ -93,14 +92,14 @@ public List downloadConservation() throws IOException, Interrupted List phastconsUrls = new ArrayList<>(chromosomes.length); List phyloPUrls = new ArrayList<>(chromosomes.length); // Downloading PhastCons and PhyloP - logger.info(DOWNLOADING_LOG_MESSAGE, (PHASTCONS_NAME + "/" + PHYLOP_NAME)); + logger.info(DOWNLOADING_LOG_MESSAGE, (getDataName(PHASTCONS_DATA) + "/" + getDataName(PHYLOP_DATA))); for (String chromosome : chromosomes) { // PhastCons String phastConsUrl = configuration.getDownload().getPhastCons().getHost() + configuration.getDownload().getPhastCons() .getFiles().get(PHASTCONS_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phastConsUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHASTCONS_SUBDIRECTORY).resolve(filename); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phastConsUrl, outputPath); downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); phastconsUrls.add(phastConsUrl); @@ -110,30 +109,30 @@ public List downloadConservation() throws IOException, Interrupted .getFiles().get(PHYLOP_FILE_ID).replaceAll(PUT_ASSEMBLY_HERE_MARK, assembly) .replace(PUT_CHROMOSOME_HERE_MARK, chromosome); filename = Paths.get(phyloPUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHYLOP_SUBDIRECTORY).resolve(filename); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); 
logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phyloPUrl, outputPath); downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); phyloPUrls.add(phyloPUrl); } // Downloading Gerp - logger.info(DOWNLOADING_LOG_MESSAGE, GERP_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); String gerpUrl = configuration.getDownload().getGerp().getHost() + configuration.getDownload().getGerp().getFiles() .get(GERP_FILE_ID); filename = Paths.get(gerpUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(GERP_SUBDIRECTORY).resolve(filename); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, gerpUrl, outputPath); downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); // Save data version - saveDataSource(PHASTCONS_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhastCons().getVersion(), - getTimeStamp(), phastconsUrls, conservationFolder.resolve(PHASTCONS_VERSION_FILENAME)); - saveDataSource(PHYLOP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getPhylop().getVersion(), - getTimeStamp(), phyloPUrls, conservationFolder.resolve(PHYLOP_VERSION_FILENAME)); - saveDataSource(GERP_NAME, EtlCommons.CONSERVATION_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), - Collections.singletonList(gerpUrl), conservationFolder.resolve(GERP_VERSION_FILENAME)); + saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, + conservationFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); + saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, + conservationFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); + saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), + Collections.singletonList(gerpUrl), conservationFolder.resolve(getDataVersionFilename(GERP_DATA))); } - 
logger.info(DOWNLOADING_DONE_LOG_MESSAGE, CONSERVATION_NAME); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); } return downloadFiles; @@ -144,8 +143,8 @@ public List downloadRepeats() throws IOException, InterruptedExcep return Collections.emptyList(); } if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, REPEATS_NAME); - Path repeatsFolder = downloadFolder.resolve(EtlCommons.REPEATS_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REPEATS_DATA)); + Path repeatsFolder = downloadFolder.resolve(REPEATS_DATA); Files.createDirectories(repeatsFolder); List downloadFiles = new ArrayList<>(); String pathParam; @@ -160,36 +159,33 @@ public List downloadRepeats() throws IOException, InterruptedExcep // Download tandem repeat finder String url = configuration.getDownload().getSimpleRepeats().getHost() + configuration.getDownload().getSimpleRepeats() .getFiles().get(SIMPLE_REPEATS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveDataSource(TRF_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), - Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILENAME)); - Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + saveDataSource(TRF_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(), + Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(TRF_DATA))); // Download genomic super duplications url = configuration.getDownload().getGenomicSuperDups().getHost() + configuration.getDownload().getGenomicSuperDups() .getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveDataSource(GSD_NAME, EtlCommons.REPEATS_DATA, 
configuration.getDownload().getGenomicSuperDups().getVersion(), - getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILENAME)); - outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + saveDataSource(GSD_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), + Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(GSD_DATA))); // Download WindowMasker if (!pathParam.equalsIgnoreCase(HG19_NAME)) { url = configuration.getDownload().getWindowMasker().getHost() + configuration.getDownload().getWindowMasker().getFiles() .get(WINDOW_MASKER_FILE_ID).replace(PUT_ASSEMBLY_HERE_MARK, pathParam); - saveDataSource(WM_NAME, EtlCommons.REPEATS_DATA, configuration.getDownload().getWindowMasker().getVersion(), - getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.WM_VERSION_FILENAME)); - outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + saveDataSource(WM_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), + Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(WM_DATA))); } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, REPEATS_NAME); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); return downloadFiles; } return Collections.emptyList(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 4a91d84225..53ff518323 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -40,9 +40,10 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec } public List download() throws IOException, InterruptedException, CellBaseException { - Path oboFolder = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); + + Path oboFolder = downloadFolder.resolve(ONTOLOGY_DATA); Files.createDirectories(oboFolder); - logger.info(DOWNLOADING_LOG_MESSAGE, ONTOLOGY_NAME); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); @@ -50,33 +51,32 @@ public List download() throws IOException, InterruptedException, C // HPO downloadFile = downloadDataSource(configuration.getDownload().getHpoObo(), HPO_OBO_FILE_ID, oboFolder); String version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(HPO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(HPO_OBO_VERSION_FILENAME)); + saveDataSource(HPO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(HPO_OBO_DATA))); downloadFiles.add(downloadFile); // GO downloadFile = downloadDataSource(configuration.getDownload().getGoObo(), GO_OBO_FILE_ID, oboFolder); version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(GO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(GO_OBO_VERSION_FILENAME)); + saveDataSource(GO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(GO_OBO_DATA))); downloadFiles.add(downloadFile); // DOID downloadFile = downloadDataSource(configuration.getDownload().getDoidObo(), DOID_OBO_FILE_ID, oboFolder); version = 
getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(DOID_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(DOID_OBO_VERSION_FILENAME)); + saveDataSource(DOID_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(DOID_OBO_DATA))); downloadFiles.add(downloadFile); // Mondo downloadFile = downloadDataSource(configuration.getDownload().getMondoObo(), MONDO_OBO_FILE_ID, oboFolder); version = getVersionFromOboFile(oboFolder.resolve(downloadFile.getOutputFile())); - saveDataSource(MONDO_OBO_NAME, ONTOLOGY_NAME, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - oboFolder.resolve(MONDO_OBO_VERSION_FILENAME)); + saveDataSource(MONDO_OBO_DATA, version, getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + oboFolder.resolve(getDataVersionFilename(MONDO_OBO_DATA))); downloadFiles.add(downloadFile); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, ONTOLOGY_NAME); - + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 873387f94b..2eeac8415f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -38,9 +38,9 @@ public PharmGKBDownloadManager(String species, String assembly, Path targetDirec @Override public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, PHARMACOGENOMICS_NAME); + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); - Path pharmgkbDownloadFolder = 
downloadFolder.resolve(PHARMACOGENOMICS_SUBDIRECTORY).resolve(PHARMGKB_SUBDIRECTORY); + Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); Files.createDirectories(pharmgkbDownloadFolder); DownloadProperties.URLProperties pharmGKBProps = configuration.getDownload().getPharmGKB(); @@ -58,12 +58,11 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadFile); } - // Save versions - saveDataSource(PHARMGKB_NAME, PHARMACOGENOMICS_NAME, pharmGKBProps.getVersion(), getTimeStamp(), urls, - pharmgkbDownloadFolder.resolve(PHARMGKB_VERSION_FILENAME)); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PHARMACOGENOMICS_NAME); + // Save data source + saveDataSource(PHARMGKB_DATA, pharmGKBProps.getVersion(), getTimeStamp(), urls, + pharmgkbDownloadFolder.resolve(getDataVersionFilename(PHARMGKB_DATA))); + logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(PHARMGKB_DATA), getDataName(PHARMGKB_DATA)); return downloadFiles; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 5cb8a4c1f0..ba75a8e162 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -44,34 +44,30 @@ public ProteinDownloadManager(String species, String assembly, Path targetDirect * @throws CellBaseException if there is an error in the CelllBase configuration file */ public List download() throws IOException, InterruptedException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, PROTEIN_NAME); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); if (!speciesHasInfoToDownload(speciesConfiguration, PROTEIN_DATA)) { - logger.info("{} not supported for the species {}", PROTEIN_NAME, 
speciesConfiguration.getScientificName()); + logger.info("{} not supported for the species {}", getDataName(PROTEIN_DATA), speciesConfiguration.getScientificName()); return Collections.emptyList(); } - Path proteinFolder = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); + Path proteinFolder = downloadFolder.resolve(PROTEIN_DATA); Files.createDirectories(proteinFolder); DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Uniprot - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_NAME, PROTEIN_DATA, - UNIPROT_VERSION_FILENAME, proteinFolder); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getUniprot(), UNIPROT_FILE_ID, UNIPROT_DATA, proteinFolder); downloadFiles.add(downloadFile); // InterPro - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_NAME, PROTEIN_DATA, - INTERPRO_VERSION_FILENAME, proteinFolder); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getInterpro(), INTERPRO_FILE_ID, INTERPRO_DATA, proteinFolder); downloadFiles.add(downloadFile); // Intact - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_NAME, PROTEIN_DATA, - INTACT_VERSION_FILENAME, proteinFolder); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getIntact(), INTACT_FILE_ID, INTACT_DATA, proteinFolder); downloadFiles.add(downloadFile); - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PROTEIN_NAME); - + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); return downloadFiles; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 56d15bf844..0c87775f5c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -40,12 +40,13 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C @Override public List download() throws IOException, InterruptedException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(REGULATION_DATA)); if (!speciesHasInfoToDownload(speciesConfiguration, REGULATION_DATA)) { + logger.info("{} not supported for the species {}", getDataName(REGULATION_DATA), speciesConfiguration.getScientificName()); return Collections.emptyList(); } - regulationFolder = downloadFolder.resolve(REGULATION_SUBDIRECTORY); + regulationFolder = downloadFolder.resolve(REGULATION_DATA); Files.createDirectories(regulationFolder); - logger.info("Downloading {} files at {} ...", REGULATION_DATA, regulationFolder); List downloadFiles = new ArrayList<>(); @@ -53,6 +54,7 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadMiRTarBase()); downloadFiles.add(downloadMirna()); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); return downloadFiles; } @@ -62,19 +64,12 @@ public List download() throws IOException, InterruptedException, C * @throws InterruptedException Any issue downloading files */ private List downloadRegulatoryaAndMotifFeatures() throws IOException, InterruptedException, CellBaseException { -// String baseUrl; -// if (configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) { -// baseUrl = ensemblHostUrl + ensemblRelease + "/"; -// } else { -// baseUrl = ensemblHostUrl + ensemblRelease + "/" + getPhylo(speciesConfiguration) + "/"; -// } - DownloadFile downloadFile; List downloadFiles = new ArrayList<>(); // Regulatory build downloadFile = downloadAndSaveEnsemblDataSource(configuration.getDownload().getEnsembl(), ENSEMBL_REGULATORY_BUILD_FILE_ID, - REGULATORY_BUILD_NAME, REGULATION_DATA, null, REGULATORY_BUILD_VERSION_FILENAME, regulationFolder); + 
REGULATORY_BUILD_DATA, regulationFolder); downloadFiles.add(downloadFile); // Motifs features @@ -89,21 +84,29 @@ private List downloadRegulatoryaAndMotifFeatures() throws IOExcept downloadFiles.add(downloadFile); urls.add(downloadFile.getUrl()); // Save data source (name, category, version,...) - saveDataSource(MOTIF_FEATURES_NAME, REGULATION_DATA, "(" + ENSEMBL_NAME + " " + ensemblVersion + ")", getTimeStamp(), urls, - regulationFolder.resolve(MOTIF_FEATURES_VERSION_FILENAME)); + saveDataSource(MOTIF_FEATURES_DATA, "(" + getDataName(ENSEMBL_DATA) + " " + ensemblVersion + ")", getTimeStamp(), urls, + regulationFolder.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))); return downloadFiles; } private DownloadFile downloadMirna() throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading {} ...", MIRBASE_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_NAME, REGULATION_DATA, - MIRBASE_VERSION_FILENAME, regulationFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRBASE_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMirbase(), MIRBASE_FILE_ID, MIRBASE_DATA, + regulationFolder); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MIRBASE_DATA)); + return downloadFile; } private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException, CellBaseException { - logger.info("Downloading {} ...", MIRTARBASE_NAME); - return downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_FILE_ID, MIRTARBASE_NAME, REGULATION_DATA, - MIRTARBASE_VERSION_FILENAME, regulationFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); + + DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getMiRTarBase(), MIRTARBASE_FILE_ID, + MIRTARBASE_DATA, regulationFolder); + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(MIRTARBASE_DATA)); + 
return downloadFile; } } From 1586a77d87a96716bf6d6a2db2cd4713104f8474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 12:53:57 +0200 Subject: [PATCH 096/107] app: update load command executor according to the EtlCommons changes, #TASK-6142, #TASK-5564 --- .../app/cli/admin/executors/LoadCommandExecutor.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index c750beb6aa..166c4e7a6f 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -44,7 +44,7 @@ import java.util.List; import java.util.concurrent.ExecutionException; -import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA; +import static org.opencb.cellbase.lib.EtlCommons.*; /** * Created by imedina on 03/02/15. 
@@ -486,9 +486,9 @@ private void loadRepeats() { // Update release (collection and sources) List sources = new ArrayList<>(Arrays.asList( - input.resolve(EtlCommons.TRF_VERSION_FILENAME), - input.resolve(EtlCommons.GSD_VERSION_FILENAME), - input.resolve(EtlCommons.WM_VERSION_FILENAME) + input.resolve(getDataVersionFilename(TRF_DATA)), + input.resolve(getDataVersionFilename(GSD_DATA)), + input.resolve(getDataVersionFilename(WM_DATA)) )); dataReleaseManager.update(dataRelease, "repeats", EtlCommons.REPEATS_DATA, sources); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException @@ -587,7 +587,7 @@ private void loadPharmacogenomica() throws IOException, CellBaseException { createIndex(EtlCommons.PHARMACOGENOMICS_DATA); // Update release (collection and sources) - List sources = Collections.singletonList(pharmaPath.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME)); + List sources = Collections.singletonList(pharmaPath.resolve(getDataVersionFilename(PHARMGKB_DATA))); dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PHARMACOGENOMICS_DATA, sources); } From c7c398ab101ef654295acd01778cf3c33b135806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 12:54:46 +0200 Subject: [PATCH 097/107] lib: update CellBase builders according to the EtlCommons changes, #TASK-5776, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 59 +++++++++--------- .../org/opencb/cellbase/lib/EtlCommons.java | 1 + .../lib/builders/ConservationBuilder.java | 41 +++++++------ .../lib/builders/OntologyBuilder.java | 29 ++++----- .../lib/builders/PharmGKBBuilder.java | 14 +++-- .../cellbase/lib/builders/ProteinBuilder.java | 41 ++++++------- .../builders/RegulatoryFeatureBuilder.java | 21 ++++--- .../cellbase/lib/builders/RepeatsBuilder.java | 61 +++++++++++-------- 8 files changed, 141 insertions(+), 126 deletions(-) diff --git 
a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index f1fdcbbb19..0cf6b17899 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -193,24 +193,24 @@ public void execute() throws CellBaseException { private CellBaseBuilder buildRepeats() throws CellBaseException { // Sanity check - Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_SUBDIRECTORY); - List versionPaths = Arrays.asList(repeatsDownloadPath.resolve(TRF_VERSION_FILENAME), - repeatsDownloadPath.resolve(GSD_VERSION_FILENAME), - repeatsDownloadPath.resolve(WM_VERSION_FILENAME)); - copyVersionFiles(versionPaths, buildFolder.resolve(REPEATS_SUBDIRECTORY)); + Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_DATA); + List versionPaths = Arrays.asList(repeatsDownloadPath.resolve(getDataVersionFilename(TRF_DATA)), + repeatsDownloadPath.resolve(getDataVersionFilename(GSD_DATA)), + repeatsDownloadPath.resolve(getDataVersionFilename(WM_DATA))); + copyVersionFiles(versionPaths, buildFolder.resolve(REPEATS_DATA)); // Create serializer and return the repeats builder - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_SUBDIRECTORY), REPEATS_DATA); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_DATA), REPEATS_BASENAME); return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration); } private CellBaseBuilder buildObo() throws CellBaseException { - Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_SUBDIRECTORY); - Path oboBuildPath = buildFolder.resolve(ONTOLOGY_SUBDIRECTORY); - List versionPaths = Arrays.asList(oboDownloadPath.resolve(HPO_OBO_VERSION_FILENAME), - 
oboDownloadPath.resolve(GO_OBO_VERSION_FILENAME), - oboDownloadPath.resolve(DOID_OBO_VERSION_FILENAME), - oboDownloadPath.resolve(MONDO_OBO_VERSION_FILENAME)); + Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_DATA); + Path oboBuildPath = buildFolder.resolve(ONTOLOGY_DATA); + List versionPaths = Arrays.asList(oboDownloadPath.resolve(getDataVersionFilename(HPO_OBO_DATA)), + oboDownloadPath.resolve(getDataVersionFilename(GO_OBO_DATA)), + oboDownloadPath.resolve(getDataVersionFilename(DOID_OBO_DATA)), + oboDownloadPath.resolve(getDataVersionFilename(MONDO_OBO_DATA))); copyVersionFiles(versionPaths, oboBuildPath); // Create serializer and return the ontology builder @@ -234,14 +234,14 @@ private void copyVersionFiles(List pathList) { private CellBaseBuilder buildGenomeSequence() throws CellBaseException { // Sanity check - Path genomeVersionPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(GENOME_VERSION_FILENAME); - copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_SUBDIRECTORY)); + Path genomeVersionPath = downloadFolder.resolve(GENOME_DATA).resolve(getDataVersionFilename(GENOME_DATA)); + copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA)); // Get FASTA path Path fastaPath = getFastaReferenceGenome(); // Create serializer and return the genome builder - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_SUBDIRECTORY), GENOME_DATA); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_DATA), GENOME_DATA); return new GenomeSequenceFastaBuilder(fastaPath, serializer); } @@ -290,8 +290,8 @@ private CellBaseBuilder buildRegulation() throws CellBaseException { // Sanity check Path regulationDownloadPath = downloadFolder.resolve(REGULATION_DATA); Path regulationBuildPath = buildFolder.resolve(REGULATION_DATA); - 
copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(REGULATORY_BUILD_VERSION_FILENAME), - regulationDownloadPath.resolve(MOTIF_FEATURES_VERSION_FILENAME)), regulationBuildPath); + copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)), + regulationDownloadPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))), regulationBuildPath); // Create the file serializer and the regulatory feature builder CellBaseSerializer serializer = new CellBaseJsonFileSerializer(regulationBuildPath, REGULATORY_REGION_BASENAME); @@ -300,10 +300,10 @@ private CellBaseBuilder buildRegulation() throws CellBaseException { private CellBaseBuilder buildProtein() throws CellBaseException { // Sanity check - Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_SUBDIRECTORY); - Path proteinBuildPath = buildFolder.resolve(PROTEIN_SUBDIRECTORY); - copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(UNIPROT_VERSION_FILENAME), - proteinDownloadPath.resolve(INTERPRO_VERSION_FILENAME)), proteinBuildPath); + Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_DATA); + Path proteinBuildPath = buildFolder.resolve(PROTEIN_DATA); + copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(getDataVersionFilename(UNIPROT_DATA)), + proteinDownloadPath.resolve(getDataVersionFilename(INTERPRO_DATA))), proteinBuildPath); // Create the file serializer and the protein builder CellBaseSerializer serializer = new CellBaseJsonFileSerializer(proteinBuildPath, PROTEIN_DATA); @@ -312,13 +312,14 @@ private CellBaseBuilder buildProtein() throws CellBaseException { private CellBaseBuilder buildConservation() throws CellBaseException { // Sanity check - Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY); - copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(GERP_VERSION_FILENAME), - conservationDownloadPath.resolve(PHASTCONS_VERSION_FILENAME), 
conservationDownloadPath.resolve(PHYLOP_VERSION_FILENAME)), - buildFolder.resolve(CONSERVATION_SUBDIRECTORY)); + Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA); + Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA); + copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(getDataVersionFilename(GERP_DATA)), + conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)), + conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(CONSERVATION_SUBDIRECTORY)); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(conservationBuildPath); return new ConservationBuilder(conservationDownloadPath, conservationChunkSize, serializer); } @@ -360,7 +361,7 @@ private Path getFastaReferenceGenome() throws CellBaseException { String ensemblUrl = getEnsemblUrl(configuration.getDownload().getEnsembl(), ensemblRelease, ENSEMBL_PRIMARY_FA_FILE_ID, getSpeciesShortname(speciesConfiguration), assembly.getName(), null); String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); - Path fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename); + Path fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename); if (fastaPath.toFile().exists()) { // Gunzip logger.info("Gunzip file: {}", fastaPath); @@ -374,7 +375,7 @@ private Path getFastaReferenceGenome() throws CellBaseException { throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); } } - fastaPath = downloadFolder.resolve(GENOME_SUBDIRECTORY).resolve(fastaFilename.replace(".gz", "")); + fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(".gz", "")); if (!fastaPath.toFile().exists()) { throw new CellBaseException("FASTA file " + fastaPath + " does not 
exist after executing gunzip"); } @@ -413,7 +414,7 @@ private CellBaseBuilder buildPharmacogenomics() throws CellBaseException { // Sanity check Path pharmGkbDownloadPath = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); Path pharmGkbBuildPath = buildFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); - copyVersionFiles(Arrays.asList(pharmGkbDownloadPath.resolve(PHARMGKB_VERSION_FILENAME)), pharmGkbBuildPath); + copyVersionFiles(Arrays.asList(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA))), pharmGkbBuildPath); // Create the file serializer and the PharmGKB feature builder CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pharmGkbBuildPath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index f2cc152005..a836288d6f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -186,6 +186,7 @@ public final class EtlCommons { // Repeats public static final String REPEATS_DATA = "repeats"; + public static final String REPEATS_BASENAME = "repeats"; /** * @deprecated (when refactoring downloaders, builders and loaders) */ diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 79099a4d93..d43c38cb7a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -18,6 +18,7 @@ import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import 
org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.MongoDBCollectionConfiguration; @@ -55,7 +56,7 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile @Override public void parse() throws IOException, CellBaseException { - logger.info(BUILDING_LOG_MESSAGE, CONSERVATION_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) { throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot" @@ -63,25 +64,25 @@ public void parse() throws IOException, CellBaseException { } // Check GERP folder and files - Path gerpPath = conservedRegionPath.resolve(GERP_SUBDIRECTORY); - List gerpFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(GERP_VERSION_FILENAME).toFile()), gerpPath, - GERP_NAME); + Path gerpPath = conservedRegionPath.resolve(GERP_DATA); + DataSource dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(GERP_DATA)).toFile()); + List gerpFiles = checkFiles(dataSource, gerpPath, getDataName(GERP_DATA)); // Check PhastCons folder and files - Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_SUBDIRECTORY); - List phastConsFiles = checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(PHASTCONS_VERSION_FILENAME).toFile()), - phastConsPath, PHASTCONS_NAME); + Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_DATA); + dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile()); + List phastConsFiles = checkFiles(dataSource, phastConsPath, getDataName(PHASTCONS_DATA)); // Check PhyloP folder and files - Path phylopPath = conservedRegionPath.resolve(PHYLOP_SUBDIRECTORY); - List phylopFiles = 
checkFiles(dataSourceReader.readValue(conservedRegionPath.resolve(PHYLOP_VERSION_FILENAME).toFile()), - phylopPath, PHYLOP_NAME); + Path phylopPath = conservedRegionPath.resolve(PHYLOP_DATA); + dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile()); + List phylopFiles = checkFiles(dataSource, phylopPath, getDataName(PHYLOP_DATA)); // GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse // this file correctly, so we transform the file into a bedGraph format which is human-readable. if (gerpFiles.size() != 1) { - throw new CellBaseException("Only one " + GERP_NAME + " file is expected, but currently there are " + gerpFiles.size() - + " files"); + throw new CellBaseException("Only one " + getDataName(GERP_DATA) + " file is expected, but currently there are " + + gerpFiles.size() + " files"); } File bigwigFile = gerpFiles.get(0); File bedgraphFile = Paths.get(gerpFiles.get(0).getAbsolutePath() + ".bedgraph").toFile(); @@ -91,8 +92,8 @@ public void parse() throws IOException, CellBaseException { if (isExecutableAvailable(exec)) { EtlCommons.runCommandLineProcess(null, exec, Arrays.asList(bigwigFile.toString(), bedgraphFile.toString()), null); } else { - throw new CellBaseException(exec + " not found in your system, install it to build " + GERP_NAME + ". It is available" - + " at http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/"); + throw new CellBaseException(exec + " not found in your system, install it to build " + getDataName(GERP_DATA) + + ". 
It is available at http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/"); } } catch (IOException e) { throw new CellBaseException("Error executing " + exec + " in BIGWIG file " + bigwigFile, e); @@ -131,13 +132,13 @@ public void parse() throws IOException, CellBaseException { logger.debug("Chromosomes found '{}'", chromosomes); for (String chr : chromosomes) { logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHASTCONS_DATA)); - processWigFixFile(files.get(chr + PHASTCONS_DATA), PHASTCONS_NAME); + processWigFixFile(files.get(chr + PHASTCONS_DATA), PHASTCONS_DATA); logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHYLOP_DATA)); - processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_NAME); + processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_DATA); } - logger.info(BUILDING_DONE_LOG_MESSAGE, CONSERVATION_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); } private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException { @@ -156,8 +157,8 @@ private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseEx // Checking line if (fields.length != 4) { - throw new CellBaseException("Invalid " + GERP_NAME + " line (expecting 4 columns): " + fields.length + " items: " - + line); + throw new CellBaseException("Invalid " + getDataName(GERP_DATA) + " line (expecting 4 columns): " + fields.length + + " items: " + line); } chromosome = fields[0]; @@ -263,7 +264,7 @@ private void storeScores(int startOfBatch, String chromosome, List conser } GenomicScoreRegion conservationScoreRegion = new GenomicScoreRegion<>(chromosome, startOfBatch, - startOfBatch + conservationScores.size() - 1, GERP_NAME, conservationScores); + startOfBatch + conservationScores.size() - 1, GERP_DATA, conservationScores); fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome)); // Reset diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index 679e0d30f8..b14d20b54c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -42,34 +42,34 @@ public OntologyBuilder(Path oboDownloadPath, CellBaseSerializer serializer) { @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, ONTOLOGY_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); // Sanity check - checkDirectory(oboDownloadPath, REGULATION_NAME); + checkDirectory(oboDownloadPath, getDataName(REGULATION_DATA)); // Check ontology files - List hpoFiles = checkOboFiles(oboDownloadPath.resolve(HPO_OBO_VERSION_FILENAME), HPO_OBO_NAME); - List goFiles = checkOboFiles(oboDownloadPath.resolve(GO_OBO_VERSION_FILENAME), GO_OBO_NAME); - List doidFiles = checkOboFiles(oboDownloadPath.resolve(DOID_OBO_VERSION_FILENAME), DOID_OBO_NAME); - List mondoFiles = checkOboFiles(oboDownloadPath.resolve(MONDO_OBO_VERSION_FILENAME), MONDO_OBO_NAME); + List hpoFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(HPO_OBO_DATA)), getDataName(HPO_OBO_DATA)); + List goFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(GO_OBO_DATA)), getDataName(GO_OBO_DATA)); + List doidFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(DOID_OBO_DATA)), getDataName(DOID_OBO_DATA)); + List mondoFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(MONDO_OBO_DATA)), getDataName(MONDO_OBO_DATA)); // Parse OBO files and build - parseOboFile(hpoFiles.get(0), HPO_OBO_NAME); - parseOboFile(goFiles.get(0), GO_OBO_NAME); - parseOboFile(doidFiles.get(0), DOID_OBO_NAME); - parseOboFile(mondoFiles.get(0), MONDO_OBO_NAME); + parseOboFile(hpoFiles.get(0), HPO_OBO_DATA); + parseOboFile(goFiles.get(0), 
GO_OBO_DATA); + parseOboFile(doidFiles.get(0), DOID_OBO_DATA); + parseOboFile(mondoFiles.get(0), MONDO_OBO_DATA); // Close serializer serializer.close(); - logger.info(BUILDING_DONE_LOG_MESSAGE, ONTOLOGY_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); } - private void parseOboFile(File oboFile, String name) throws IOException { + private void parseOboFile(File oboFile, String data) throws IOException { logger.info(PARSING_LOG_MESSAGE, oboFile); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(oboFile.toPath())) { OboParser parser = new OboParser(); - List terms = parser.parseOBO(bufferedReader, name); + List terms = parser.parseOBO(bufferedReader, data); for (OntologyTerm term : terms) { serializer.serialize(term); } @@ -78,7 +78,8 @@ private void parseOboFile(File oboFile, String name) throws IOException { } private List checkOboFiles(Path versionFilePath, String name) throws IOException, CellBaseException { - List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath, ONTOLOGY_NAME + "/" + name); + List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath, getDataName(ONTOLOGY_DATA) + + "/" + name); if (files.size() != 1) { throw new CellBaseException("One " + name + " file is expected, but currently there are " + files.size() + " files"); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java index 0e6017fc01..1a0ba2e7d3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java @@ -24,6 +24,7 @@ import org.opencb.biodata.models.pharma.*; import org.opencb.biodata.models.pharma.guideline.BasicObject; import org.opencb.cellbase.core.exception.CellBaseException; +import 
org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; @@ -97,14 +98,15 @@ public PharmGKBBuilder(Path parmGkbDownloadPath, CellBaseFileSerializer serializ @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, PHARMGKB_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(PHARMGKB_DATA)); // Sanity check - checkDirectory(pharmGkbDownloadPath, PHARMGKB_NAME); + checkDirectory(pharmGkbDownloadPath, getDataName(PHARMGKB_DATA)); // Check PharmGKB files - List pharmGkbFiles = checkFiles(dataSourceReader.readValue(pharmGkbDownloadPath.resolve(PHARMGKB_VERSION_FILENAME).toFile()), - pharmGkbDownloadPath, PHARMACOGENOMICS_NAME + "/" + PHARMGKB_NAME); + DataSource dataSource = dataSourceReader.readValue(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA)).toFile()); + List pharmGkbFiles = checkFiles(dataSource, pharmGkbDownloadPath, getDataCategory(PHARMGKB_DATA) + "/" + + getDataName(PHARMGKB_DATA)); // Unzip downloaded file unzipDownloadedFiles(pharmGkbFiles); @@ -129,7 +131,7 @@ public void parse() throws Exception { } serializer.close(); - logger.info(BUILDING_DONE_LOG_MESSAGE, PHARMGKB_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PHARMGKB_DATA)); } private Map parseChemicalFile() throws IOException { @@ -152,7 +154,7 @@ private Map parseChemicalFile() throws IOException { // Label Has Dosing Info Has Rx Annotation RxNorm Identifiers ATC Identifiers PubChem Compound Identifiers PharmaChemical pharmaChemical = new PharmaChemical() .setId(fields[0]) - .setSource(PHARMGKB_NAME) + .setSource(PHARMGKB_DATA) .setName(fields[1]) .setSmiles(fields[7]) .setInChI(fields[8]); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index eb4c04a909..d8246241e4 
100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -22,6 +22,7 @@ import org.opencb.biodata.formats.protein.uniprot.UniProtParser; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.*; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; @@ -59,38 +60,30 @@ public ProteinBuilder(Path proteinPath, String species, CellBaseSerializer seria @Override public void parse() throws CellBaseException, IOException { - logger.info(BUILDING_LOG_MESSAGE, PROTEIN_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); // Sanity check - if (proteinPath == null) { - throw new CellBaseException(PROTEIN_NAME + " directory is missing (null)"); - } - if (!Files.exists(proteinPath)) { - throw new CellBaseException(PROTEIN_NAME + " directory " + proteinPath + " does not exist"); - } - if (!Files.isDirectory(proteinPath)) { - throw new CellBaseException(PROTEIN_NAME + " directory " + proteinPath + " is not a directory"); - } + checkDirectory(proteinPath, getDataName(PROTEIN_DATA)); // Check UniProt file - List uniProtFiles = checkFiles(dataSourceReader.readValue(proteinPath.resolve(UNIPROT_VERSION_FILENAME).toFile()), - proteinPath, PROTEIN_NAME + "/" + UNIPROT_NAME); + DataSource dataSource = dataSourceReader.readValue(proteinPath.resolve(getDataVersionFilename(UNIPROT_DATA)).toFile()); + List uniProtFiles = checkFiles(dataSource, proteinPath, getDataCategory(UNIPROT_DATA) + "/" + getDataName(UNIPROT_DATA)); if (uniProtFiles.size() != 1) { - throw new CellBaseException("Only one " + UNIPROT_NAME + " file is expected, but currently there are " + uniProtFiles.size() - + " files"); + throw new CellBaseException("Only one " + 
getDataName(UNIPROT_DATA) + " file is expected, but currently there are " + + uniProtFiles.size() + " files"); } // Check InterPro file - List interProFiles = checkFiles(dataSourceReader.readValue(proteinPath.resolve(INTERPRO_VERSION_FILENAME).toFile()), - proteinPath, PROTEIN_NAME + "/" + INTERPRO_NAME); + dataSource = dataSourceReader.readValue(proteinPath.resolve(getDataVersionFilename(INTERPRO_DATA)).toFile()); + List interProFiles = checkFiles(dataSource, proteinPath, getDataCategory(INTERPRO_DATA) + "/" + getDataName(INTERPRO_DATA)); if (interProFiles.size() != 1) { - throw new CellBaseException("Only one " + INTERPRO_NAME + " file is expected, but currently there are " + uniProtFiles.size() - + " files"); + throw new CellBaseException("Only one " + getDataName(INTERPRO_DATA) + " file is expected, but currently there are " + + interProFiles.size() + " files"); } // Prepare UniProt data by splitting data in chunks Path uniProtChunksPath = serializer.getOutdir().resolve(UNIPROT_CHUNKS_SUBDIRECTORY); - logger.info("Split {} file {} into chunks at {}", UNIPROT_NAME, uniProtFiles.get(0).getName(), uniProtChunksPath); + logger.info("Split {} file {} into chunks at {}", getDataName(UNIPROT_DATA), uniProtFiles.get(0).getName(), uniProtChunksPath); Files.createDirectories(uniProtChunksPath); splitUniprot(proteinPath.resolve(uniProtFiles.get(0).getName()), uniProtChunksPath); @@ -182,13 +175,13 @@ public void parse() throws CellBaseException, IOException { } if (++numInterProLinesProcessed % 10000000 == 0) { - logger.debug("{} {} lines processed. {} unique proteins processed", numInterProLinesProcessed, INTERPRO_NAME, - numUniqueProteinsProcessed); + logger.debug("{} {} lines processed. 
{} unique proteins processed", numInterProLinesProcessed, + getDataName(INTERPRO_DATA), numUniqueProteinsProcessed); } } logger.info(PARSING_DONE_LOG_MESSAGE, interProFiles.get(0)); } catch (IOException e) { - throw new CellBaseException("Error parsing " + INTERPRO_NAME + " file: " + interProFiles.get(0), e); + throw new CellBaseException("Error parsing " + getDataName(INTERPRO_DATA) + " file: " + interProFiles.get(0), e); } // Serialize and save results @@ -200,10 +193,10 @@ public void parse() throws CellBaseException, IOException { rocksDb.close(); } catch (JAXBException | RocksDBException | IOException e) { - throw new CellBaseException("Error parsing " + PROTEIN_NAME + " files", e); + throw new CellBaseException("Error parsing " + getDataName(PROTEIN_DATA) + " files", e); } - logger.info(BUILDING_DONE_LOG_MESSAGE, PROTEIN_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); } private RocksDB getDBConnection(Path uniProtChunksPath) throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index c8067661dc..83eccb9885 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -24,6 +24,7 @@ import org.opencb.biodata.models.core.RegulatoryFeature; import org.opencb.biodata.models.core.RegulatoryPfm; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -54,24 +55,26 @@ public RegulatoryFeatureBuilder(Path regulationPath, CellBaseSerializer serializ @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, 
REGULATION_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(REGULATION_DATA)); // Sanity check - checkDirectory(regulationPath, REGULATION_NAME); + checkDirectory(regulationPath, getDataName(REGULATION_DATA)); // Check build regulatory files - List regulatoryFiles = checkFiles(dataSourceReader.readValue(regulationPath.resolve(REGULATORY_BUILD_VERSION_FILENAME) - .toFile()), regulationPath, REGULATION_NAME + "/" + REGULATORY_BUILD_NAME); + DataSource dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)).toFile()); + List regulatoryFiles = checkFiles(dataSource, regulationPath, getDataCategory(REGULATORY_BUILD_DATA) + "/" + + getDataName(REGULATORY_BUILD_DATA)); if (regulatoryFiles.size() != 1) { - throw new CellBaseException("One " + REGULATORY_BUILD_NAME + " file is expected, but currently there are " + throw new CellBaseException("One " + getDataName(REGULATORY_BUILD_DATA) + " file is expected, but currently there are " + regulatoryFiles.size() + " files"); } // Check motif features files - List motifFeaturesFiles = checkFiles(dataSourceReader.readValue(regulationPath.resolve(MOTIF_FEATURES_VERSION_FILENAME) - .toFile()), regulationPath, REGULATION_NAME + "/" + MOTIF_FEATURES_NAME); + dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)).toFile()); + List motifFeaturesFiles = checkFiles(dataSource, regulationPath, getDataCategory(MOTIF_FEATURES_DATA) + "/" + + getDataName(MOTIF_FEATURES_DATA)); if (motifFeaturesFiles.size() != 2) { - throw new CellBaseException("Two " + MOTIF_FEATURES_NAME + " files are expected, but currently there are " + throw new CellBaseException("Two " + getDataName(MOTIF_FEATURES_DATA) + " files are expected, but currently there are " + motifFeaturesFiles.size() + " files"); } @@ -82,7 +85,7 @@ public void parse() throws Exception { // Parse regulatory build features parseGffFile(regulatoryFiles.get(0).toPath()); - 
logger.info(BUILDING_DONE_LOG_MESSAGE, REGULATION_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); } protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSuchMethodException, FileFormatException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java index 6cefc0266f..5ffabf747b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java @@ -20,8 +20,8 @@ import org.opencb.biodata.models.variant.avro.Repeat; import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.ProgressLogger; import org.opencb.commons.utils.FileUtils; @@ -51,53 +51,56 @@ public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer, CellBase @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, EtlCommons.REPEATS_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(REPEATS_DATA)); + + // Sanity check + checkDirectory(filesDir, getDataName(REPEATS_DATA)); // Check Simple Repeats (TRF) filename String trfFilename = Paths.get(configuration.getDownload().getSimpleRepeats().getFiles().get(SIMPLE_REPEATS_FILE_ID)).getFileName() .toString(); if (!Files.exists(filesDir.resolve(trfFilename))) { - throw new CellBaseException(TRF_NAME + " file " + trfFilename + " does not exist at " + filesDir); + throw new CellBaseException(getMessageMissingFile(TRF_DATA, trfFilename, filesDir)); } // Check Genomic Super Duplications (GSD) file String gsdFilename = 
Paths.get(configuration.getDownload().getGenomicSuperDups().getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID)) .getFileName().toString(); if (!Files.exists(filesDir.resolve(gsdFilename))) { - throw new CellBaseException(GSD_NAME + " file " + gsdFilename + " does not exist at " + filesDir); + throw new CellBaseException(getMessageMissingFile(GSD_DATA, gsdFilename, filesDir)); } // Check Window Masker (WM) file String wmFilename = Paths.get(configuration.getDownload().getWindowMasker().getFiles().get(WINDOW_MASKER_FILE_ID)).getFileName() .toString(); if (!Files.exists(filesDir.resolve(wmFilename))) { - throw new CellBaseException(WM_NAME + " file " + wmFilename + " does not exist at " + filesDir); + throw new CellBaseException(getMessageMissingFile(WM_DATA, wmFilename, filesDir)); } // Parse TRF file - logger.info(BUILDING_LOG_MESSAGE, TRF_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(TRF_DATA)); parseTrfFile(filesDir.resolve(trfFilename)); - logger.info(BUILDING_DONE_LOG_MESSAGE, TRF_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(TRF_DATA)); // Parse GSD file - logger.info(BUILDING_LOG_MESSAGE, GSD_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(GSD_DATA)); parseGsdFile(filesDir.resolve(gsdFilename)); - logger.info(BUILDING_DONE_LOG_MESSAGE, GSD_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GSD_DATA)); // Parse WM file - logger.info(BUILDING_LOG_MESSAGE, WM_NAME); + logger.info(BUILDING_LOG_MESSAGE, getDataName(WM_DATA)); parseWmFile(filesDir.resolve(wmFilename)); - logger.info(BUILDING_DONE_LOG_MESSAGE, WM_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(WM_DATA)); - logger.info(BUILDING_DONE_LOG_MESSAGE, EtlCommons.REPEATS_NAME); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); } - private void parseTrfFile(Path filePath) throws IOException { + private void parseTrfFile(Path filePath) throws IOException, CellBaseException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) 
{ String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed " + TRF_NAME + " lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(TRF_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseTrfLine(line)); line = bufferedReader.readLine(); @@ -111,15 +114,15 @@ private Repeat parseTrfLine(String line) { return new Repeat(null, Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), Integer.valueOf(parts[5]), Integer.valueOf(parts[7]), - Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF_NAME); + Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF_DATA); } - private void parseGsdFile(Path filePath) throws IOException { + private void parseGsdFile(Path filePath) throws IOException, CellBaseException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed " + GSD_NAME + " lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(GSD_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseGSDLine(line)); line = bufferedReader.readLine(); @@ -133,16 +136,16 @@ private Repeat parseGSDLine(String line) { return new Repeat(parts[11], Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, 2f, Float.valueOf(parts[26]), null, - null, GSD_NAME); + null, GSD_DATA); } - private void parseWmFile(Path filePath) throws IOException { + private void parseWmFile(Path filePath) throws IOException, 
CellBaseException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed " + WM_NAME + " lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(WM_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseWmLine(line)); line = bufferedReader.readLine(); @@ -155,6 +158,16 @@ private Repeat parseWmLine(String line) { String[] parts = line.split("\t"); return new Repeat(parts[4].replace("\t", ""), Region.normalizeChromosome(parts[1]), - Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM_NAME); + Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM_DATA); + } + + private String getMessageMissingFile(String data, String filename, Path folder) throws CellBaseException { + return getDataName(data) + " file " + filename + " does not exist at " + folder; } + + private String getMessageParsedLines(String data) throws CellBaseException { + return "Parsed " + getDataName(data) + " lines:"; + } + } + From 754384abae10c71cb156a4c0e2abd4688199ebf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 29 Apr 2024 13:03:02 +0200 Subject: [PATCH 098/107] lib: fix revel builder, #TASK-5776, #TASK-5564 --- .../org/opencb/cellbase/lib/builders/RevelScoreBuilder.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 68c6128f25..06f38f28f0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -58,11 +58,11 @@ public void parse() throws IOException, CellBaseException { throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + revelFiles.size() + " files"); } - ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelDownloadPath))); - ZipEntry zipEntry = zis.getNextEntry(); - logger.info(PARSING_LOG_MESSAGE, revelFiles.get(0)); + ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelFiles.get(0)))); + ZipEntry zipEntry = zis.getNextEntry(); + ZipFile zipFile = new ZipFile(revelFiles.get(0).toString()); InputStream inputStream = zipFile.getInputStream(zipEntry); try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) { From 24eb0911b83638028d61675b98bedbc7a8eccf1a Mon Sep 17 00:00:00 2001 From: imedina Date: Tue, 7 May 2024 01:48:53 +0100 Subject: [PATCH 099/107] configuration: update versions --- .../app/scripts/ensembl-scripts/DB_CONFIG.pm | 8 +++--- .../src/main/resources/configuration.yml | 27 ++++++++++--------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm index 70865465e9..b0edf65793 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm +++ b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm @@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157"; our $ENSEMBL_GENOMES_USER = "anonymous"; ## Vertebrates -our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38"; -our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38"; -our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38"; -our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38"; +our $HOMO_SAPIENS_CORE = "homo_sapiens_core_111_38"; +our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_111_38"; +our $HOMO_SAPIENS_FUNCTIONAL = 
"homo_sapiens_funcgen_111_38"; +our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_111_38"; #our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38"; #our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38"; #our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38"; diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index af817b1844..32a94fb765 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -73,7 +73,7 @@ download: host: ftp://ftp.ensemblgenomes.org/pub refSeq: host: https://ftp.ncbi.nih.gov/refseq/ - version: "October 16, 2023 (GRCh38.p14)" + version: "2023-10-11" files: GENOMIC_GTF: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz @@ -81,9 +81,9 @@ download: RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz maneSelect: host: https://ftp.ncbi.nlm.nih.gov/refseq/ - version: "1.1" + version: "1.2" files: - MANE_SELECT: MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz + MANE_SELECT: MANE/MANE_human/release_1.2/MANE.GRCh38.v1.2.summary.txt.gz lrg: host: http://ftp.ebi.ac.uk/ version: "2021-03-30" @@ -91,9 +91,9 @@ download: LRG: pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt hgnc: host: https://ftp.ebi.ac.uk/ - version: "2023-11-01" + version: "2024-04-01" files: - HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt + HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2024-04-01.txt cancerHotspot: host: https://www.cancerhotspots.org/ version: "v2" @@ -106,7 +106,7 @@ download: DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv geneUniprotXref: host: http://ftp.uniprot.org/ - version: "2024_01 (24-Jan-2024)" + version: "2024-03-27" files: UNIPROT_XREF: 
pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz geneExpressionAtlas: @@ -116,6 +116,7 @@ download: GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz hpo: ## NOTE: Download manually from here now + version: "2024-04-26" host: https://hpo.jax.org/app/data/annotations disgenet: host: https://www.disgenet.org/ @@ -149,12 +150,12 @@ download: ## Protein Data uniprot: host: https://ftp.uniprot.org/ - version: "2024-01-24" + version: "2024-03-27" files: UNIPROT: pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz interpro: host: https://ftp.ebi.ac.uk/ - version: "2024-01-24" + version: "2024-03-27" files: INTERPRO: pub/databases/interpro/current_release/protein2ipr.dat.gz intact: @@ -180,7 +181,7 @@ download: host: http://ftp.ensembl.org/ version: "2023-05-17" files: - GERP: pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw ## Clinical Variant clinvar: @@ -204,11 +205,11 @@ download: files: HGMD: hgmd.vcf gwasCatalog: - ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e110_r2023-12-20' + ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e111_r2024-04-22' host: https://ftp.ebi.ac.uk/ - version: "2024-02-12" + version: "2024-04-22" files: - GWAS: pub/databases/gwas/releases/2024/02/12/gwas-catalog-associations_ontology-annotated.tsv + GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv DBSNP: All.vcf.gz dgv: @@ -290,7 +291,7 @@ species: - id: hsapiens scientificName: Homo sapiens assemblies: - - ensemblVersion: '110_38' + - ensemblVersion: '111_38' name: GRCh38 - 
ensemblVersion: '82_37' name: GRCh37 From fc09da4b34d0865d418db6c688074e586a16dc33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 7 May 2024 11:17:17 +0200 Subject: [PATCH 100/107] app: add bash script to fix the downloaded MirTarBase file, #TASK-5775, #TASK-5564 --- .../app/scripts/mirtarbase/fix-gene-symbol.sh | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100755 cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh diff --git a/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh b/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh new file mode 100755 index 0000000000..38c7d1efa2 --- /dev/null +++ b/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# The original MirTarBase hsa_MTI.xlsx contains invalid Gene Symbols in 793 lines. +# To fix it, that file has to be converted to a CSV file, i.e.: hsa_MTI.csv +# +# After converting to CSV file, we can see the errors from the original file for the Gene Symbols (column 4), +# e.g.: 06-mar: +# MIRT050267,hsa-miR-25-3p,Homo sapiens,06-mar,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 +# MIRT051174,hsa-miR-16-5p,Homo sapiens,06-mar,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 +# +# This script fix those lines and convert the column 4 for a vaild Gene Symbol: +# +# MIRT050267,hsa-miR-25-3p,Homo sapiens,MARCHF6,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 +# MIRT051174,hsa-miR-16-5p,Homo sapiens,MARCHF6,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 + +# Check the parameters number +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Check CSV file +csv_file="$1" +if [ ! -f "$csv_file" ]; then + echo "CSV file '$csv_file' does not exist." 
+ exit 1 +fi + +# Fix gene-symbol +while IFS=$'\t' read -r c1 c2 c3 c4 c5 c6 c7 c8 c9 || [[ -n "$c1" ]]; do + # Aplica las condiciones + if [ "$c5" = "10299" ]; then + c4="MARCHF6" + elif [ "$c5" = "51257" ]; then + c4="MARCHF2" + elif [ "$c5" = "54708" ]; then + c4="MARCHF5" + elif [ "$c5" = "54996" ]; then + c4="MTARC2" + elif [ "$c5" = "55016" ]; then + c4="MARCHF1" + elif [ "$c5" = "57574" ]; then + c4="MARCHF4" + elif [ "$c5" = "64757" ]; then + c4="MTARC1" + elif [ "$c5" = "64844" ]; then + c4="MARCHF7" + elif [ "$c5" = "92979" ]; then + c4="MARCHF9" + elif [ "$c5" = "115123" ]; then + c4="MARCHF3" + elif [ "$c5" = "220972" ]; then + c4="MARCHF8" + elif [ "$c5" = "441061" ]; then + c4="MARCHF11" + fi + + # Print line + echo -e "$c1\t$c2\t$c3\t$c4\t$c5\t$c6\t$c7\t$c8\t$c9" +done < "$csv_file" From 09d33a0ef6eb858a64c7b5b6796ae098cb5d36f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 7 May 2024 11:19:29 +0200 Subject: [PATCH 101/107] core: add some comments to the configuration file, #TASK-5775, #TASK-5564 --- cellbase-core/src/main/resources/configuration.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index af817b1844..a7b6a78f48 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -144,6 +144,8 @@ download: host: https://mirtarbase.cuhk.edu.cn/ version: "9.0" files: + # This file contains errors and has to be fixed before building + # check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx ## Protein Data From 303585debf0fc5225d0948b25f271ca6185e6c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 7 May 2024 11:30:01 +0200 Subject: [PATCH 102/107] lib: update Ensembl/RefSeq indexers and 
builders (include major improvements and sonnar fixes), #TASK-5776, #TASK-5564 --- .../app/cli/admin/AdminCliOptionsParser.java | 17 +- .../cellbase/app/cli/admin/AdminMain.java | 1 + .../admin/executors/BuildCommandExecutor.java | 26 +- .../org/opencb/cellbase/lib/EtlCommons.java | 7 + .../lib/builders/CellBaseBuilder.java | 41 +- .../lib/builders/EnsemblGeneBuilder.java | 956 ++++++++++++++++++ .../builders/EnsemblGeneBuilderIndexer.java | 375 ++----- .../cellbase/lib/builders/GeneBuilder.java | 910 +---------------- .../lib/builders/GeneBuilderIndexer.java | 617 +++++++---- .../lib/builders/RefSeqGeneBuilder.java | 169 ++-- .../builders/RefSeqGeneBuilderIndexer.java | 261 +---- .../cellbase/lib/builders/RocksDbManager.java | 5 +- .../lib/builders/EnsemblGeneBuilderTest.java | 22 + .../lib/builders/GeneBuilderTest.java | 94 +- 14 files changed, 1706 insertions(+), 1795 deletions(-) create mode 100644 cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java create mode 100644 cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 55342641b3..1bda7d2793 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -24,6 +24,8 @@ import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 03/02/15. 
*/ @@ -87,10 +89,10 @@ public class DownloadCommandOptions { @ParametersDelegate public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, " - + "variation_functional_score, missense_variation_functional_score, regulation, protein, conservation, " - + "clinical_variants, repeats, ontology, pubmed and pharmacogenomics; or use 'all' to download everything", - required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: " + GENOME_DATA + "," + GENE_DATA + + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANTS_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + PUBMED_DATA + + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to download everything", required = true, arity = 1) public String data; @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, @@ -104,9 +106,10 @@ public class BuildCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, " - + "gene, variation, variation_functional_score, regulation, protein, ppi, conservation, drug, " - + "clinical_variants, repeats, svs, splice_score, pubmed. 
'all' builds everything.", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: " + GENOME_DATA + "," + GENE_DATA + "," + + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + "," + + CONSERVATION_DATA + "," + CLINICAL_VARIANTS_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA + + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to build everything", required = true, arity = 1) public String data; @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java index d77722a492..fecf57c08a 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java @@ -98,6 +98,7 @@ public static void main(String[] args) { commandExecutor.execute(); } catch (IOException | URISyntaxException | CellBaseException e) { commandExecutor.getLogger().error("Error: " + e.getMessage()); + e.printStackTrace(); System.exit(1); } } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 0cf6b17899..081880ebe3 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -64,9 +64,9 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean flexibleGTFParsing; private SpeciesConfiguration speciesConfiguration; - private 
static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, REFSEQ_DATA, - VARIATION_FUNCTIONAL_SCORE_DATA, MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, - CLINICAL_VARIANTS_DATA, REPEATS_DATA, ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); + private static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, + MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANTS_DATA, REPEATS_DATA, + ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -135,9 +135,6 @@ public void execute() throws CellBaseException { case GENE_DATA: parser = buildGene(); break; - case REFSEQ_DATA: - parser = buildRefSeq(); - break; case VARIATION_FUNCTIONAL_SCORE_DATA: parser = buildCadd(); break; @@ -246,22 +243,7 @@ private CellBaseBuilder buildGenomeSequence() throws CellBaseException { } private CellBaseBuilder buildGene() throws CellBaseException { - Path geneFolderPath = downloadFolder.resolve("gene"); - copyVersionFiles(Arrays.asList(geneFolderPath.resolve("dgidbVersion.json"), - geneFolderPath.resolve("ensemblCoreVersion.json"), geneFolderPath.resolve("uniprotXrefVersion.json"), - geneFolderPath.resolve("geneExpressionAtlasVersion.json"), - geneFolderPath.resolve("hpoVersion.json"), geneFolderPath.resolve("disgenetVersion.json"), - geneFolderPath.resolve("gnomadVersion.json"))); - Path genomeFastaFilePath = getFastaReferenceGenome(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "gene"); - return new GeneBuilder(geneFolderPath, genomeFastaFilePath, speciesConfiguration, flexibleGTFParsing, serializer); - } - - private CellBaseBuilder buildRefSeq() { - Path refseqFolderPath = 
downloadFolder.resolve("refseq"); - copyVersionFiles(Arrays.asList(refseqFolderPath.resolve("refSeqVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "refseq"); - return new RefSeqGeneBuilder(refseqFolderPath, speciesConfiguration, serializer); + return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing); } private CellBaseBuilder buildCadd() throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index a836288d6f..e0a19c7114 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -44,6 +44,10 @@ */ public final class EtlCommons { + // Commons + public static final String XLSX_EXTENSION = ".xlsx"; + public static final String CSV_EXTENSION = ".csv"; + // Ensembl public static final String ENSEMBL_DATA = "ensembl"; public static final String PUT_RELEASE_HERE_MARK = "put_release_here"; @@ -61,6 +65,7 @@ public final class EtlCommons { public static final String ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; + public static final String HSAPIENS_NAME= "hsapiens"; public static final String GRCH38_NAME = "GRCh38"; public static final String GRCH37_NAME = "GRCh37"; @@ -74,11 +79,13 @@ public final class EtlCommons { // Gene public static final String GENE_DATA = "gene"; + public static final String ENSEMBL_GENE_BASENAME = "ensemblGene"; public static final String GENE_ANNOTATION_DATA = "gene_annotation"; public static final String GENE_DISEASE_ANNOTATION_DATA = "gene_disease_annotation"; // RefSeq public static final String REFSEQ_DATA = "refseq"; + public static final String REFSEQ_GENE_BASENAME = "refSeqGene"; // Must match the configuration file public static 
final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index f5e79320d7..26fb2e838b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -18,6 +18,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; +import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -25,12 +26,15 @@ import org.slf4j.LoggerFactory; import java.io.File; +import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 30/08/14. 
*/ @@ -39,33 +43,54 @@ public abstract class CellBaseBuilder { protected CellBaseSerializer serializer; protected ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); + protected boolean checked; + protected Logger logger; + public static final String CHECKING_BEFORE_BUILDING_LOG_MESSAGE = "Checking files before building {} ..."; + public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking done!"; + public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; - public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done."; + public static final String BUILDING_DONE_LOG_MESSAGE = "Building done!"; public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ..."; - public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building {}/{} done."; + public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building done!"; public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; - public static final String PARSING_DONE_LOG_MESSAGE = "Parsing {} done."; + public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done!"; public CellBaseBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); this.serializer = serializer; - //this.serializer.open(); + this.checked = false; } public abstract void parse() throws Exception; public void disconnect() { - try { - serializer.close(); - } catch (Exception e) { - logger.error("Disconnecting serializer: " + e.getMessage()); + if (serializer != null) { + try { + serializer.close(); + } catch (Exception e) { + logger.error("Error closing serializer:\n" + StringUtils.join(e.getStackTrace(), "\n")); + } + } + } + + protected List checkFiles(String data, Path downloadPath, int expectedFiles) throws CellBaseException, IOException { + return checkFiles(getDataName(data), data, downloadPath, expectedFiles); + } + + protected List checkFiles(String label, String data, Path downloadPath, int 
expectedFiles) throws CellBaseException, IOException { + List files = checkFiles(dataSourceReader.readValue(downloadPath.resolve(getDataVersionFilename(data)).toFile()), + downloadPath, label); + if (files.size() != expectedFiles) { + throw new CellBaseException(expectedFiles + " " + label + " files are expected at " + downloadPath + ", but currently there" + + " are " + files.size() + " files"); } + return files; } protected List checkFiles(DataSource dataSource, Path targetPath, String name) throws CellBaseException { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java new file mode 100644 index 0000000000..a7e6b9f1cf --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -0,0 +1,956 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.builders; + +import htsjdk.tribble.readers.TabixReader; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.formats.feature.gff.Gff2; +import org.opencb.biodata.formats.feature.gtf.Gtf; +import org.opencb.biodata.formats.feature.gtf.io.GtfReader; +import org.opencb.biodata.formats.io.FileFormatException; +import org.opencb.biodata.models.core.*; +import org.opencb.biodata.tools.sequence.FastaIndex; +import org.opencb.cellbase.core.ParamConstants; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; +import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.rocksdb.RocksDBException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.stream.Collectors; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class EnsemblGeneBuilder extends CellBaseBuilder { + + private Path downloadPath; + private SpeciesConfiguration speciesConfiguration; + private boolean flexibleGTFParsing; + private CellBaseConfiguration configuration; + + private Map transcriptDict; + private Map exonDict; + + private Path gtfFile; + private Path proteinFastaFile; + private Path cDnaFastaFile; + private Path geneDescriptionFile; + private Path xrefsFile; + private Path hgncFile; + private Path maneFile; + private Path lrgFile; + private Path uniprotIdMappingFile; + private Path tfbsFile; + private Path tabixFile; + private Path geneExpressionFile; + private Path geneDrugFile; + private Path hpoFile; + private Path disgenetFile; + private Path genomeSequenceFilePath; + private Path gnomadFile; + private Path geneOntologyAnnotationFile; + private Path miRBaseFile; + private Path miRTarBaseFile; + private 
Path cancerGeneCensusFile; + private Path cancerHostpotFile; + private Path ensemblCanonicalFile; + private Path tso500File; + private Path eglhHaemOncFile; + + // source for genes is either ensembl or refseq + private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); + + private int geneCounter; + private ArrayList geneList; + private String geneName; + private int transcriptCounter; + private ArrayList transcriptList; + private String transcriptName; + private int exonCounter; + private String feature; + private Gtf nextGtfToReturn; + + public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, + CellBaseSerializer serializer) { + super(serializer); + + this.downloadPath = downloadPath; + this.speciesConfiguration = speciesConfiguration; + this.flexibleGTFParsing = flexibleGTFParsing; + + transcriptDict = new HashMap<>(250000); + exonDict = new HashMap<>(8000000); + } + + public void check() throws Exception { + if (checked) { + return; + } + + String ensemblGeneLabel = getDataName(ENSEMBL_DATA) + " " + getDataName(GENE_DATA); + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); + + // Sanity check + checkDirectory(downloadPath, ensemblGeneLabel); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); + } + } + + // Check Ensembl files + List files = checkFiles(ensemblGeneLabel, ENSEMBL_DATA, downloadPath, 3); + gtfFile = files.stream().filter(f -> f.getName().contains(".gtf")).findFirst().get().toPath(); + proteinFastaFile = files.stream().filter(f -> f.getName().contains(".pep.all.fa")).findFirst().get().toPath(); + cDnaFastaFile = files.stream().filter(f -> f.getName().contains(".cdna.all.fa")).findFirst().get().toPath(); + + // Check common files + // geneDescriptionFile = + // xrefsFile = + maneFile = 
checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); + cancerHostpotFile = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // hpoFile = checkFiles(HPO_DATA, downloadPath.getParent(), 1); + disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneOntologyAnnotationFile = checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // ensemblCanonicalFile = ; + // cancerGeneCensus = + // tso500File = + // eglhHaemOncFile = + + // Check regulation files + // Motif features + files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 2); + if (files.get(0).getName().endsWith("tbi")) { + tabixFile = files.get(0).toPath(); + tfbsFile = files.get(1).toPath(); + } else { + tabixFile = files.get(1).toPath(); + tfbsFile = files.get(0).toPath(); + } + // mirbase + miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); + + // mirtarbase + // The downloaded .xlsx file contains errors and it has to be fixed manually + logger.info("Checking {} folder and files", getDataName(MIRTARBASE_DATA)); + Path downloadRegulationPath = downloadPath.getParent().getParent().resolve(REGULATION_DATA); + List mirTarBaseFiles = ((DataSource) dataSourceReader.readValue(downloadRegulationPath.resolve( + 
getDataVersionFilename(MIRTARBASE_DATA)).toFile())).getUrls().stream().map(u -> Paths.get(u).getFileName().toString()) + .collect(Collectors.toList()); + if (mirTarBaseFiles.size() != 1) { + throw new CellBaseException("One " + getDataName(MIRTARBASE_DATA) + " file is expected at " + downloadRegulationPath + + ", but currently there are " + mirTarBaseFiles.size() + " files"); + } + // The hsa_MIT.xlsx is fixed and converted to hsa_MIT.csv manually + if (!mirTarBaseFiles.get(0).endsWith(XLSX_EXTENSION)) { + throw new CellBaseException("A " + XLSX_EXTENSION + " " + getDataName(MIRTARBASE_DATA) + " file is expected at " + + downloadRegulationPath + ", but currently it is named " + mirTarBaseFiles.get(0)); + } + miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION)); + if (!Files.exists(miRTarBaseFile)) { + throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist"); + } + + // Check genome fasta file + genomeSequenceFilePath = checkFiles(GENOME_DATA, downloadPath.getParent().getParent().resolve(GENOME_DATA), 1).get(0).toPath(); + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE); + checked = true; + } + + public void parse() throws Exception { + check(); + + Gene gene = null; + Transcript transcript; + Exon exon = null; + int cdna = 1; + int cds = 1; + + EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(serializer.getOutdir()); + + try { + // process files and put values in rocksdb + indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, + proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, + geneDrugFile, hpoFile, disgenetFile, gnomadFile, geneOntologyAnnotationFile, miRBaseFile, + miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile, + tso500File, eglhHaemOncFile); + + TabixReader tabixReader = null; + if 
(!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { + logger.error("Tfbs or tabix file not found. Download them and try again."); + } else { + tabixReader = new TabixReader(tfbsFile.toAbsolutePath().toString(), tabixFile.toAbsolutePath().toString()); + } + + // Preparing the fasta file for fast accessing +// System.out.println("genomeSequenceFilePath.toString() = " + genomeSequenceFilePath.toString()); + FastaIndex fastaIndex = new FastaIndex(genomeSequenceFilePath); + + // Empty transcript and exon dictionaries + transcriptDict.clear(); + exonDict.clear(); + + logger.info(PARSING_LOG_MESSAGE, gtfFile); + GtfReader gtfReader = new GtfReader(gtfFile); + + // Gene->Transcript->Feature->GTF line + Map>> gtfMap = null; + if (flexibleGTFParsing) { + gtfMap = loadGTFMap(gtfReader); + initializePointers(gtfMap); + } + + Gtf gtf; + while ((gtf = getGTFEntry(gtfReader, gtfMap)) != null) { + + if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") + || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { + continue; + } + + String geneId = gtf.getAttributes().get("gene_id"); + String transcriptId = gtf.getAttributes().get("transcript_id"); + String geneName = gtf.getAttributes().get("gene_name"); + if (newGene(gene, geneId)) { + // If new geneId is different from the current then we must serialize before data new gene + if (gene != null) { + serializer.serialize(gene); + } + + GeneAnnotation geneAnnotation = new GeneAnnotation(indexer.getExpression(geneId), indexer.getDiseases(geneName), + indexer.getDrugs(geneName), indexer.getConstraints(geneId), indexer.getMirnaTargets(geneName), + indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); + + gene = new Gene(geneId, geneName, gtf.getSequenceName().replaceFirst("chr", ""), + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), gtf.getAttributes().get("gene_version"), + gtf.getAttributes().get("gene_biotype"), "KNOWN", SOURCE, indexer.getDescription(geneId), + 
new ArrayList<>(), indexer.getMirnaGene(transcriptId), geneAnnotation); + } + + // Check if Transcript exist in the Gene Set of transcripts + if (!transcriptDict.containsKey(transcriptId)) { + transcript = getTranscript(gene, indexer, tabixReader, gtf, transcriptId); + } else { + transcript = gene.getTranscripts().get(transcriptDict.get(transcriptId)); + } + + // At this point gene and transcript objects are set up + // Update gene and transcript genomic coordinates, start must be the + // lower, and end the higher + updateTranscriptAndGeneCoords(transcript, gene, gtf); + + String transcriptIdWithoutVersion = transcript.getId().split("\\.")[0]; + if (gtf.getFeature().equalsIgnoreCase("exon")) { + // Obtaining the exon sequence + String exonId = gtf.getAttributes().get("exon_id") + "." + gtf.getAttributes().get("exon_version"); + String exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); + + exon = new Exon(exonId, gtf.getSequenceName().replaceFirst("chr", ""), + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), 0, 0, 0, 0, 0, 0, -1, Integer.parseInt(gtf + .getAttributes().get("exon_number")), exonSequence); + transcript.getExons().add(exon); + + exonDict.put(transcriptIdWithoutVersion + "_" + exon.getExonNumber(), exon); + if (gtf.getAttributes().get("exon_number").equals("1")) { + cdna = 1; + cds = 1; + } else { + // with every exon we update cDNA length with the previous exon length + cdna += exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getEnd() + - exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getStart() + 1; + } + } else { + exon = exonDict.get(transcriptIdWithoutVersion + "_" + exon.getExonNumber()); + if (gtf.getFeature().equalsIgnoreCase("CDS")) { + // Protein ID is only present in CDS lines + String proteinId = gtf.getAttributes().get("protein_id") != null + ? gtf.getAttributes().get("protein_id") + "." 
+ gtf.getAttributes().get("protein_version") + : ""; + transcript.setProteinId(proteinId); + transcript.setProteinSequence(indexer.getProteinFasta(proteinId)); + + if (gtf.getStrand().equals("+") || gtf.getStrand().equals("1")) { + // CDS states the beginning of coding start + exon.setGenomicCodingStart(gtf.getStart()); + exon.setGenomicCodingEnd(gtf.getEnd()); + + // cDNA coordinates + exon.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); + exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + // Set cdnaCodingEnd to prevent those cases without stop_codon + + transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + exon.setCdsStart(cds); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // increment in the coding length + cds += gtf.getEnd() - gtf.getStart() + 1; + transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon + + exon.setPhase(Integer.parseInt(gtf.getFrame())); + + if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { + transcript.setGenomicCodingStart(gtf.getStart()); + } + if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { + transcript.setGenomicCodingEnd(gtf.getEnd()); + } + // only first time + if (transcript.getCdnaCodingStart() == 0) { + transcript.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); + } + // strand - + } else { + // CDS states the beginning of coding start + exon.setGenomicCodingStart(gtf.getStart()); + exon.setGenomicCodingEnd(gtf.getEnd()); + // cDNA coordinates + // cdnaCodingStart points to the same base position than genomicCodingEnd + exon.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); + // cdnaCodingEnd points to the same base position than genomicCodingStart + exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + // Set cdnaCodingEnd to prevent those cases without stop_codon + transcript.setCdnaCodingEnd(exon.getEnd() - 
gtf.getStart() + cdna); + exon.setCdsStart(cds); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // increment in the coding length + cds += gtf.getEnd() - gtf.getStart() + 1; + transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon + exon.setPhase(Integer.parseInt(gtf.getFrame())); + + if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { + transcript.setGenomicCodingStart(gtf.getStart()); + } + if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { + transcript.setGenomicCodingEnd(gtf.getEnd()); + } + // only first time + if (transcript.getCdnaCodingStart() == 0) { + // cdnaCodingStart points to the same base position than genomicCodingEnd + transcript.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); + } + } + + } +// if (gtf.getFeature().equalsIgnoreCase("start_codon")) { +// // nothing to do +// System.out.println("Empty block, this should be redesigned"); +// } + if (gtf.getFeature().equalsIgnoreCase("stop_codon")) { + // setCdnaCodingEnd = false; // stop_codon found, cdnaCodingEnd will be set here, + // no need to set it at the beginning of next feature + if (exon.getStrand().equals("+")) { + updateStopCodingDataPositiveExon(exon, cdna, cds, gtf); + + cds += gtf.getEnd() - gtf.getStart(); + // If stop_codon appears, overwrite values + transcript.setGenomicCodingEnd(gtf.getEnd()); + transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + transcript.setCdsLength(cds - 1); + + } else { + updateNegativeExonCodingData(exon, cdna, cds, gtf); + + cds += gtf.getEnd() - gtf.getStart(); + // If stop_codon appears, overwrite values + transcript.setGenomicCodingStart(gtf.getStart()); + // cdnaCodingEnd points to the same base position than genomicCodingStart + transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + transcript.setCdsLength(cds - 1); + } + } + } + } + + // last gene must be 
serialized + serializer.serialize(gene); + + // Close + gtfReader.close(); + serializer.close(); + fastaIndex.close(); + indexer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gtfFile); + } catch (Exception e) { + indexer.close(); + throw e; + } + } + + private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, TabixReader tabixReader, Gtf gtf, String transcriptId) + throws IOException, RocksDBException { + Map gtfAttributes = gtf.getAttributes(); + + // To match Ensembl, we set the ID as transcript+version. This also matches the Ensembl website. + String transcriptIdWithVersion = transcriptId + "." + gtfAttributes.get("transcript_version"); + String biotype = gtfAttributes.get("transcript_biotype") != null ? gtfAttributes.get("transcript_biotype") : ""; + String transcriptChromosome = gtf.getSequenceName().replaceFirst("chr", ""); + List transcriptTfbses = getTranscriptTfbses(gtf, transcriptChromosome, tabixReader); + + List ontologyAnnotations = getOntologyAnnotations(indexer.getXrefs(transcriptId), indexer); + TranscriptAnnotation transcriptAnnotation = new TranscriptAnnotation(ontologyAnnotations, indexer.getConstraints(transcriptId)); + + Transcript transcript = new Transcript(transcriptIdWithVersion, gtfAttributes.get("transcript_name"), transcriptChromosome, + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, "KNOWN", + 0, 0, 0, 0, 0, + indexer.getCdnaFasta(transcriptIdWithVersion), "", "", "", + gtfAttributes.get("transcript_version"), SOURCE, new ArrayList<>(), indexer.getXrefs(transcriptId), transcriptTfbses, + new HashSet<>(), transcriptAnnotation); + + // Adding Ids appearing in the GTF to the xrefs is required, since for some unknown reason the ENSEMBL + // Perl API often doesn't return all genes resulting in an incomplete xrefs.txt file. 
We must ensure + // that the xrefs array contains all ids present in the GTF file + addGtfXrefs(transcript, gene, gtfAttributes); + + // Add HGNC ID mappings, with this we can know which Ensembl and Refseq transcripts match to HGNC ID + String hgncId = indexer.getHgncId(gene.getName()); + if (StringUtils.isNotEmpty(hgncId)) { + transcript.getXrefs().add(new Xref(hgncId, "hgnc_id", "HGNC ID")); + } + + // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE + for (String suffix: Arrays.asList("refseq", "refseq_protein")) { + String maneRefSeq = indexer.getMane(transcriptIdWithVersion, suffix); + if (StringUtils.isNotEmpty(maneRefSeq)) { + transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, + "MANE Select RefSeq" + (suffix.contains("_") ? " Protein" : ""))); + } + } + + // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG + String lrgRefSeq = indexer.getLrg(transcriptIdWithVersion, "refseq"); + if (StringUtils.isNotEmpty(lrgRefSeq)) { + transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_refseq", "LRG RefSeq")); + } + + // Add Flags + // 1. GTF tags + String tags = gtf.getAttributes().get("tag"); + if (StringUtils.isNotEmpty(tags)) { + transcript.getFlags().addAll(Arrays.asList(tags.split(","))); + } + // 2. TSL + String supportLevel = gtfAttributes.get("transcript_support_level"); + if (StringUtils.isNotEmpty(supportLevel)) { + // split on space so "5 (assigned to previous version 3)" and "5" both become "TSL:5" + String truncatedSupportLevel = supportLevel.split(" ")[0]; + transcript.getFlags().add("TSL:" + truncatedSupportLevel); + } + // 3. MANE Flag + String maneFlag = indexer.getMane(transcriptIdWithVersion, "flag"); + if (StringUtils.isNotEmpty(maneFlag)) { + transcript.getFlags().add(maneFlag); + } + // 4. 
LRG Flag + String lrg = indexer.getLrg(transcriptIdWithVersion, "ensembl"); + if (StringUtils.isNotEmpty(lrg)) { + transcript.getFlags().add("LRG"); + } else { + for (Xref xref : transcript.getXrefs()) { + if (xref.getId().startsWith("LRG_") && xref.getId().contains("t")) { + transcript.getFlags().add("LRG"); + } + } + } + // 5. Ensembl Canonical + String canonicalFlag = indexer.getCanonical(transcriptIdWithVersion); + if (StringUtils.isNotEmpty(canonicalFlag)) { + transcript.getFlags().add(canonicalFlag); + } + + // 6. TSO500 and EGLH HaemOnc + String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); + if (StringUtils.isNotEmpty(maneRefSeq)) { + String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); + if (StringUtils.isNotEmpty(tso500Flag)) { + transcript.getFlags().add(tso500Flag); + } + + String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); + if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { + transcript.getFlags().add(eglhHaemOncFlag); + } + } + + gene.getTranscripts().add(transcript); + + // Do not change order!! size()-1 is the index of the transcript ID + transcriptDict.put(transcriptId, gene.getTranscripts().size() - 1); + return transcript; + } + + private List getOntologyAnnotations(List xrefs, EnsemblGeneBuilderIndexer indexer) + throws IOException, RocksDBException { + if (xrefs == null || indexer == null) { + return null; + } + List annotations = new ArrayList<>(); + for (Xref xref : xrefs) { + if (xref.getDbName().equals("uniprotkb_acc")) { + String key = xref.getId(); + if (key != null && indexer.getOntologyAnnotations(key) != null) { + annotations.addAll(indexer.getOntologyAnnotations(key)); + } + } + } + return annotations; + } + + private void updateNegativeExonCodingData(Exon exon, int cdna, int cds, Gtf gtf) { + // we need to increment 3 nts, the stop_codon length. 
+ exon.setGenomicCodingStart(gtf.getStart()); + // cdnaCodingEnd points to the same base position than genomicCodingStart + exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined + // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding + // starts + if (exon.getGenomicCodingEnd() == 0) { + exon.setGenomicCodingEnd(exon.getGenomicCodingStart() + 2); + } + if (exon.getCdnaCodingStart() == 0) { + exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); + } + if (exon.getCdsStart() == 0) { + exon.setCdsStart(exon.getCdsEnd() - 2); + } + } + + private void updateStopCodingDataPositiveExon(Exon exon, int cdna, int cds, Gtf gtf) { + // we need to increment 3 nts, the stop_codon length. + exon.setGenomicCodingEnd(gtf.getEnd()); + exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined + // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding + // starts + if (exon.getGenomicCodingStart() == 0) { + exon.setGenomicCodingStart(exon.getGenomicCodingEnd() - 2); + } + if (exon.getCdnaCodingStart() == 0) { + exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); + } + if (exon.getCdsStart() == 0) { + exon.setCdsStart(exon.getCdsEnd() - 2); + } + } + + private void addGtfXrefs(Transcript transcript, Gene gene, Map gtfAttributes) { + if (transcript.getXrefs() == null) { + transcript.setXrefs(new ArrayList<>()); + } + + transcript.getXrefs().add(new Xref(gene.getId(), "ensembl_gene", "Ensembl Gene")); + transcript.getXrefs().add(new Xref(transcript.getId(), "ensembl_transcript", "Ensembl Transcript")); + + // Some non-coding genes do not have Gene names + if (StringUtils.isNotEmpty(gene.getName())) 
{ + transcript.getXrefs().add(new Xref(gene.getName(), "hgnc_symbol", "HGNC Symbol")); + transcript.getXrefs().add(new Xref(transcript.getName(), "ensembl_transcript_name", "Ensembl Transcript Name")); + } + + if (gtfAttributes.get("ccds_id") != null) { + transcript.getXrefs().add(new Xref(gtfAttributes.get("ccds_id"), "ccds_id", "CCDS")); + } + } + + private void initializePointers(Map>> gtfMap) { + geneCounter = 0; + geneList = new ArrayList<>(gtfMap.keySet()); + geneName = geneList.get(geneCounter); + transcriptCounter = 0; + transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); + transcriptName = transcriptList.get(transcriptCounter); + exonCounter = 0; + feature = "exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + + private Gtf getGTFEntry(GtfReader gtfReader, Map>> gtfMap) throws FileFormatException { + // Flexible parsing is deactivated, return next line + if (gtfMap == null) { + return gtfReader.read(); + // Flexible parsing activated, carefully select next line to return + } else { + // No more genes/features to return + if (nextGtfToReturn == null) { + return null; + } + Gtf gtfToReturn = nextGtfToReturn; + if (feature.equals("exon")) { +// gtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + if (gtfMap.get(geneName).get(transcriptName).containsKey("cds")) { + nextGtfToReturn = getExonCDSLine(((Gtf) ((List) gtfMap.get(geneName) + .get(transcriptName).get("exon")).get(exonCounter)).getStart(), + ((Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter)).getEnd(), + (List) gtfMap.get(geneName).get(transcriptName).get("cds")); + if (nextGtfToReturn != null) { + feature = "cds"; + return gtfToReturn; + } + } + // if no cds was found for this exon, get next exon + getFeatureFollowsExon(gtfMap); + return gtfToReturn; + } + if (feature.equals("cds") || feature.equals("stop_codon")) { + 
getFeatureFollowsExon(gtfMap); + return gtfToReturn; + } + if (feature.equals("start_codon")) { + feature = "stop_codon"; + nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("stop_codon"); + return gtfToReturn; + } + // The only accepted features that should appear in the gtfMap are exon, cds, start_codon and stop_codon + throw new FileFormatException("Execution cannot reach this point"); + } + } + + private Gtf getExonCDSLine(Integer exonStart, Integer exonEnd, List cdsList) { + for (Object cdsObject : cdsList) { + int cdsStart = ((Gtf) cdsObject).getStart(); + int cdsEnd = ((Gtf) cdsObject).getEnd(); + if (cdsStart <= exonEnd && cdsEnd >= exonStart) { + return (Gtf) cdsObject; + } + } + return null; + } + + private void getFeatureFollowsExon(Map>> gtfMap) { + exonCounter++; + if (exonCounter == ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).size() + || feature.equals("stop_codon")) { + // If last returned feature was a stop_codon or no start_codon is provided for this transcript, + // next transcript must be selected + if (!feature.equals("stop_codon") && gtfMap.get(geneName).get(transcriptName).containsKey("start_codon")) { + feature = "start_codon"; + nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("start_codon"); + } else { + transcriptCounter++; + // No more transcripts in this gene, check if there are more genes + if (transcriptCounter == gtfMap.get(geneName).size()) { + geneCounter++; + // No more genes available, end parsing + if (geneCounter == gtfMap.size()) { + nextGtfToReturn = null; + feature = null; + // Still more genes to parse, select next one + } else { + geneName = geneList.get(geneCounter); + transcriptCounter = 0; + transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); + } + } + // Check if a new gene was selected - null would indicate there're no more genes + if (nextGtfToReturn != null) { + transcriptName = transcriptList.get(transcriptCounter); + exonCounter = 0; + feature = 
"exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + } + } else { + feature = "exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + } + + private Map>> loadGTFMap(GtfReader gtfReader) throws FileFormatException { + Map>> gtfMap = new HashMap<>(); + Gtf gtf; + while ((gtf = gtfReader.read()) != null) { + if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") + || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { + continue; + } + + // Get GTF lines associated with this gene - create a new Map of GTF entries if it's a new gene + String geneId = gtf.getAttributes().get("gene_id"); + // Transcript -> feature -> GTF line + Map> gtfMapGeneEntry; + if (gtfMap.containsKey(geneId)) { + gtfMapGeneEntry = gtfMap.get(geneId); + } else { + gtfMapGeneEntry = new HashMap(); + gtfMap.put(geneId, gtfMapGeneEntry); + } + + // Get GTF lines associated with this transcript - create a new Map of GTF entries if it's a new gene + String transcriptId = gtf.getAttributes().get("transcript_id"); + Map gtfMapTranscriptEntry; + if (gtfMapGeneEntry.containsKey(transcriptId)) { + gtfMapTranscriptEntry = gtfMapGeneEntry.get(transcriptId); + } else { + gtfMapTranscriptEntry = new HashMap(); + gtfMapGeneEntry.put(transcriptId, gtfMapTranscriptEntry); + } + + addGTFLineToGTFMap(gtfMapTranscriptEntry, gtf); + + } + + // Exon number is mandatory for the parser to be able to properly generate the gene data model + if (!exonNumberPresent(gtfMap)) { + setExonNumber(gtfMap); + } + + return gtfMap; + } + + private boolean exonNumberPresent(Map>> gtfMap) { + Map> geneGtfMap = gtfMap.get(gtfMap.keySet().iterator().next()); + return ((Gtf) ((List) geneGtfMap.get(geneGtfMap.keySet().iterator().next()).get("exon")).get(0)) + .getAttributes().containsKey("exon_number"); + } + + private void setExonNumber(Map>> gtfMap) { + for 
(String gene : gtfMap.keySet()) { + for (String transcript : gtfMap.get(gene).keySet()) { + List exonList = (List) gtfMap.get(gene).get(transcript).get("exon"); + Collections.sort(exonList, (e1, e2) -> Integer.valueOf(e1.getStart()).compareTo(e2.getStart())); + if (exonList.get(0).getStrand().equals("+")) { + int exonNumber = 1; + for (Gtf gtf : exonList) { + gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); + exonNumber++; + } + } else { + int exonNumber = exonList.size(); + for (Gtf gtf : exonList) { + gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); + exonNumber--; + } + } + } + } + } + + private void addGTFLineToGTFMap(Map gtfMapTranscriptEntry, Gtf gtf) { + // Add exon/cds GTF line to the corresponding gene entry in the map + String featureType = gtf.getFeature().toLowerCase(); + if (featureType.equals("exon") || featureType.equals("cds")) { + List gtfList; + // Check if there were exons already stored + if (gtfMapTranscriptEntry.containsKey(featureType)) { + gtfList = (List) gtfMapTranscriptEntry.get(featureType); + } else { + gtfList = new ArrayList<>(); + gtfMapTranscriptEntry.put(featureType, gtfList); + } + gtfList.add(gtf); + // Only one start/stop codon can be stored per transcript - no need to check if the "start_codon"/"stop_codon" + // keys are already there + } else if (featureType.equals("start_codon") || featureType.equals("stop_codon")) { + gtfMapTranscriptEntry.put(featureType, gtf); + } + } + + private List getTranscriptTfbses(Gtf transcript, String chromosome, TabixReader tabixReader) throws IOException { + if (tabixReader == null) { + return null; + } + List transcriptTfbses = null; + + int transcriptStart = transcript.getStart(); + int transcriptEnd = transcript.getEnd(); + + + String line; + TabixReader.Iterator iter = tabixReader.query(chromosome, transcriptStart, transcriptEnd); + while ((line = iter.next()) != null) { + String[] elements = line.split("\t"); + + String sequenceName = elements[0]; + 
String source = elements[1]; + String feature = elements[2]; + int start = Integer.parseInt(elements[3]); + int end = Integer.parseInt(elements[4]); + String score = elements[5]; + String strand = elements[6]; + String frame = elements[7]; + String attribute = elements[8]; + + if (strand.equals(transcript.getStrand())) { + continue; + } + + if (transcript.getStrand().equals("+")) { + if (start > transcript.getStart() + 500) { + break; + } else if (end > transcript.getStart() - 2500) { + Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); + transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); + } + } else { + // transcript in negative strand + if (start > transcript.getEnd() + 2500) { + break; + } else if (start > transcript.getEnd() - 500) { + Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); + transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); + } + } + } + + return transcriptTfbses; + } + + protected List addTranscriptTfbstoList(Gff2 tfbs, Gtf transcript, String chromosome, + List transcriptTfbses) { + if (transcriptTfbses == null) { + transcriptTfbses = new ArrayList<>(); + } + + // binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116; + // stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB + String[] attributes = tfbs.getAttribute().split(";"); + + String id = null; + String pfmId = null; + List transciptionFactors = null; + + for (String attributePair : attributes) { + String[] attributePairArray = attributePair.split("="); + switch(attributePairArray[0]) { + case "binding_matrix_stable_id": + pfmId = attributePairArray[1]; + break; + case "stable_id": + id = attributePairArray[1]; + break; + case "transcription_factor_complex": + transciptionFactors = Arrays.asList(attributePairArray[1].split("(::)|(%2C)")); + break; 
+ default: + break; + } + } + + transcriptTfbses.add(new TranscriptTfbs(id, pfmId, tfbs.getFeature(), transciptionFactors, chromosome, tfbs.getStart(), + tfbs.getEnd(), getRelativeTranscriptTfbsStart(tfbs, transcript), getRelativeTranscriptTfbsEnd(tfbs, transcript), + Float.parseFloat(tfbs.getScore()))); + return transcriptTfbses; + } + + private Integer getRelativeTranscriptTfbsStart(Gff2 tfbs, Gtf transcript) { + Integer relativeStart; + if (transcript.getStrand().equals("+")) { + if (tfbs.getStart() < transcript.getStart()) { + relativeStart = tfbs.getStart() - transcript.getStart(); + } else { + relativeStart = tfbs.getStart() - transcript.getStart() + 1; + } + } else { + // negative strand transcript + if (tfbs.getEnd() > transcript.getEnd()) { + relativeStart = transcript.getEnd() - tfbs.getEnd(); + } else { + relativeStart = transcript.getEnd() - tfbs.getEnd() + 1; + } + } + return relativeStart; + } + + private Integer getRelativeTranscriptTfbsEnd(Gff2 tfbs, Gtf transcript) { + Integer relativeEnd; + if (transcript.getStrand().equals("+")) { + if (tfbs.getEnd() < transcript.getStart()) { + relativeEnd = tfbs.getEnd() - transcript.getStart(); + } else { + relativeEnd = tfbs.getEnd() - transcript.getStart() + 1; + } + } else { + if (tfbs.getStart() > transcript.getEnd()) { + relativeEnd = transcript.getEnd() - tfbs.getStart(); + } else { + relativeEnd = transcript.getEnd() - tfbs.getStart() + 1; + } + } + return relativeEnd; + } + + + + private boolean newGene(Gene previousGene, String newGeneId) { + return previousGene == null || !newGeneId.equals(previousGene.getId()); + } + + private void updateTranscriptAndGeneCoords(Transcript transcript, Gene gene, Gtf gtf) { + if (transcript.getStart() > gtf.getStart()) { + transcript.setStart(gtf.getStart()); + } + if (transcript.getEnd() < gtf.getEnd()) { + transcript.setEnd(gtf.getEnd()); + } + if (gene.getStart() > gtf.getStart()) { + gene.setStart(gtf.getStart()); + } + if (gene.getEnd() < gtf.getEnd()) { + 
gene.setEnd(gtf.getEnd()); + } + } + + private void getGtfFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { + gtfFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } + + private void getProteinFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".pep.all.fa") || fileName.endsWith(".pep.all.fa.gz")) { + proteinFastaFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } + + private void getCDnaFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".cdna.all.fa") || fileName.endsWith(".cdna.all.fa.gz")) { + cDnaFastaFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java index fb67c19b8b..10f54e2ea1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java @@ -16,27 +16,44 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.core.JsonProcessingException; import org.apache.commons.lang3.StringUtils; -import org.apache.poi.hssf.usermodel.HSSFSheet; -import org.apache.poi.hssf.usermodel.HSSFWorkbook; -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.opencb.biodata.formats.feature.mirbase.MirBaseParser; +import org.opencb.biodata.formats.feature.mirbase.MirBaseParserCallback; import org.opencb.biodata.formats.gaf.GafParser; import org.opencb.biodata.formats.io.FileFormatException; +import 
org.opencb.biodata.models.core.FeatureOntologyTermAnnotation; +import org.opencb.biodata.models.core.MiRnaGene; +import org.opencb.biodata.models.core.MirnaTarget; import org.opencb.biodata.models.core.Xref; -import org.opencb.biodata.models.core.*; -import org.opencb.biodata.models.variant.avro.*; +import org.opencb.biodata.models.variant.avro.Constraint; +import org.opencb.biodata.models.variant.avro.Expression; +import org.opencb.biodata.models.variant.avro.ExpressionCall; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; +import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.zip.GZIPInputStream; -public class EnsemblGeneBuilderIndexer extends GeneBuilderIndexer{ +import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; + +public class EnsemblGeneBuilderIndexer extends GeneBuilderIndexer { private static final String DESCRIPTION_SUFFIX = "_description"; private static final String XREF_SUFFIX = "_xref"; @@ -56,12 +73,12 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path Path proteinFastaFile, Path cDnaFastaFile, String species, Path geneExpressionFile, Path geneDrugFile, Path hpoFile, Path disgenetFile, Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneGensusFile, Path cancerHostpotFile, Path 
canonicalFile, Path tso500File, Path eglhHaemOncFile) - throws IOException, RocksDBException, FileFormatException { - indexDescriptions(geneDescriptionFile); - indexXrefs(xrefsFile, uniprotIdMappingFile); + throws IOException, RocksDBException, FileFormatException, CellBaseException { +// indexDescriptions(geneDescriptionFile); +// indexXrefs(xrefsFile, uniprotIdMappingFile); indexHgncIdMapping(hgncFile); - indexManeMapping(maneFile, "ensembl"); - indexLrgMapping(lrgFile, "ensembl"); + indexManeMapping(maneFile, ENSEMBL_DATA); + indexLrgMapping(lrgFile, ENSEMBL_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexExpression(species, geneExpressionFile); @@ -69,13 +86,13 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path indexDiseases(hpoFile, disgenetFile); indexConstraints(gnomadFile); indexOntologyAnnotations(geneOntologyAnnotationFile); - indexMiRBase(miRBaseFile); + indexMiRBase(species, miRBaseFile); indexMiRTarBase(miRTarBaseFile); - indexCancerGeneCensus(cancerGeneGensusFile); +// indexCancerGeneCensus(cancerGeneGensusFile); indexCancerHotspot(cancerHostpotFile); - indexCanonical(canonicalFile); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); +// indexCanonical(canonicalFile); +// indexTSO500(tso500File); +// indexEGLHHaemOnc(eglhHaemOncFile); } private void indexDescriptions(Path geneDescriptionFile) throws IOException, RocksDBException { @@ -233,129 +250,6 @@ public List getExpression(String id) throws RocksDBException, IOExce return rocksDbManager.getExpression(rocksdb, key); } - private void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { - if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { - logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); - BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); - - // Skip header - br.readLine(); - - int lineCounter = 1; - String line; - String 
currentGene = ""; - List drugs = new ArrayList<>(); - while ((line = br.readLine()) != null) { - String[] parts = line.split("\t"); - String geneName = parts[0]; - if (currentGene.equals("")) { - currentGene = geneName; - } else if (!currentGene.equals(geneName)) { - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - drugs = new ArrayList<>(); - currentGene = geneName; - } - - String source = null; - if (parts.length >= 4) { - source = parts[3]; - } - - String interactionType = null; - if (parts.length >= 5) { - interactionType = parts[4]; - } - - String drugName = null; - if (parts.length >= 8) { - // if drug name column is empty, use drug claim name instead - drugName = StringUtils.isEmpty(parts[7]) ? parts[6] : parts[7]; - } - if (StringUtils.isEmpty(drugName)) { - // no drug name - continue; - } - - String chemblId = null; - if (parts.length >= 9) { - chemblId = parts[8]; - } - - List publications = new ArrayList<>(); - if (parts.length >= 10 && parts[9] != null) { - publications = Arrays.asList(parts[9].split(",")); - } - - GeneDrugInteraction drug = new GeneDrugInteraction( - geneName, drugName, source, null, null, interactionType, chemblId, publications); - drugs.add(drug); - lineCounter++; - } - br.close(); - // update last gene - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - } else { - logger.warn("Gene drug file " + geneDrugFile + " not found"); - logger.warn("Ignoring " + geneDrugFile); - } - } - - public List getDrugs(String id) throws RocksDBException, IOException { - String key = id + DRUGS_SUFFIX; - return rocksDbManager.getDrugs(rocksdb, key); - } - - private void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { - Map> geneDiseaseAssociationMap = new HashMap<>(50000); - String line; - - if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { - // 
skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String omimId = fields[6]; - String geneSymbol = fields[3]; - String hpoId = fields[0]; - String diseaseName = fields[1]; - GeneTraitAssociation disease = - new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); - addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); - } - } - } - - if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath)) { - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Collections.singletonList(numberOfSNPs), Collections.singletonList(source), - "disgenet"); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); - } - } - } - - for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); - } - } - - public List getDiseases(String id) throws RocksDBException, IOException { - String key = id + DISEASE_SUFFIX; - return rocksDbManager.getDiseases(rocksdb, key); - } - private void indexConstraints(Path gnomadFile) throws IOException, RocksDBException { if (gnomadFile != null && Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { logger.info("Loading OE scores from '{}'", gnomadFile); @@ -384,7 +278,7 @@ private void indexConstraints(Path gnomadFile) 
throws IOException, RocksDBExcept rocksDbManager.update(rocksdb, transcriptIdentifier + CONSTRAINT_SUFFIX, constraints); if ("TRUE".equalsIgnoreCase(canonical)) { - rocksDbManager.update(rocksdb, geneIdentifier + CONSTRAINT_SUFFIX, constraints); + rocksDbManager.update(rocksdb, geneIdentifier + CONSTRAINT_SUFFIX, constraints); } } br.close(); @@ -432,66 +326,13 @@ public List getOntologyAnnotations(String id) thr return rocksDbManager.getOntologyAnnotations(rocksdb, key); } - private void indexMiRBase(Path miRBaseFile) throws IOException, RocksDBException { - if (miRBaseFile != null && Files.exists(miRBaseFile) && Files.size(miRBaseFile) > 0) { - logger.info("Loading mirna from '{}'", miRBaseFile); - FileInputStream fileInputStream = new FileInputStream(miRBaseFile.toFile()); - HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); - HSSFSheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - Iterator cellIterator = currentRow.iterator(); - - org.apache.poi.ss.usermodel.Cell cell = cellIterator.next(); - String miRBaseAccession = cell.getStringCellValue(); - - cell = cellIterator.next(); - String miRBaseID = cell.getStringCellValue(); - - cell = cellIterator.next(); - String status = cell.getStringCellValue(); - - cell = cellIterator.next(); - String sequence = cell.getStringCellValue(); + private void indexMiRBase(String species, Path miRBaseFile) throws IOException { + logger.info(PARSING_LOG_MESSAGE, miRBaseFile); - cell = cellIterator.next(); - String mature1Accession = cell.getStringCellValue(); + MirBaseCallback callback = new MirBaseCallback(rocksdb, rocksDbManager); + MirBaseParser.parse(miRBaseFile, species, callback); - cell = cellIterator.next(); - String mature1Id = cell.getStringCellValue(); - - cell = cellIterator.next(); - String mature1Sequence = cell.getStringCellValue(); - - String mature2Accession = ""; - String mature2Id = ""; - String mature2Sequence 
= ""; - if (cellIterator.hasNext()) { - cell = cellIterator.next(); - mature2Accession = cell.getStringCellValue(); - - cell = cellIterator.next(); - mature2Id = cell.getStringCellValue(); - - cell = cellIterator.next(); - mature2Sequence = cell.getStringCellValue(); - } - - MiRnaGene miRNAGene = new MiRnaGene(miRBaseAccession, miRBaseID, status, sequence, new ArrayList<>()); - int cdnaStart = sequence.indexOf(mature1Sequence); - int cdnaEnd = cdnaStart + mature1Sequence.length(); - miRNAGene.addMiRNAMature(mature1Accession, mature1Id, mature1Sequence, cdnaStart, cdnaEnd); - - cdnaStart = sequence.indexOf(mature2Sequence); - cdnaEnd = cdnaStart + mature2Sequence.length(); - miRNAGene.addMiRNAMature(mature2Accession, mature2Id, mature2Sequence, cdnaStart, cdnaEnd); - - rocksDbManager.update(rocksdb, miRBaseID + MIRBASE_SUFFIX, miRNAGene); - } - } else { - logger.error("mirna file not found"); - } + logger.info(PARSING_DONE_LOG_MESSAGE, miRBaseFile); } public MiRnaGene getMirnaGene(String transcriptId) throws RocksDBException, IOException { @@ -509,117 +350,11 @@ public MiRnaGene getMirnaGene(String transcriptId) throws RocksDBException, IOEx return null; } - private void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { - if (miRTarBaseFile != null && Files.exists(miRTarBaseFile) && Files.size(miRTarBaseFile) > 0) { - logger.info("Loading mirna targets from '{}'", miRTarBaseFile); - FileInputStream file = new FileInputStream(miRTarBaseFile.toFile()); - Workbook workbook = new XSSFWorkbook(file); - Sheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - String currentMiRTarBaseId = null; - String currentMiRNA = null; - String currentGene = null; - List targetGenes = new ArrayList<>(); - Map> geneToMirna = new HashMap<>(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - - Iterator cellIterator = currentRow.iterator(); - Cell cell = cellIterator.next(); - - // Iterate columns - String miRTarBaseId = 
cell.getStringCellValue(); - - // skip header - if (miRTarBaseId.startsWith("miRTarBase")) { - continue; - } - - if (currentMiRTarBaseId == null) { - currentMiRTarBaseId = miRTarBaseId; - } - - cell = cellIterator.next(); - String miRNA = cell.getStringCellValue(); - if (currentMiRNA == null) { - currentMiRNA = miRNA; - } - - // Skip species - cellIterator.next(); - - // Read target gene - cell = cellIterator.next(); - String geneName = cell.getStringCellValue(); - if (currentGene == null) { - currentGene = geneName; - } - - // Skip entrez gene - cellIterator.next(); - // Skip species - cellIterator.next(); - - if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { - // new entry, store current one - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - targetGenes = new ArrayList<>(); - currentGene = geneName; - currentMiRTarBaseId = miRTarBaseId; - currentMiRNA = miRNA; - } - - // experiment - cell = cellIterator.next(); - String experiment = cell.getStringCellValue(); - - // support type - cell = cellIterator.next(); - String supportType = cell.getStringCellValue(); - - // pubmed - cell = cellIterator.next(); - String pubmed; - // seems to vary, so check both - if (cell.getCellType().equals(CellType.NUMERIC)) { -// pubmed = String.valueOf(cell.getNumericCellValue()); - pubmed = Integer.toString(Double.valueOf(cell.getNumericCellValue()).intValue()); - } else { - pubmed = cell.getStringCellValue(); - } - - targetGenes.add(new TargetGene(experiment, supportType, pubmed)); - } - - // parse last entry - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - - for (Map.Entry> entry : geneToMirna.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); - } - } else { - 
logger.error("mirtarbase file not found"); - } - } - public List getMirnaTargets(String geneName) throws RocksDBException, IOException { String key = geneName + MIRTARBASE_SUFFIX; return rocksDbManager.getMirnaTargets(rocksdb, key); } - private static void addValueToMapElement(Map> map, String key, T value) { - if (map.containsKey(key)) { - map.get(key).add(value); - } else { - List valueList = new ArrayList<>(); - valueList.add(value); - map.put(key, valueList); - } - } - protected void indexCanonical(Path canonocalFile) throws IOException, RocksDBException { // Gene Transcript Canonical // ENSG00000210049.1 ENST00000387314.1 1 @@ -652,4 +387,30 @@ public String getCanonical(String transcriptId) throws RocksDBException, IOExcep } return new String(bytes); } + + // Implementation of the MirBaseParserCallback function + public class MirBaseCallback implements MirBaseParserCallback { + + private RocksDB rocksDB; + private RocksDbManager rocksDbManager; + private Logger logger; + + public MirBaseCallback(RocksDB rocksDB, RocksDbManager rocksDbManager) { + this.rocksDB = rocksDB; + this.rocksDbManager = rocksDbManager; + this.logger = LoggerFactory.getLogger(this.getClass()); + } + + @Override + public boolean processMiRnaGene(MiRnaGene miRnaGene) { + try { + rocksDbManager.update(rocksdb, miRnaGene.getId() + MIRBASE_SUFFIX, miRnaGene); + } catch (JsonProcessingException | RocksDBException e) { + logger.warn("Something wrong happened when processing miRNA gene {}: {}", miRnaGene.getId(), + StringUtils.join(e.getStackTrace(), "\t")); + return false; + } + return true; + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index cd0863a259..970f73e05a 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -16,904 +16,54 @@ package 
org.opencb.cellbase.lib.builders; -import htsjdk.tribble.readers.TabixReader; -import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.feature.gff.Gff2; -import org.opencb.biodata.formats.feature.gtf.Gtf; -import org.opencb.biodata.formats.feature.gtf.io.GtfReader; -import org.opencb.biodata.formats.io.FileFormatException; -import org.opencb.biodata.models.core.*; -import org.opencb.biodata.tools.sequence.FastaIndex; -import org.opencb.cellbase.core.ParamConstants; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.rocksdb.RocksDBException; +import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; -import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; -public class GeneBuilder extends CellBaseBuilder { +import static org.opencb.cellbase.lib.EtlCommons.*; - private Map transcriptDict; - private Map exonDict; +public class GeneBuilder extends CellBaseBuilder { - private Path gtfFile; - private Path proteinFastaFile; - private Path cDnaFastaFile; - private Path geneDescriptionFile; - private Path xrefsFile; - private Path hgncFile; - private Path maneFile; - private Path lrgFile; - private Path uniprotIdMappingFile; - private Path tfbsFile; - private Path tabixFile; - private Path geneExpressionFile; - private Path geneDrugFile; - private Path hpoFile; - private Path disgenetFile; - private Path genomeSequenceFilePath; - private Path gnomadFile; - private Path geneOntologyAnnotationFile; - private Path miRBaseFile; - private Path miRTarBaseFile; - private Path cancerGeneCensusFile; - private Path cancerHostpotFile; - private Path ensemblCanonicalFile; - private Path tso500File; - private Path eglhHaemOncFile; - private boolean flexibleGTFParsing; + private EnsemblGeneBuilder ensemblGeneBuilder; + private RefSeqGeneBuilder 
refSeqGeneBuilder; - // source for genes is either ensembl or refseq - private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); - private SpeciesConfiguration speciesConfiguration; + public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing) + throws CellBaseException { + super(null); - private int geneCounter; - private ArrayList geneList; - private String geneName; - private int transcriptCounter; - private ArrayList transcriptList; - private String transcriptName; - private int exonCounter; - private String feature; - private Gtf nextGtfToReturn; + // Create Ensembl gene builder + CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(ENSEMBL_DATA), + ENSEMBL_GENE_BASENAME); + this.ensemblGeneBuilder = new EnsemblGeneBuilder(downloadPath.resolve(ENSEMBL_DATA), speciesConfiguration, flexibleGTFParsing, + ensemblGeneSerializer); - public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, SpeciesConfiguration speciesConfiguration, - CellBaseSerializer serializer) throws CellBaseException { - this(geneDirectoryPath, genomeSequenceFastaFile, speciesConfiguration, false, serializer); + // Create RefSeq gene builder + CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(REFSEQ_DATA), + REFSEQ_GENE_BASENAME); + this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, refSeqGeneSerializer); } - public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, SpeciesConfiguration speciesConfiguration, - boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException { - this(null, geneDirectoryPath.resolve("description.txt"), - geneDirectoryPath.resolve("xrefs.txt"), - geneDirectoryPath.resolve("hgnc_complete_set_2023-11-01.txt"), - geneDirectoryPath.resolve("MANE.GRCh38.v1.1.summary.txt.gz"), - 
geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"), - geneDirectoryPath.resolve("idmapping_selected.tab.gz"), - geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"), - geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz.tbi"), - geneDirectoryPath.resolve("allgenes_updown_in_organism_part.tab.gz"), - geneDirectoryPath.resolve("dgidb.tsv"), - geneDirectoryPath.resolve("phenotype_to_genes.txt"), - geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"), - geneDirectoryPath.resolve("gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz"), - geneDirectoryPath.resolve("goa_human.gaf.gz"), - geneDirectoryPath.getParent().resolve("regulation/miRNA.xls"), - geneDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"), - geneDirectoryPath.resolve("cancer-gene-census.tsv"), - geneDirectoryPath.resolve("hotspots_v2.xls"), - geneDirectoryPath.resolve("ensembl_canonical.txt"), - geneDirectoryPath.resolve("TSO500_transcripts.txt"), - geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"), - genomeSequenceFastaFile, - speciesConfiguration, flexibleGTFParsing, serializer); + public void check() throws Exception { + // Check Ensembl requirements + ensemblGeneBuilder.check(); - getGtfFileFromGeneDirectoryPath(geneDirectoryPath); - getProteinFastaFileFromGeneDirectoryPath(geneDirectoryPath); - getCDnaFastaFileFromGeneDirectoryPath(geneDirectoryPath); - } - - public GeneBuilder(Path gtfFile, Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path maneFile, - Path lrgFile, Path uniprotIdMappingFile, Path tfbsFile, Path tabixFile, Path geneExpressionFile, - Path geneDrugFile, Path hpoFile, Path disgenetFile, Path gnomadFile, - Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneCensusFile, - Path cancerHostpotFile, Path ensemblCanonicalFile, Path tso500File, Path eglhHaemOncFile, - Path genomeSequenceFilePath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, - 
CellBaseSerializer serializer) { - super(serializer); - - this.gtfFile = gtfFile; - this.geneDescriptionFile = geneDescriptionFile; - this.xrefsFile = xrefsFile; - this.hgncFile = hgncFile; - this.maneFile = maneFile; - this.lrgFile = lrgFile; - this.uniprotIdMappingFile = uniprotIdMappingFile; - this.tfbsFile = tfbsFile; - this.tabixFile = tabixFile; - this.geneExpressionFile = geneExpressionFile; - this.geneDrugFile = geneDrugFile; - this.hpoFile = hpoFile; - this.disgenetFile = disgenetFile; - this.gnomadFile = gnomadFile; - this.geneOntologyAnnotationFile = geneOntologyAnnotationFile; - this.miRBaseFile = miRBaseFile; - this.miRTarBaseFile = miRTarBaseFile; - this.cancerGeneCensusFile = cancerGeneCensusFile; - this.cancerHostpotFile = cancerHostpotFile; - this.ensemblCanonicalFile = ensemblCanonicalFile; - this.tso500File = tso500File; - this.eglhHaemOncFile = eglhHaemOncFile; - this.genomeSequenceFilePath = genomeSequenceFilePath; - this.speciesConfiguration = speciesConfiguration; - this.flexibleGTFParsing = flexibleGTFParsing; - - transcriptDict = new HashMap<>(250000); - exonDict = new HashMap<>(8000000); + // Check RefSeq requirements + refSeqGeneBuilder.check(); } + @Override public void parse() throws Exception { - Gene gene = null; - Transcript transcript; - Exon exon = null; - int cdna = 1; - int cds = 1; - EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(gtfFile.getParent()); - - try { - // process files and put values in rocksdb - indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, - proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, - geneDrugFile, hpoFile, disgenetFile, gnomadFile, geneOntologyAnnotationFile, miRBaseFile, - miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile, - tso500File, eglhHaemOncFile); - - TabixReader tabixReader = null; - if (!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { - 
logger.error("Tfbs or tabix file not found. Download them and try again."); - } else { - tabixReader = new TabixReader(tfbsFile.toAbsolutePath().toString(), tabixFile.toAbsolutePath().toString()); - } - - // Preparing the fasta file for fast accessing -// System.out.println("genomeSequenceFilePath.toString() = " + genomeSequenceFilePath.toString()); - FastaIndex fastaIndex = new FastaIndex(genomeSequenceFilePath); - - // Empty transcript and exon dictionaries - transcriptDict.clear(); - exonDict.clear(); - logger.info("Parsing gtf..."); - GtfReader gtfReader = new GtfReader(gtfFile); - - // Gene->Transcript->Feature->GTF line - Map>> gtfMap = null; - if (flexibleGTFParsing) { - gtfMap = loadGTFMap(gtfReader); - initializePointers(gtfMap); - } - - Gtf gtf; - while ((gtf = getGTFEntry(gtfReader, gtfMap)) != null) { - - if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") - || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { - continue; - } - - String geneId = gtf.getAttributes().get("gene_id"); - String transcriptId = gtf.getAttributes().get("transcript_id"); - String geneName = gtf.getAttributes().get("gene_name"); - if (newGene(gene, geneId)) { - // If new geneId is different from the current then we must serialize before data new gene - if (gene != null) { - serializer.serialize(gene); - } - - GeneAnnotation geneAnnotation = new GeneAnnotation(indexer.getExpression(geneId), indexer.getDiseases(geneName), - indexer.getDrugs(geneName), indexer.getConstraints(geneId), indexer.getMirnaTargets(geneName), - indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); - - gene = new Gene(geneId, geneName, gtf.getSequenceName().replaceFirst("chr", ""), - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), gtf.getAttributes().get("gene_version"), - gtf.getAttributes().get("gene_biotype"), "KNOWN", SOURCE, indexer.getDescription(geneId), - new ArrayList<>(), indexer.getMirnaGene(transcriptId), geneAnnotation); 
- } - - // Check if Transcript exist in the Gene Set of transcripts - if (!transcriptDict.containsKey(transcriptId)) { - transcript = getTranscript(gene, indexer, tabixReader, gtf, transcriptId); - } else { - transcript = gene.getTranscripts().get(transcriptDict.get(transcriptId)); - } - - // At this point gene and transcript objects are set up - // Update gene and transcript genomic coordinates, start must be the - // lower, and end the higher - updateTranscriptAndGeneCoords(transcript, gene, gtf); - - String transcriptIdWithoutVersion = transcript.getId().split("\\.")[0]; - if (gtf.getFeature().equalsIgnoreCase("exon")) { - // Obtaining the exon sequence - String exonId = gtf.getAttributes().get("exon_id") + "." + gtf.getAttributes().get("exon_version"); - String exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); - - exon = new Exon(exonId, gtf.getSequenceName().replaceFirst("chr", ""), - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), 0, 0, 0, 0, 0, 0, -1, Integer.parseInt(gtf - .getAttributes().get("exon_number")), exonSequence); - transcript.getExons().add(exon); - - exonDict.put(transcriptIdWithoutVersion + "_" + exon.getExonNumber(), exon); - if (gtf.getAttributes().get("exon_number").equals("1")) { - cdna = 1; - cds = 1; - } else { - // with every exon we update cDNA length with the previous exon length - cdna += exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getEnd() - - exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getStart() + 1; - } - } else { - exon = exonDict.get(transcriptIdWithoutVersion + "_" + exon.getExonNumber()); - if (gtf.getFeature().equalsIgnoreCase("CDS")) { - // Protein ID is only present in CDS lines - String proteinId = gtf.getAttributes().get("protein_id") != null - ? gtf.getAttributes().get("protein_id") + "." 
+ gtf.getAttributes().get("protein_version") - : ""; - transcript.setProteinId(proteinId); - transcript.setProteinSequence(indexer.getProteinFasta(proteinId)); - - if (gtf.getStrand().equals("+") || gtf.getStrand().equals("1")) { - // CDS states the beginning of coding start - exon.setGenomicCodingStart(gtf.getStart()); - exon.setGenomicCodingEnd(gtf.getEnd()); - - // cDNA coordinates - exon.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); - exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - // Set cdnaCodingEnd to prevent those cases without stop_codon - - transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - exon.setCdsStart(cds); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // increment in the coding length - cds += gtf.getEnd() - gtf.getStart() + 1; - transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon - - exon.setPhase(Integer.parseInt(gtf.getFrame())); - - if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { - transcript.setGenomicCodingStart(gtf.getStart()); - } - if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { - transcript.setGenomicCodingEnd(gtf.getEnd()); - } - // only first time - if (transcript.getCdnaCodingStart() == 0) { - transcript.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); - } - // strand - - } else { - // CDS states the beginning of coding start - exon.setGenomicCodingStart(gtf.getStart()); - exon.setGenomicCodingEnd(gtf.getEnd()); - // cDNA coordinates - // cdnaCodingStart points to the same base position than genomicCodingEnd - exon.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); - // cdnaCodingEnd points to the same base position than genomicCodingStart - exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - // Set cdnaCodingEnd to prevent those cases without stop_codon - transcript.setCdnaCodingEnd(exon.getEnd() - 
gtf.getStart() + cdna); - exon.setCdsStart(cds); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // increment in the coding length - cds += gtf.getEnd() - gtf.getStart() + 1; - transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon - exon.setPhase(Integer.parseInt(gtf.getFrame())); - - if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { - transcript.setGenomicCodingStart(gtf.getStart()); - } - if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { - transcript.setGenomicCodingEnd(gtf.getEnd()); - } - // only first time - if (transcript.getCdnaCodingStart() == 0) { - // cdnaCodingStart points to the same base position than genomicCodingEnd - transcript.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); - } - } - - } -// if (gtf.getFeature().equalsIgnoreCase("start_codon")) { -// // nothing to do -// System.out.println("Empty block, this should be redesigned"); -// } - if (gtf.getFeature().equalsIgnoreCase("stop_codon")) { - // setCdnaCodingEnd = false; // stop_codon found, cdnaCodingEnd will be set here, - // no need to set it at the beginning of next feature - if (exon.getStrand().equals("+")) { - updateStopCodingDataPositiveExon(exon, cdna, cds, gtf); - - cds += gtf.getEnd() - gtf.getStart(); - // If stop_codon appears, overwrite values - transcript.setGenomicCodingEnd(gtf.getEnd()); - transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - transcript.setCdsLength(cds - 1); - - } else { - updateNegativeExonCodingData(exon, cdna, cds, gtf); - - cds += gtf.getEnd() - gtf.getStart(); - // If stop_codon appears, overwrite values - transcript.setGenomicCodingStart(gtf.getStart()); - // cdnaCodingEnd points to the same base position than genomicCodingStart - transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - transcript.setCdsLength(cds - 1); - } - } - } - } - - // last gene must be 
serialized - serializer.serialize(gene); - - // cleaning - gtfReader.close(); - serializer.close(); - fastaIndex.close(); - indexer.close(); - } catch (Exception e) { - indexer.close(); - throw e; - } - } - - private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, TabixReader tabixReader, Gtf gtf, String transcriptId) - throws IOException, RocksDBException { - Map gtfAttributes = gtf.getAttributes(); - - // To match Ensembl, we set the ID as transcript+version. This also matches the Ensembl website. - String transcriptIdWithVersion = transcriptId + "." + gtfAttributes.get("transcript_version"); - String biotype = gtfAttributes.get("transcript_biotype") != null ? gtfAttributes.get("transcript_biotype") : ""; - String transcriptChromosome = gtf.getSequenceName().replaceFirst("chr", ""); - List transcriptTfbses = getTranscriptTfbses(gtf, transcriptChromosome, tabixReader); - - List ontologyAnnotations = getOntologyAnnotations(indexer.getXrefs(transcriptId), indexer); - TranscriptAnnotation transcriptAnnotation = new TranscriptAnnotation(ontologyAnnotations, indexer.getConstraints(transcriptId)); - - Transcript transcript = new Transcript(transcriptIdWithVersion, gtfAttributes.get("transcript_name"), transcriptChromosome, - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, "KNOWN", - 0, 0, 0, 0, 0, - indexer.getCdnaFasta(transcriptIdWithVersion), "", "", "", - gtfAttributes.get("transcript_version"), SOURCE, new ArrayList<>(), indexer.getXrefs(transcriptId), transcriptTfbses, - new HashSet<>(), transcriptAnnotation); - - // Adding Ids appearing in the GTF to the xrefs is required, since for some unknown reason the ENSEMBL - // Perl API often doesn't return all genes resulting in an incomplete xrefs.txt file. 
We must ensure - // that the xrefs array contains all ids present in the GTF file - addGtfXrefs(transcript, gene, gtfAttributes); - - // Add HGNC ID mappings, with this we can know which Ensembl and Refseq transcripts match to HGNC ID - String hgncId = indexer.getHgncId(gene.getName()); - if (StringUtils.isNotEmpty(hgncId)) { - transcript.getXrefs().add(new Xref(hgncId, "hgnc_id", "HGNC ID")); - } - - // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE - for (String suffix: Arrays.asList("refseq", "refseq_protein")) { - String maneRefSeq = indexer.getMane(transcriptIdWithVersion, suffix); - if (StringUtils.isNotEmpty(maneRefSeq)) { - transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, - "MANE Select RefSeq" + (suffix.contains("_") ? " Protein" : ""))); - } - } - - // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG - String lrgRefSeq = indexer.getLrg(transcriptIdWithVersion, "refseq"); - if (StringUtils.isNotEmpty(lrgRefSeq)) { - transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_refseq", "LRG RefSeq")); - } - - // Add Flags - // 1. GTF tags - String tags = gtf.getAttributes().get("tag"); - if (StringUtils.isNotEmpty(tags)) { - transcript.getFlags().addAll(Arrays.asList(tags.split(","))); - } - // 2. TSL - String supportLevel = gtfAttributes.get("transcript_support_level"); - if (StringUtils.isNotEmpty(supportLevel)) { - // split on space so "5 (assigned to previous version 3)" and "5" both become "TSL:5" - String truncatedSupportLevel = supportLevel.split(" ")[0]; - transcript.getFlags().add("TSL:" + truncatedSupportLevel); - } - // 3. MANE Flag - String maneFlag = indexer.getMane(transcriptIdWithVersion, "flag"); - if (StringUtils.isNotEmpty(maneFlag)) { - transcript.getFlags().add(maneFlag); - } - // 4. 
LRG Flag - String lrg = indexer.getLrg(transcriptIdWithVersion, "ensembl"); - if (StringUtils.isNotEmpty(lrg)) { - transcript.getFlags().add("LRG"); - } else { - for (Xref xref : transcript.getXrefs()) { - if (xref.getId().startsWith("LRG_") && xref.getId().contains("t")) { - transcript.getFlags().add("LRG"); - } - } - } - // 5. Ensembl Canonical - String canonicalFlag = indexer.getCanonical(transcriptIdWithVersion); - if (StringUtils.isNotEmpty(canonicalFlag)) { - transcript.getFlags().add(canonicalFlag); - } - - // 6. TSO500 and EGLH HaemOnc - String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); - if (StringUtils.isNotEmpty(maneRefSeq)) { - String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(tso500Flag)) { - transcript.getFlags().add(tso500Flag); - } - - String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { - transcript.getFlags().add(eglhHaemOncFlag); - } - } - - gene.getTranscripts().add(transcript); - - // Do not change order!! size()-1 is the index of the transcript ID - transcriptDict.put(transcriptId, gene.getTranscripts().size() - 1); - return transcript; - } - - private List getOntologyAnnotations(List xrefs, EnsemblGeneBuilderIndexer indexer) - throws IOException, RocksDBException { - if (xrefs == null || indexer == null) { - return null; - } - List annotations = new ArrayList<>(); - for (Xref xref : xrefs) { - if (xref.getDbName().equals("uniprotkb_acc")) { - String key = xref.getId(); - if (key != null && indexer.getOntologyAnnotations(key) != null) { - annotations.addAll(indexer.getOntologyAnnotations(key)); - } - } - } - return annotations; - } - - private void updateNegativeExonCodingData(Exon exon, int cdna, int cds, Gtf gtf) { - // we need to increment 3 nts, the stop_codon length. 
- exon.setGenomicCodingStart(gtf.getStart()); - // cdnaCodingEnd points to the same base position than genomicCodingStart - exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined - // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding - // starts - if (exon.getGenomicCodingEnd() == 0) { - exon.setGenomicCodingEnd(exon.getGenomicCodingStart() + 2); - } - if (exon.getCdnaCodingStart() == 0) { - exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); - } - if (exon.getCdsStart() == 0) { - exon.setCdsStart(exon.getCdsEnd() - 2); - } - } - - private void updateStopCodingDataPositiveExon(Exon exon, int cdna, int cds, Gtf gtf) { - // we need to increment 3 nts, the stop_codon length. - exon.setGenomicCodingEnd(gtf.getEnd()); - exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + logger.info(BUILDING_LOG_MESSAGE, getDataName(GENE_DATA)); - // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined - // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding - // starts - if (exon.getGenomicCodingStart() == 0) { - exon.setGenomicCodingStart(exon.getGenomicCodingEnd() - 2); - } - if (exon.getCdnaCodingStart() == 0) { - exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); - } - if (exon.getCdsStart() == 0) { - exon.setCdsStart(exon.getCdsEnd() - 2); - } - } - - private void addGtfXrefs(Transcript transcript, Gene gene, Map gtfAttributes) { - if (transcript.getXrefs() == null) { - transcript.setXrefs(new ArrayList<>()); - } - - transcript.getXrefs().add(new Xref(gene.getId(), "ensembl_gene", "Ensembl Gene")); - transcript.getXrefs().add(new Xref(transcript.getId(), "ensembl_transcript", "Ensembl Transcript")); - - // Some non-coding genes do not 
have Gene names - if (StringUtils.isNotEmpty(gene.getName())) { - transcript.getXrefs().add(new Xref(gene.getName(), "hgnc_symbol", "HGNC Symbol")); - transcript.getXrefs().add(new Xref(transcript.getName(), "ensembl_transcript_name", "Ensembl Transcript Name")); - } - - if (gtfAttributes.get("ccds_id") != null) { - transcript.getXrefs().add(new Xref(gtfAttributes.get("ccds_id"), "ccds_id", "CCDS")); - } - } - - private void initializePointers(Map>> gtfMap) { - geneCounter = 0; - geneList = new ArrayList<>(gtfMap.keySet()); - geneName = geneList.get(geneCounter); - transcriptCounter = 0; - transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); - transcriptName = transcriptList.get(transcriptCounter); - exonCounter = 0; - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - - private Gtf getGTFEntry(GtfReader gtfReader, Map>> gtfMap) throws FileFormatException { - // Flexible parsing is deactivated, return next line - if (gtfMap == null) { - return gtfReader.read(); - // Flexible parsing activated, carefully select next line to return - } else { - // No more genes/features to return - if (nextGtfToReturn == null) { - return null; - } - Gtf gtfToReturn = nextGtfToReturn; - if (feature.equals("exon")) { -// gtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - if (gtfMap.get(geneName).get(transcriptName).containsKey("cds")) { - nextGtfToReturn = getExonCDSLine(((Gtf) ((List) gtfMap.get(geneName) - .get(transcriptName).get("exon")).get(exonCounter)).getStart(), - ((Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter)).getEnd(), - (List) gtfMap.get(geneName).get(transcriptName).get("cds")); - if (nextGtfToReturn != null) { - feature = "cds"; - return gtfToReturn; - } - } - // if no cds was found for this exon, get next exon - getFeatureFollowsExon(gtfMap); - return gtfToReturn; - } - if 
(feature.equals("cds") || feature.equals("stop_codon")) { - getFeatureFollowsExon(gtfMap); - return gtfToReturn; - } - if (feature.equals("start_codon")) { - feature = "stop_codon"; - nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("stop_codon"); - return gtfToReturn; - } - // The only accepted features that should appear in the gtfMap are exon, cds, start_codon and stop_codon - throw new FileFormatException("Execution cannot reach this point"); - } - } - - private Gtf getExonCDSLine(Integer exonStart, Integer exonEnd, List cdsList) { - for (Object cdsObject : cdsList) { - int cdsStart = ((Gtf) cdsObject).getStart(); - int cdsEnd = ((Gtf) cdsObject).getEnd(); - if (cdsStart <= exonEnd && cdsEnd >= exonStart) { - return (Gtf) cdsObject; - } - } - return null; - } - - private void getFeatureFollowsExon(Map>> gtfMap) { - exonCounter++; - if (exonCounter == ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).size() - || feature.equals("stop_codon")) { - // If last returned feature was a stop_codon or no start_codon is provided for this transcript, - // next transcript must be selected - if (!feature.equals("stop_codon") && gtfMap.get(geneName).get(transcriptName).containsKey("start_codon")) { - feature = "start_codon"; - nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("start_codon"); - } else { - transcriptCounter++; - // No more transcripts in this gene, check if there are more genes - if (transcriptCounter == gtfMap.get(geneName).size()) { - geneCounter++; - // No more genes available, end parsing - if (geneCounter == gtfMap.size()) { - nextGtfToReturn = null; - feature = null; - // Still more genes to parse, select next one - } else { - geneName = geneList.get(geneCounter); - transcriptCounter = 0; - transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); - } - } - // Check if a new gene was selected - null would indicate there're no more genes - if (nextGtfToReturn != null) { - transcriptName = 
transcriptList.get(transcriptCounter); - exonCounter = 0; - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - } - } else { - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - } - - private Map>> loadGTFMap(GtfReader gtfReader) throws FileFormatException { - Map>> gtfMap = new HashMap<>(); - Gtf gtf; - while ((gtf = gtfReader.read()) != null) { - if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") - || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { - continue; - } - - // Get GTF lines associated with this gene - create a new Map of GTF entries if it's a new gene - String geneId = gtf.getAttributes().get("gene_id"); - // Transcript -> feature -> GTF line - Map> gtfMapGeneEntry; - if (gtfMap.containsKey(geneId)) { - gtfMapGeneEntry = gtfMap.get(geneId); - } else { - gtfMapGeneEntry = new HashMap(); - gtfMap.put(geneId, gtfMapGeneEntry); - } - - // Get GTF lines associated with this transcript - create a new Map of GTF entries if it's a new gene - String transcriptId = gtf.getAttributes().get("transcript_id"); - Map gtfMapTranscriptEntry; - if (gtfMapGeneEntry.containsKey(transcriptId)) { - gtfMapTranscriptEntry = gtfMapGeneEntry.get(transcriptId); - } else { - gtfMapTranscriptEntry = new HashMap(); - gtfMapGeneEntry.put(transcriptId, gtfMapTranscriptEntry); - } - - addGTFLineToGTFMap(gtfMapTranscriptEntry, gtf); - - } - - // Exon number is mandatory for the parser to be able to properly generate the gene data model - if (!exonNumberPresent(gtfMap)) { - setExonNumber(gtfMap); - } - - return gtfMap; - } - - private boolean exonNumberPresent(Map>> gtfMap) { - Map> geneGtfMap = gtfMap.get(gtfMap.keySet().iterator().next()); - return ((Gtf) ((List) geneGtfMap.get(geneGtfMap.keySet().iterator().next()).get("exon")).get(0)) - 
.getAttributes().containsKey("exon_number"); - } - - private void setExonNumber(Map>> gtfMap) { - for (String gene : gtfMap.keySet()) { - for (String transcript : gtfMap.get(gene).keySet()) { - List exonList = (List) gtfMap.get(gene).get(transcript).get("exon"); - Collections.sort(exonList, (e1, e2) -> Integer.valueOf(e1.getStart()).compareTo(e2.getStart())); - if (exonList.get(0).getStrand().equals("+")) { - int exonNumber = 1; - for (Gtf gtf : exonList) { - gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); - exonNumber++; - } - } else { - int exonNumber = exonList.size(); - for (Gtf gtf : exonList) { - gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); - exonNumber--; - } - } - } - } - } - - private void addGTFLineToGTFMap(Map gtfMapTranscriptEntry, Gtf gtf) { - // Add exon/cds GTF line to the corresponding gene entry in the map - String featureType = gtf.getFeature().toLowerCase(); - if (featureType.equals("exon") || featureType.equals("cds")) { - List gtfList; - // Check if there were exons already stored - if (gtfMapTranscriptEntry.containsKey(featureType)) { - gtfList = (List) gtfMapTranscriptEntry.get(featureType); - } else { - gtfList = new ArrayList<>(); - gtfMapTranscriptEntry.put(featureType, gtfList); - } - gtfList.add(gtf); - // Only one start/stop codon can be stored per transcript - no need to check if the "start_codon"/"stop_codon" - // keys are already there - } else if (featureType.equals("start_codon") || featureType.equals("stop_codon")) { - gtfMapTranscriptEntry.put(featureType, gtf); - } - } + // Check folders and files before building + check(); - private List getTranscriptTfbses(Gtf transcript, String chromosome, TabixReader tabixReader) throws IOException { - if (tabixReader == null) { - return null; - } - List transcriptTfbses = null; - - int transcriptStart = transcript.getStart(); - int transcriptEnd = transcript.getEnd(); - - - String line; - TabixReader.Iterator iter = tabixReader.query(chromosome, 
transcriptStart, transcriptEnd); - while ((line = iter.next()) != null) { - String[] elements = line.split("\t"); - - String sequenceName = elements[0]; - String source = elements[1]; - String feature = elements[2]; - int start = Integer.parseInt(elements[3]); - int end = Integer.parseInt(elements[4]); - String score = elements[5]; - String strand = elements[6]; - String frame = elements[7]; - String attribute = elements[8]; - - if (strand.equals(transcript.getStrand())) { - continue; - } - - if (transcript.getStrand().equals("+")) { - if (start > transcript.getStart() + 500) { - break; - } else if (end > transcript.getStart() - 2500) { - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); - transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); - } - } else { - // transcript in negative strand - if (start > transcript.getEnd() + 2500) { - break; - } else if (start > transcript.getEnd() - 500) { - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); - transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); - } - } - } - - return transcriptTfbses; - } - - protected List addTranscriptTfbstoList(Gff2 tfbs, Gtf transcript, String chromosome, - List transcriptTfbses) { - if (transcriptTfbses == null) { - transcriptTfbses = new ArrayList<>(); - } - - // binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116; - // stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB - String[] attributes = tfbs.getAttribute().split(";"); - - String id = null; - String pfmId = null; - List transciptionFactors = null; - - for (String attributePair : attributes) { - String[] attributePairArray = attributePair.split("="); - switch(attributePairArray[0]) { - case "binding_matrix_stable_id": - pfmId = attributePairArray[1]; - break; - case "stable_id": - id = 
attributePairArray[1]; - break; - case "transcription_factor_complex": - transciptionFactors = Arrays.asList(attributePairArray[1].split("(::)|(%2C)")); - break; - default: - break; - } - } - - transcriptTfbses.add(new TranscriptTfbs(id, pfmId, tfbs.getFeature(), transciptionFactors, chromosome, tfbs.getStart(), - tfbs.getEnd(), getRelativeTranscriptTfbsStart(tfbs, transcript), getRelativeTranscriptTfbsEnd(tfbs, transcript), - Float.parseFloat(tfbs.getScore()))); - return transcriptTfbses; - } - - private Integer getRelativeTranscriptTfbsStart(Gff2 tfbs, Gtf transcript) { - Integer relativeStart; - if (transcript.getStrand().equals("+")) { - if (tfbs.getStart() < transcript.getStart()) { - relativeStart = tfbs.getStart() - transcript.getStart(); - } else { - relativeStart = tfbs.getStart() - transcript.getStart() + 1; - } - } else { - // negative strand transcript - if (tfbs.getEnd() > transcript.getEnd()) { - relativeStart = transcript.getEnd() - tfbs.getEnd(); - } else { - relativeStart = transcript.getEnd() - tfbs.getEnd() + 1; - } - } - return relativeStart; - } - - private Integer getRelativeTranscriptTfbsEnd(Gff2 tfbs, Gtf transcript) { - Integer relativeEnd; - if (transcript.getStrand().equals("+")) { - if (tfbs.getEnd() < transcript.getStart()) { - relativeEnd = tfbs.getEnd() - transcript.getStart(); - } else { - relativeEnd = tfbs.getEnd() - transcript.getStart() + 1; - } - } else { - if (tfbs.getStart() > transcript.getEnd()) { - relativeEnd = transcript.getEnd() - tfbs.getStart(); - } else { - relativeEnd = transcript.getEnd() - tfbs.getStart() + 1; - } - } - return relativeEnd; - } - - - - private boolean newGene(Gene previousGene, String newGeneId) { - return previousGene == null || !newGeneId.equals(previousGene.getId()); - } - - private void updateTranscriptAndGeneCoords(Transcript transcript, Gene gene, Gtf gtf) { - if (transcript.getStart() > gtf.getStart()) { - transcript.setStart(gtf.getStart()); - } - if (transcript.getEnd() < gtf.getEnd()) { - 
transcript.setEnd(gtf.getEnd()); - } - if (gene.getStart() > gtf.getStart()) { - gene.setStart(gtf.getStart()); - } - if (gene.getEnd() < gtf.getEnd()) { - gene.setEnd(gtf.getEnd()); - } - } - - private void getGtfFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { - gtfFile = geneDirectoryPath.resolve(fileName); - break; - } - } - } - - private void getProteinFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".pep.all.fa") || fileName.endsWith(".pep.all.fa.gz")) { - proteinFastaFile = geneDirectoryPath.resolve(fileName); - break; - } - } - } + // Build Ensembl/RefSeq genes + ensemblGeneBuilder.parse(); + refSeqGeneBuilder.parse(); - private void getCDnaFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".cdna.all.fa") || fileName.endsWith(".cdna.all.fa.gz")) { - cDnaFastaFile = geneDirectoryPath.resolve(fileName); - break; - } - } + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java index 285236ba60..b8941cc448 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java @@ -24,9 +24,10 @@ import org.opencb.biodata.formats.sequence.fasta.Fasta; import org.opencb.biodata.formats.sequence.fasta.io.FastaReader; import org.opencb.biodata.models.clinical.ClinicalProperty; -import org.opencb.biodata.models.core.CancerHotspot; -import org.opencb.biodata.models.core.CancerHotspotVariant; -import 
org.opencb.biodata.models.core.GeneCancerAssociation; +import org.opencb.biodata.models.core.*; +import org.opencb.biodata.models.variant.avro.GeneDrugInteraction; +import org.opencb.biodata.models.variant.avro.GeneTraitAssociation; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; @@ -42,8 +43,14 @@ import java.util.*; import java.util.stream.Collectors; +import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; + public class GeneBuilderIndexer { + public static final String ROCKSDB_FOLDER = "rocksdb.idx"; + protected RocksDB rocksdb; protected RocksDbManager rocksDbManager; protected Logger logger; @@ -69,7 +76,7 @@ public GeneBuilderIndexer(Path genePath) { private void init(Path genePath) { rocksDbManager = new RocksDbManager(); - dbLocation = genePath.resolve("integration.idx").toString(); + dbLocation = genePath.resolve(ROCKSDB_FOLDER).toString(); rocksdb = rocksDbManager.getDBConnection(dbLocation); dbOption = new Options().setCreateIfMissing(true); @@ -77,18 +84,14 @@ private void init(Path genePath) { } protected void indexCdnaSequences(Path cDnaFastaFile) throws IOException, FileFormatException, RocksDBException { - logger.info("Loading RefSeq's cDNA sequences..."); - FileUtils.checkPath(cDnaFastaFile); - if (Files.size(cDnaFastaFile) > 0) { - FastaReader fastaReader = new FastaReader(cDnaFastaFile); - Fasta fasta; - while ((fasta = fastaReader.read()) != null) { - rocksDbManager.update(rocksdb, fasta.getId() + CDNA_SEQUENCE_SUFFIX, fasta.getSeq()); - } - fastaReader.close(); - } else { - logger.warn("RefSeq's cDNA sequences not loaded"); + logger.info(PARSING_LOG_MESSAGE, cDnaFastaFile); + FastaReader fastaReader = new FastaReader(cDnaFastaFile); + Fasta fasta; + while ((fasta = 
fastaReader.read()) != null) { + rocksDbManager.update(rocksdb, fasta.getId() + CDNA_SEQUENCE_SUFFIX, fasta.getSeq()); } + fastaReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, cDnaFastaFile); } public String getCdnaFasta(String id) throws RocksDBException { @@ -96,18 +99,14 @@ public String getCdnaFasta(String id) throws RocksDBException { } protected void indexProteinSequences(Path proteinFastaFile) throws IOException, FileFormatException, RocksDBException { - logger.info("Loading ENSEMBL's protein sequences..."); - FileUtils.checkPath(proteinFastaFile); - if (Files.size(proteinFastaFile) > 0) { - FastaReader fastaReader = new FastaReader(proteinFastaFile); - Fasta fasta; - while ((fasta = fastaReader.read()) != null) { - rocksDbManager.update(rocksdb, fasta.getId() + PROTEIN_SEQUENCE_SUFFIX, fasta.getSeq()); - } - fastaReader.close(); - } else { - logger.warn("ENSEMBL's protein sequences not loaded"); + logger.info(PARSING_LOG_MESSAGE, proteinFastaFile); + FastaReader fastaReader = new FastaReader(proteinFastaFile); + Fasta fasta; + while ((fasta = fastaReader.read()) != null) { + rocksDbManager.update(rocksdb, fasta.getId() + PROTEIN_SEQUENCE_SUFFIX, fasta.getSeq()); } + fastaReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, proteinFastaFile); } protected String getProteinFasta(String id) throws RocksDBException { @@ -115,22 +114,18 @@ protected String getProteinFasta(String id) throws RocksDBException { } protected void indexHgncIdMapping(Path hgncMappingFile) throws IOException, RocksDBException { - // #hgnc_id symbol name locus_group locus_type status location location_sortable ... 
- logger.info("Indexing HGNC ID mapping data ..."); - - // We only need the first two columns: hgnc_id -> symbol - if (hgncMappingFile != null && Files.exists(hgncMappingFile) && Files.size(hgncMappingFile) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - String[] fields = line.split("\t", -1); - rocksDbManager.update(rocksdb, fields[1] + HGNC_ID_SUFFIX, fields[0]); - line = bufferedReader.readLine(); - } + logger.info(PARSING_LOG_MESSAGE, hgncMappingFile); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { + String line = bufferedReader.readLine(); + // We only need the first two columns: hgnc_id -> symbol + // #hgnc_id symbol name locus_group locus_type status location location_sortable ... + while (StringUtils.isNotEmpty(line)) { + String[] fields = line.split("\t", -1); + rocksDbManager.update(rocksdb, fields[1] + HGNC_ID_SUFFIX, fields[0]); + line = bufferedReader.readLine(); } - } else { - logger.warn("HGNC ID mapping file " + hgncMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, hgncMappingFile); } public String getHgncId(String id) throws RocksDBException { @@ -138,29 +133,25 @@ public String getHgncId(String id) throws RocksDBException { } protected void indexManeMapping(Path maneMappingFile, String referenceId) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, maneMappingFile); + int idColumn = referenceId.equalsIgnoreCase(ENSEMBL_DATA) ? 7 : 5; + // #NCBI_GeneID Ensembl_Gene HGNC_ID symbol name RefSeq_nuc RefSeq_prot Ensembl_nuc Ensembl_prot // MANE_status GRCh38_chr chr_start chr_end chr_strand - logger.info("Indexing MANE mapping data ..."); - - if (maneMappingFile != null && Files.exists(maneMappingFile) && Files.size(maneMappingFile) > 0) { - int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 
7 : 5; -// BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile); - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - String[] fields = line.split("\t", -1); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq", fields[5]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq_protein", fields[6]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl", fields[7]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl_protein", fields[8]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_flag", fields[9]); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile)) { + String line = bufferedReader.readLine(); + while (StringUtils.isNotEmpty(line)) { + String[] fields = line.split("\t", -1); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq", fields[5]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq_protein", fields[6]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl", fields[7]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl_protein", fields[8]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_flag", fields[9]); - line = bufferedReader.readLine(); - } + line = bufferedReader.readLine(); } - } else { - logger.warn("MANE mapping file " + maneMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, maneMappingFile); } public String getMane(String id, String field) throws RocksDBException { @@ -168,30 +159,27 @@ public String getMane(String id, String field) throws RocksDBException { } protected void indexLrgMapping(Path lrgMappingFile, String referenceId) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, lrgMappingFile); + // # 
Last modified: 30-03-2021@22:00:06 // # LRG HGNC_SYMBOL REFSEQ_GENOMIC LRG_TRANSCRIPT REFSEQ_TRANSCRIPT ENSEMBL_TRANSCRIPT CCDS // LRG_1 COL1A1 NG_007400.1 t1 NM_000088.3 ENST00000225964.10 CCDS11561.1 - logger.info("Indexing LRG mapping data ..."); - - if (lrgMappingFile != null && Files.exists(lrgMappingFile) && Files.size(lrgMappingFile) > 0) { - int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 5 : 4; - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(lrgMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - String id = fields[idColumn]; - if (StringUtils.isNotEmpty(id) && !id.equals("-")) { - rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_refseq", fields[4]); - rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_ensembl", fields[5]); - } + int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 5 : 4; + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(lrgMappingFile)) { + String line = bufferedReader.readLine(); + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + String id = fields[idColumn]; + if (StringUtils.isNotEmpty(id) && !id.equals("-")) { + rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_refseq", fields[4]); + rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_ensembl", fields[5]); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("LRG mapping file " + lrgMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, lrgMappingFile); } public String getLrg(String id, String field) throws RocksDBException { @@ -199,6 +187,8 @@ public String getLrg(String id, String field) throws RocksDBException { } protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, cgcFile); + Map tissuesMap = new HashMap<>(); 
tissuesMap.put("E", "epithelial"); tissuesMap.put("L", "leukaemia/lymphoma"); @@ -224,10 +214,8 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx mutationTypesMap.put("Mis", "missense"); mutationTypesMap.put("PromoterMis", "missense"); - logger.info("Indexing CANCER GENE CENSUS data ..."); - if (cgcFile != null && Files.exists(cgcFile) && Files.size(cgcFile) > 0) { + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(cgcFile)) { // Skip the first header line - BufferedReader bufferedReader = FileUtils.newBufferedReader(cgcFile); bufferedReader.readLine(); GeneCancerAssociation cancerGeneAssociation; @@ -237,9 +225,9 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx // Find Ensembl Gene Id in the last comma-separated column List synonyms = StringUtils.isNotEmpty(fields[19]) ? Arrays.stream(fields[19] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -264,44 +252,44 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx : Collections.emptyList(); List tissues = StringUtils.isNotEmpty(fields[12]) ? Arrays.stream(fields[12] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .map(tissuesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List modeOfInheritance = StringUtils.isNotEmpty(fields[13]) ? fields[13].equalsIgnoreCase("Dom/Rec") - ? Arrays.asList(moiMap.get("Dom"), moiMap.get("Rec")) - : Collections.singletonList(moiMap.get(fields[13])) + ? Arrays.asList(moiMap.get("Dom"), moiMap.get("Rec")) + : Collections.singletonList(moiMap.get(fields[13])) : Collections.emptyList(); List roleInCancer = StringUtils.isNotEmpty(fields[14]) ? 
Arrays.stream(fields[14] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .map(roleInCancerMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List mutationTypes = StringUtils.isNotEmpty(fields[15]) ? Arrays.stream(fields[15] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .map(mutationTypesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List translocationPartners = StringUtils.isNotEmpty(fields[16]) ? Arrays.stream(fields[16] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); List otherSyndromes = StringUtils.isNotEmpty(fields[18]) ? Arrays.stream(fields[18] - .replaceAll("\"", "") - .split("; ")) + .replaceAll("\"", "") + .split("; ")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -312,10 +300,9 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx rocksDbManager.update(rocksdb, fields[0] + CANCER_GENE_CENSUS_SUFFIX, cancerGeneAssociation); } } - bufferedReader.close(); - } else { - logger.warn("CANCER GENE CENSUS file " + cgcFile + " not found"); } + + logger.info(PARSING_DONE_LOG_MESSAGE, cgcFile); } public List getCancerGeneCensus(String geneName) throws RocksDBException, IOException { @@ -324,97 +311,102 @@ public List getCancerGeneCensus(String geneName) throws R } public void indexCancerHotspot(Path cancerHotspot) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, cancerHotspot); + // Store all cancer hotspot (different gene and aminoacid position) for each gene in the same key Map> visited = new HashMap<>(); - FileInputStream fileInputStream = new FileInputStream(cancerHotspot.toFile()); - HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); - HSSFSheet sheet = 
workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - iterator.next(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - String geneName = currentRow.getCell(0).toString(); - - if (currentRow.getCell(1).toString().contains("splice")) { - continue; - } - int aminoAcidPosition = Integer.parseInt(currentRow.getCell(1).toString()); - - CancerHotspot ch = null; - // Check if ch object already exist - if (visited.containsKey(geneName)) { - for (CancerHotspot hotspot : visited.get(geneName)) { - if (hotspot.getAminoacidPosition() == aminoAcidPosition) { - ch = hotspot; - break; - } - } - } - // If not exist we create new ch - if (ch == null) { - ch = new CancerHotspot(); - ch.setScores(new HashMap<>()); - ch.setCancerTypeCount(new HashMap<>()); - ch.setOrganCount(new HashMap<>()); - ch.setVariants(new ArrayList<>()); - - // Parse new row - ch.setGeneName(geneName); - ch.setAminoacidPosition(aminoAcidPosition); - ch.getScores().put("log10Pvalue", Double.parseDouble(currentRow.getCell(2).toString())); - ch.setNumMutations(Integer.parseInt(currentRow.getCell(3).toString())); - - String[] cancerCountSplit = currentRow.getCell(11).toString().split("\\|"); - for (String cancerCount : cancerCountSplit) { - String[] split = cancerCount.split(":"); - ch.getCancerTypeCount().put(split[0], Integer.parseInt(split[2])); + try (FileInputStream fileInputStream = new FileInputStream(cancerHotspot.toFile())) { + HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); + HSSFSheet sheet = workbook.getSheetAt(0); + Iterator iterator = sheet.iterator(); + iterator.next(); + while (iterator.hasNext()) { + Row currentRow = iterator.next(); + String geneName = currentRow.getCell(0).toString(); + + if (currentRow.getCell(1).toString().contains("splice")) { + continue; } + int aminoAcidPosition = Integer.parseInt(currentRow.getCell(1).toString()); - String[] organCountSplit = currentRow.getCell(12).toString().split("\\|"); - for (String organCount : 
organCountSplit) { - String[] split = organCount.split(":"); - ch.getOrganCount().put(split[0], Integer.parseInt(split[2])); + CancerHotspot ch = null; + // Check if ch object already exist + if (visited.containsKey(geneName)) { + for (CancerHotspot hotspot : visited.get(geneName)) { + if (hotspot.getAminoacidPosition() == aminoAcidPosition) { + ch = hotspot; + break; + } + } } - ch.getScores().put("mutability", Double.parseDouble(currentRow.getCell(14).toString())); - ch.getScores().put("muProtein", Double.parseDouble(currentRow.getCell(15).toString())); - ch.setAnalysis(Arrays.asList(currentRow.getCell(17).toString().split(","))); - ch.getScores().put("qvalue", Double.parseDouble(currentRow.getCell(18).toString())); - ch.getScores().put("qvaluePancan", Double.parseDouble(currentRow.getCell(20).toString())); - ch.setAminoacidReference(currentRow.getCell(35).toString()); - ch.getScores().put("qvalueCancerType", Double.parseDouble(currentRow.getCell(36).toString())); - ch.setCancerType(currentRow.getCell(37).toString()); + // If not exist we create new ch + if (ch == null) { + ch = new CancerHotspot(); + ch.setScores(new HashMap<>()); + ch.setCancerTypeCount(new HashMap<>()); + ch.setOrganCount(new HashMap<>()); + ch.setVariants(new ArrayList<>()); + + // Parse new row + ch.setGeneName(geneName); + ch.setAminoacidPosition(aminoAcidPosition); + ch.getScores().put("log10Pvalue", Double.parseDouble(currentRow.getCell(2).toString())); + ch.setNumMutations(Integer.parseInt(currentRow.getCell(3).toString())); + + String[] cancerCountSplit = currentRow.getCell(11).toString().split("\\|"); + for (String cancerCount : cancerCountSplit) { + String[] split = cancerCount.split(":"); + ch.getCancerTypeCount().put(split[0], Integer.parseInt(split[2])); + } - if (visited.containsKey(geneName)) { - // Gene exists but no this aminoacid position - visited.get(geneName).add(ch); - } else { - // New gene found - visited.put(geneName, new ArrayList<>(Collections.singletonList(ch))); + 
String[] organCountSplit = currentRow.getCell(12).toString().split("\\|"); + for (String organCount : organCountSplit) { + String[] split = organCount.split(":"); + ch.getOrganCount().put(split[0], Integer.parseInt(split[2])); + } + + ch.getScores().put("mutability", Double.parseDouble(currentRow.getCell(14).toString())); + ch.getScores().put("muProtein", Double.parseDouble(currentRow.getCell(15).toString())); + ch.setAnalysis(Arrays.asList(currentRow.getCell(17).toString().split(","))); + ch.getScores().put("qvalue", Double.parseDouble(currentRow.getCell(18).toString())); + ch.getScores().put("qvaluePancan", Double.parseDouble(currentRow.getCell(20).toString())); + ch.setAminoacidReference(currentRow.getCell(35).toString()); + ch.getScores().put("qvalueCancerType", Double.parseDouble(currentRow.getCell(36).toString())); + ch.setCancerType(currentRow.getCell(37).toString()); + + if (visited.containsKey(geneName)) { + // Gene exists but no this aminoacid position + visited.get(geneName).add(ch); + } else { + // New gene found + visited.put(geneName, new ArrayList<>(Collections.singletonList(ch))); + } } - } - // Add cancer hotspot variant information - CancerHotspotVariant cancerHotspotVariant = new CancerHotspotVariant(); - cancerHotspotVariant.setSampleCount(new HashMap<>()); + // Add cancer hotspot variant information + CancerHotspotVariant cancerHotspotVariant = new CancerHotspotVariant(); + cancerHotspotVariant.setSampleCount(new HashMap<>()); - String[] alternateCountSplit = currentRow.getCell(8).toString().split(":"); - cancerHotspotVariant.setAminoacidAlternate(alternateCountSplit[0]); - cancerHotspotVariant.setCount(Integer.parseInt(alternateCountSplit[1])); + String[] alternateCountSplit = currentRow.getCell(8).toString().split(":"); + cancerHotspotVariant.setAminoacidAlternate(alternateCountSplit[0]); + cancerHotspotVariant.setCount(Integer.parseInt(alternateCountSplit[1])); - String[] sampleSplit = currentRow.getCell(38).toString().split("\\|"); - for 
(String sampleCount : sampleSplit) { - String[] sampleCountSplit = sampleCount.split(":"); - cancerHotspotVariant.getSampleCount().put(sampleCountSplit[0], Integer.parseInt(sampleCountSplit[1])); + String[] sampleSplit = currentRow.getCell(38).toString().split("\\|"); + for (String sampleCount : sampleSplit) { + String[] sampleCountSplit = sampleCount.split(":"); + cancerHotspotVariant.getSampleCount().put(sampleCountSplit[0], Integer.parseInt(sampleCountSplit[1])); + } + ch.getVariants().add(cancerHotspotVariant); } - ch.getVariants().add(cancerHotspotVariant); } - fileInputStream.close(); for (String geneName : visited.keySet()) { rocksDbManager.update(rocksdb, geneName + CANCER_HOTSPOT_SUFFIX, visited.get(geneName)); } + + logger.info(PARSING_DONE_LOG_MESSAGE, cancerHotspot); } public List getCancerHotspot(String geneName) throws RocksDBException, IOException { @@ -422,29 +414,25 @@ public List getCancerHotspot(String geneName) throws RocksDBExcep return rocksDbManager.getCancerHotspot(rocksdb, key); } - protected void indexTSO500(Path tso500Path) throws IOException, RocksDBException { - // Gene Ref Seq - // FAS NM_000043 - // AR NM_000044 - logger.info("Indexing TSO500 data ..."); - - if (tso500Path != null && Files.exists(tso500Path) && Files.size(tso500Path) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); - } + logger.info(PARSING_LOG_MESSAGE, tso500Path); + + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { + String line = bufferedReader.readLine(); + // Gene Ref Seq + // FAS NM_000043 + // AR NM_000044 + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + if (fields.length 
== 2) { + rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("Ensembl TSO500 mapping file " + tso500Path + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, tso500Path); } public String getTSO500(String transcriptId) throws RocksDBException { @@ -456,29 +444,25 @@ public String getTSO500(String transcriptId) throws RocksDBException { return new String(bytes); } - protected void indexEGLHHaemOnc(Path eglhHaemOncPath) throws IOException, RocksDBException { - // Gene Ref Seq - // GNB1 NM_002074.4 - // CSF3R NM_000760.3 - logger.info("Indexing EGLH HaemOnc data ..."); - - if (eglhHaemOncPath != null && Files.exists(eglhHaemOncPath) && Files.size(eglhHaemOncPath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(eglhHaemOncPath)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); - } + logger.info(PARSING_LOG_MESSAGE, eglhHaemOncPath); + + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(eglhHaemOncPath)) { + String line = bufferedReader.readLine(); + // Gene Ref Seq + // GNB1 NM_002074.4 + // CSF3R NM_000760.3 + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + if (fields.length == 2) { + rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("Ensembl EGLH HaemOnc mapping file " + eglhHaemOncPath + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, eglhHaemOncPath); } public String getEGLHHaemOnc(String transcriptId) throws RocksDBException { @@ -510,4 +494,219 
@@ protected void close() throws IOException { rocksDbManager.closeIndex(rocksdb, dbOption, dbLocation); } + protected void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, geneDrugFile); + + String currentGene = ""; + List drugs = new ArrayList<>(); + + try (BufferedReader br = FileUtils.newBufferedReader(geneDrugFile)) { + // Skip header + br.readLine(); + + int lineCounter = 1; + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split("\t"); + String geneName = parts[0]; + if (currentGene.equals("")) { + currentGene = geneName; + } else if (!currentGene.equals(geneName)) { + rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); + drugs = new ArrayList<>(); + currentGene = geneName; + } + + String source = null; + if (parts.length >= 4) { + source = parts[3]; + } + + String interactionType = null; + if (parts.length >= 5) { + interactionType = parts[4]; + } + + String drugName = null; + if (parts.length >= 8) { + // if drug name column is empty, use drug claim name instead + drugName = StringUtils.isEmpty(parts[7]) ? 
parts[6] : parts[7]; + } + if (StringUtils.isEmpty(drugName)) { + // no drug name + continue; + } + + String chemblId = null; + if (parts.length >= 9) { + chemblId = parts[8]; + } + + List publications = new ArrayList<>(); + if (parts.length >= 10 && parts[9] != null) { + publications = Arrays.asList(parts[9].split(",")); + } + + GeneDrugInteraction drug = new GeneDrugInteraction( + geneName, drugName, source, null, null, interactionType, chemblId, publications); + drugs.add(drug); + lineCounter++; + } + } + // update last gene + rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); + + logger.info(PARSING_DONE_LOG_MESSAGE, geneDrugFile); + } + + protected void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { + + Map> geneDiseaseAssociationMap = new HashMap<>(50000); + + String line; + + // HPO +// logger.info(PARSING_LOG_MESSAGE, hpoFilePath); +// try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { +// // Skip first header line +// bufferedReader.readLine(); +// while ((line = bufferedReader.readLine()) != null) { +// String[] fields = line.split("\t"); +// String omimId = fields[6]; +// String geneSymbol = fields[3]; +// String hpoId = fields[0]; +// String diseaseName = fields[1]; +// GeneTraitAssociation disease = +// new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), HPO_DATA); +// addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); +// } +// } +// logger.info(PARSING_DONE_LOG_MESSAGE, hpoFilePath); + + // DisGeNet + logger.info(PARSING_LOG_MESSAGE, disgenetFilePath); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath)) { + // Skip first header line + bufferedReader.readLine(); + while ((line = bufferedReader.readLine()) != null) { + String[] fields = line.split("\t"); + String diseaseId = fields[4]; + String diseaseName = fields[5]; + String score = fields[9]; + String 
numberOfPubmeds = fields[13].trim(); + String numberOfSNPs = fields[14]; + String source = fields[15]; + GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), + Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), DISGENET_DATA); + addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, disgenetFilePath); + + for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); + } + } + + protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, miRTarBaseFile); + + try (BufferedReader reader = Files.newBufferedReader(miRTarBaseFile)) { + String line; + // Skip header line + reader.readLine(); + + String currentMiRTarBaseId = null; + String currentMiRNA = null; + String currentGene = null; + List targetGenes = new ArrayList<>(); + Map> geneToMirna = new HashMap<>(); + + while ((line = reader.readLine()) != null) { + String[] field = line.split("\t", -1); + if (field.length != 9) { + throw new CellBaseException("Invalid number of columns " + field.length + " (expected 9 columns) parsing file " + + miRTarBaseFile + ". 
Line: " + line); + } + + // #0: miRTarBase ID + String miRTarBaseId = field[0]; + if (currentMiRTarBaseId == null) { + currentMiRTarBaseId = miRTarBaseId; + } + + // #1: miRNA + String miRNA = field[1]; + if (currentMiRNA == null) { + currentMiRNA = miRNA; + } + + // #2: Species (miRNA) + + // #3: Target Gene + String geneName = field[3]; + if (currentGene == null) { + currentGene = geneName; + } + + // #4: Target Gene (Entrez ID) + // #5: Species (Target Gene) + + if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { + // new entry, store current one + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, targetGenes); + addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + targetGenes = new ArrayList<>(); + currentGene = geneName; + currentMiRTarBaseId = miRTarBaseId; + currentMiRNA = miRNA; + } + + // #6: Experiments + String experiment = field[6]; + + // #7: Support Type + String supportType = field[7]; + + // #8: pubmed + String pubmed = field[8]; + + targetGenes.add(new TargetGene(experiment, supportType, pubmed)); + } + + // parse last entry + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes); + addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + + for (Map.Entry> entry : geneToMirna.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, miRTarBaseFile); + } + + protected static void addValueToMapElement(Map> map, String key, T value) { + if (map.containsKey(key)) { + map.get(key).add(value); + } else { + List valueList = new ArrayList<>(); + valueList.add(value); + map.put(key, valueList); + } + } + + protected List getDrugs(String id) throws RocksDBException, IOException { + String key = id + DRUGS_SUFFIX; + return rocksDbManager.getDrugs(rocksdb, key); + } + + protected List getDiseases(String id) throws RocksDBException, 
IOException { + String key = id + DISEASE_SUFFIX; + return rocksDbManager.getDiseases(rocksdb, key); + } + + protected List getMirnaTargets(String geneName) throws RocksDBException, IOException { + String key = geneName + MIRTARBASE_SUFFIX; + return rocksDbManager.getMirnaTargets(rocksdb, key); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 48b0cd1d0d..56e1edd6ff 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -24,26 +24,43 @@ import org.opencb.cellbase.core.ParamConstants; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.rocksdb.RocksDBException; +import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; +import java.util.stream.Collectors; + +import static org.opencb.cellbase.lib.EtlCommons.*; public class RefSeqGeneBuilder extends CellBaseBuilder { + private Path downloadPath; + private Map transcriptDict; private Map exonDict; private Path gtfFile; private Path fastaFile; - private Path proteinFastaFile, cdnaFastaFile; - private Path maneFile, lrgFile, disgenetFile, hpoFile, geneDrugFile, miRTarBaseFile; - private Path cancerGeneCensus, cancerHotspot; - private Path tso500File, eglhHaemOncFile; + private Path proteinFastaFile; + private Path cdnaFastaFile; + private Path maneFile; + private Path lrgFile; + private Path disgenetFile; + private Path hpoFile; + private Path geneDrugFile; + private Path miRTarBaseFile; + private Path cancerGeneCensus; + private Path cancerHotspot; + private Path 
tso500File; + private Path eglhHaemOncFile; private SpeciesConfiguration speciesConfiguration; private static final Map REFSEQ_CHROMOSOMES = new HashMap<>(); - private final String status = "KNOWN"; + private static final String KNOWN_STATUS = "KNOWN"; private static final String SOURCE = ParamConstants.QueryParams.REFSEQ.key(); private Gene gene = null; private Transcript transcript = null; @@ -52,85 +69,95 @@ public class RefSeqGeneBuilder extends CellBaseBuilder { // sometimes there are two stop codons (eg NM_018159.4). Only parse the first one, skip the second private boolean seenStopCodon = false; - - public RefSeqGeneBuilder(Path refSeqDirectoryPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { + public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { super(serializer); + this.downloadPath = downloadPath; this.speciesConfiguration = speciesConfiguration; - getGtfFileFromDirectoryPath(refSeqDirectoryPath); - getFastaFileFromDirectoryPath(refSeqDirectoryPath); - getProteinFastaFileFromDirectoryPath(refSeqDirectoryPath); - getCdnaFastaFileFromDirectoryPath(refSeqDirectoryPath); - setAnnotationFiles(refSeqDirectoryPath); - transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); } - private void setAnnotationFiles(Path refSeqDirectoryPath) { - Path geneDirectoryPath = refSeqDirectoryPath.getParent().resolve("gene"); - maneFile = geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"); - lrgFile = geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"); - geneDrugFile = geneDirectoryPath.resolve("dgidb.tsv"); - disgenetFile = geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"); - hpoFile = geneDirectoryPath.resolve("phenotype_to_genes.txt"); - cancerGeneCensus = geneDirectoryPath.resolve("cancer-gene-census.tsv"); - cancerHotspot = geneDirectoryPath.resolve("hotspots_v2.xls"); - tso500File = 
geneDirectoryPath.resolve("TSO500_transcripts.txt"); - eglhHaemOncFile = geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"); - miRTarBaseFile = refSeqDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"); - } - - private void getGtfFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { - gtfFile = refSeqDirectoryPath.resolve(fileName); - break; - } + public void check() throws Exception { + if (checked) { + return; } - } - private void getFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith("genomic.fna") || fileName.endsWith("genomic.fna.gz")) { - fastaFile = refSeqDirectoryPath.resolve(fileName); - break; - } - } - } + String refSeqGeneLabel = getDataName(REFSEQ_DATA) + " " + getDataName(GENE_DATA); + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); - private void getProteinFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith(".faa") || fileName.endsWith(".faa.gz")) { - proteinFastaFile = refSeqDirectoryPath.resolve(fileName); - break; + // Sanity check + checkDirectory(downloadPath, refSeqGeneLabel); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); } } - } - private void getCdnaFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith("cdna.fna") || fileName.endsWith("cdna.fna.gz")) { - cdnaFastaFile = refSeqDirectoryPath.resolve(fileName); - break; - } - } + // Check RefSeq files + List files = checkFiles(refSeqGeneLabel, REFSEQ_DATA, downloadPath, 4); + gtfFile = 
files.stream().filter(f -> f.getName().contains(".gtf")).findFirst().get().toPath(); + proteinFastaFile = files.stream().filter(f -> f.getName().contains("_protein")).findFirst().get().toPath(); + cdnaFastaFile = files.stream().filter(f -> f.getName().contains("_rna")).findFirst().get().toPath(); + fastaFile = files.stream().filter(f -> f.getName().contains("_genomic.fna")).findFirst().get().toPath(); + + // Check common files + maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // hpoFile = checkFiles(HPO_DATA, downloadPath.getParent(), 1); + disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // cancerGeneCensus = ; + // tso500File = ; + // eglhHaemOncFile = ; + + // Check regulation files + // mirtarbase + // The downloaded .xlsx file contains errors and it has to be fixed manually + logger.info("Checking {} folder and files", getDataName(MIRTARBASE_DATA)); + Path downloadRegulationPath = downloadPath.getParent().getParent().resolve(REGULATION_DATA); + List mirTarBaseFiles = ((DataSource) dataSourceReader.readValue(downloadRegulationPath.resolve( + getDataVersionFilename(MIRTARBASE_DATA)).toFile())).getUrls().stream().map(u -> Paths.get(u).getFileName().toString()) + .collect(Collectors.toList()); + if (mirTarBaseFiles.size() != 1) { + throw new CellBaseException("One " + getDataName(MIRTARBASE_DATA) + " file is expected at " + downloadRegulationPath + + ", but currently there are " + mirTarBaseFiles.size() + " files"); + } + // The hsa_MIT.xlsx is fixed and converted to hsa_MIT.csv manually + if (!mirTarBaseFiles.get(0).endsWith(XLSX_EXTENSION)) { + throw new CellBaseException("A " + XLSX_EXTENSION + " " + 
getDataName(MIRTARBASE_DATA) + " file is expected at " + + downloadRegulationPath + ", but currently it is named " + mirTarBaseFiles.get(0)); + } + miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION)); + if (!Files.exists(miRTarBaseFile)) { + throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist"); + } + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE); + checked = true; } public void parse() throws Exception { + check(); + // Preparing the fasta file for fast accessing FastaIndex fastaIndex = null; if (fastaFile != null) { fastaIndex = new FastaIndex(fastaFile); } - // index protein sequences for later + // Index protein sequences for later + logger.info("Indexing gene annotation for {} ...", getDataName(REFSEQ_DATA)); RefSeqGeneBuilderIndexer indexer = new RefSeqGeneBuilderIndexer(gtfFile.getParent()); indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, disgenetFile, miRTarBaseFile, cancerGeneCensus, cancerHotspot, tso500File, eglhHaemOncFile); + logger.info("Indexing done for {}", getDataName(REFSEQ_DATA)); - logger.info("Parsing RefSeq gtf..."); + logger.info(PARSING_LOG_MESSAGE, gtfFile); GtfReader gtfReader = new GtfReader(gtfFile); Gtf gtf; @@ -164,22 +191,24 @@ public void parse() throws Exception { } } - // add xrefs to last transcript + // Add xrefs to last transcript addXrefs(transcript, geneDbxrefs, exonDbxrefs); - // last gene must be serialized + // Last gene must be serialized store(); - // cleaning + // Close gtfReader.close(); serializer.close(); if (fastaIndex != null) { fastaIndex.close(); } indexer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gtfFile); } - // store right before parsing the previous gene, or the very last gene. + // Store right before parsing the previous gene, or the very last gene. 
private void store() { serializer.serialize(gene); reset(); @@ -235,7 +264,7 @@ private void parseGene(Gtf gtf, String chromosome, RefSeqGeneBuilderIndexer inde null, indexer.getMirnaTargets(geneName), indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); gene = new Gene(geneId, geneName, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), "1", geneBiotype, - status, SOURCE, geneDescription, new ArrayList<>(), null, geneAnnotation); + KNOWN_STATUS, SOURCE, geneDescription, new ArrayList<>(), null, geneAnnotation); geneDbxrefs = parseXrefs(gtf); } @@ -567,7 +596,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId if ("mRNA".equals(biotype)) { biotype = "protein_coding"; } - transcript = new Transcript(transcriptId, name, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, status, + transcript = new Transcript(transcriptId, name, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, KNOWN_STATUS, 0, 0, 0, 0, 0, indexer.getCdnaFasta(transcriptId), "", "", "", version, SOURCE, new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), new HashSet<>(), new TranscriptAnnotation()); @@ -644,6 +673,20 @@ private String getSequenceName(String fullSequenceName) { return fullSequenceName; } +// private void setAnnotationFiles(Path refSeqDirectoryPath) { +// Path geneDirectoryPath = refSeqDirectoryPath.getParent().resolve("gene"); +// maneFile = geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"); +// lrgFile = geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"); +// geneDrugFile = geneDirectoryPath.resolve("dgidb.tsv"); +// disgenetFile = geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"); +// hpoFile = geneDirectoryPath.resolve("phenotype_to_genes.txt"); +// cancerGeneCensus = geneDirectoryPath.resolve("cancer-gene-census.tsv"); +// cancerHotspot = geneDirectoryPath.resolve("hotspots_v2.xls"); +// tso500File = 
geneDirectoryPath.resolve("TSO500_transcripts.txt"); +// eglhHaemOncFile = geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"); +// miRTarBaseFile = refSeqDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"); +// } + static { REFSEQ_CHROMOSOMES.put("NC_000001", "1"); REFSEQ_CHROMOSOMES.put("NC_000002", "2"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java index 45520161f5..9aae170ce2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java @@ -16,25 +16,16 @@ package org.opencb.cellbase.lib.builders; -import org.apache.commons.lang.StringUtils; -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.opencb.biodata.formats.io.FileFormatException; -import org.opencb.biodata.models.core.MirnaTarget; -import org.opencb.biodata.models.core.TargetGene; -import org.opencb.biodata.models.variant.avro.GeneDrugInteraction; -import org.opencb.biodata.models.variant.avro.GeneTraitAssociation; -import org.opencb.commons.utils.FileUtils; +import org.opencb.cellbase.core.exception.CellBaseException; import org.rocksdb.RocksDBException; -import java.io.BufferedReader; -import java.io.FileInputStream; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; -public class RefSeqGeneBuilderIndexer extends GeneBuilderIndexer{ +import static org.opencb.cellbase.lib.EtlCommons.REFSEQ_DATA; + +public class RefSeqGeneBuilderIndexer extends GeneBuilderIndexer { public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { super(refSeqDirectoryPath); @@ -42,249 +33,17 @@ public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path 
cDnaFastaFile, Path geneDrugFile, Path hpoFilePath, Path disgenetFile, Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot, Path tso500File, - Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException { - indexManeMapping(maneFile, "refseq"); - indexLrgMapping(lrgFile, "refseq"); + Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException, CellBaseException { + indexManeMapping(maneFile, REFSEQ_DATA); + indexLrgMapping(lrgFile, REFSEQ_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexDrugs(geneDrugFile); indexDiseases(hpoFilePath, disgenetFile); indexMiRTarBase(miRTarBaseFile); - indexCancerGeneCensus(cancerGeneGensus); +// indexCancerGeneCensus(cancerGeneGensus); indexCancerHotspot(cancerHotspot); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); - } - - private void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { - if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { - logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); - BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); - - // Skip header - br.readLine(); - - int lineCounter = 1; - String line; - String currentGene = ""; - List drugs = new ArrayList<>(); - while ((line = br.readLine()) != null) { - String[] parts = line.split("\t"); - String geneName = parts[0]; - if (currentGene.equals("")) { - currentGene = geneName; - } else if (!currentGene.equals(geneName)) { - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - drugs = new ArrayList<>(); - currentGene = geneName; - } - - String source = null; - if (parts.length >= 4) { - source = parts[3]; - } - - String interactionType = null; - if (parts.length >= 5) { - interactionType = parts[4]; - } - - String drugName = null; - if (parts.length >= 8) { - // if drug name column is empty, use drug claim name instead - drugName = 
StringUtils.isEmpty(parts[7]) ? parts[6] : parts[7]; - } - if (StringUtils.isEmpty(drugName)) { - // no drug name - continue; - } - - String chemblId = null; - if (parts.length >= 9) { - chemblId = parts[8]; - } - - List publications = new ArrayList<>(); - if (parts.length >= 10 && parts[9] != null) { - publications = Arrays.asList(parts[9].split(",")); - } - - GeneDrugInteraction drug = new GeneDrugInteraction( - geneName, drugName, source, null, null, interactionType, chemblId, publications); - drugs.add(drug); - lineCounter++; - } - br.close(); - // update last gene - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - } else { - logger.warn("Gene drug file " + geneDrugFile + " not found"); - logger.warn("Ignoring " + geneDrugFile); - } - } - - public List getDrugs(String id) throws RocksDBException, IOException { - String key = id + DRUGS_SUFFIX; - return rocksDbManager.getDrugs(rocksdb, key); - } - - private void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { - Map> geneDiseaseAssociationMap = new HashMap<>(50000); - - String line; - if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { - BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath); - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String omimId = fields[6]; - String geneSymbol = fields[3]; - String hpoId = fields[0]; - String diseaseName = fields[1]; - GeneTraitAssociation disease = - new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); - addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); - } - bufferedReader.close(); - } - - if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { - BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath); - 
// skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), "disgenet"); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); - } - bufferedReader.close(); - } - - for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); - } - } - - public List getDiseases(String id) throws RocksDBException, IOException { - String key = id + DISEASE_SUFFIX; - return rocksDbManager.getDiseases(rocksdb, key); - } - - private void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { - if (miRTarBaseFile != null && Files.exists(miRTarBaseFile) && Files.size(miRTarBaseFile) > 0) { - logger.info("Loading mirna targets from '{}'", miRTarBaseFile); - FileInputStream file = new FileInputStream(miRTarBaseFile.toFile()); - Workbook workbook = new XSSFWorkbook(file); - Sheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - String currentMiRTarBaseId = null; - String currentMiRNA = null; - String currentGene = null; - List targetGenes = new ArrayList(); - Map> geneToMirna = new HashMap(); - while (iterator.hasNext()) { - - Row currentRow = iterator.next(); - Iterator cellIterator = currentRow.iterator(); - - Cell cell = cellIterator.next(); - String miRTarBaseId = cell.getStringCellValue(); - - // skip header - if (miRTarBaseId.startsWith("miRTarBase")) { - continue; - } - - if (currentMiRTarBaseId == null) { - currentMiRTarBaseId = miRTarBaseId; - } - - 
cell = cellIterator.next(); - String miRNA = cell.getStringCellValue(); - if (currentMiRNA == null) { - currentMiRNA = miRNA; - } - - // species - cellIterator.next(); - - cell = cellIterator.next(); - String geneName = cell.getStringCellValue(); - if (currentGene == null) { - currentGene = geneName; - } - - // entrez - cellIterator.next(); - // species - cellIterator.next(); - - if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { - // new entry, store current one - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - targetGenes = new ArrayList(); - currentGene = geneName; - currentMiRTarBaseId = miRTarBaseId; - currentMiRNA = miRNA; - } - - // experiment - cell = cellIterator.next(); - String experiment = cell.getStringCellValue(); - - // support type - cell = cellIterator.next(); - String supportType = cell.getStringCellValue(); - - // pubmeds - cell = cellIterator.next(); - String pubmed = null; - // seems to vary, so check both - if (cell.getCellType().equals(CellType.NUMERIC)) { - pubmed = String.valueOf(cell.getNumericCellValue()); - } else { - pubmed = cell.getStringCellValue(); - } - - targetGenes.add(new TargetGene(experiment, supportType, pubmed)); - } - - // parse last entry - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - - for (Map.Entry> entry : geneToMirna.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); - } - } else { - logger.error("mirtarbase file not found"); - } +// indexTSO500(tso500File); +// indexEGLHHaemOnc(eglhHaemOncFile); } - - public List getMirnaTargets(String geneName) throws RocksDBException, IOException { - String key = geneName + MIRTARBASE_SUFFIX; - return rocksDbManager.getMirnaTargets(rocksdb, key); - } - - 
private static void addValueToMapElement(Map> map, String key, T value) { - if (map.containsKey(key)) { - map.get(key).add(value); - } else { - List valueList = new ArrayList<>(); - valueList.add(value); - map.put(key, valueList); - } - } - } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java index cf8351cc54..3a178b4828 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java @@ -60,8 +60,11 @@ public RocksDB getDBConnection(String dbLocation) { Options options = new Options().setCreateIfMissing(true); RocksDB db = null; try { + if (!Files.exists(Paths.get(dbLocation))) { + Files.createDirectories(Paths.get(dbLocation)); + } return RocksDB.open(options, dbLocation); - } catch (RocksDBException e) { + } catch (RocksDBException | IOException e) { // do some error handling e.printStackTrace(); System.exit(1); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java new file mode 100644 index 0000000000..63d1f445a8 --- /dev/null +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderTest.java @@ -0,0 +1,22 @@ +package org.opencb.cellbase.lib.builders; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.SpeciesConfiguration; + +import java.nio.file.Path; +import java.nio.file.Paths; + +class EnsemblGeneBuilderTest { + + public void testGeneBuilder() throws Exception { + Path downloadPath = Paths.get("/home/jtarraga/data/cellbase/cb6/v6.1.0-dr1/homo_sapiens_grch38/download/gene"); + Path buildPath = Paths.get("/home/jtarraga/data/cellbase/cb6/v6.1.0-dr1/homo_sapiens_grch38/generated_json/gene"); + boolean 
flexibleGTFParsing = false; + CellBaseConfiguration configuration = CellBaseConfiguration.load(Paths.get("/home/jtarraga/appl/cellbase/build/conf/configuration.yml")); + SpeciesConfiguration speciesConfiguration = configuration.getSpeciesConfig("hsapiens"); + + GeneBuilder geneBuilder = new GeneBuilder(downloadPath, buildPath, speciesConfiguration, flexibleGTFParsing); + geneBuilder.check(); + geneBuilder.parse(); + } +} \ No newline at end of file diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java index 5926c0184b..798c1a29db 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderTest.java @@ -55,23 +55,23 @@ public GeneBuilderTest() { @BeforeAll public void init() { - try { - Path genomeSequenceFastaFile - = Paths.get(GeneBuilderTest.class.getResource("/gene/Homo_sapiens.GRCh38.fa").toURI()); - Path geneDirectoryPath = Paths.get(GeneBuilderTest.class.getResource("/gene").toURI()); - // put the results in /tmp - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "gene", - true); - SpeciesConfiguration species = new SpeciesConfiguration("hsapiens", "Homo sapiens", - "human", null, null, null); - geneParser = new GeneBuilder(geneDirectoryPath, genomeSequenceFastaFile, species, serializer); - jsonObjectMapper = new ObjectMapper(); - jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); - jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); - geneParser.parse(); - } catch (Exception e) { - e.printStackTrace(); - } +// try { +// Path genomeSequenceFastaFile +// = Paths.get(GeneBuilderTest.class.getResource("/gene/Homo_sapiens.GRCh38.fa").toURI()); +// Path geneDirectoryPath = Paths.get(GeneBuilderTest.class.getResource("/gene").toURI()); +// // put the 
results in /tmp +// CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "gene", +// true); +// SpeciesConfiguration species = new SpeciesConfiguration("hsapiens", "Homo sapiens", +// "human", null, null, null); +// geneParser = new GeneBuilder(geneDirectoryPath, genomeSequenceFastaFile, species, serializer); +// jsonObjectMapper = new ObjectMapper(); +// jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); +// jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); +// geneParser.parse(); +// } catch (Exception e) { +// e.printStackTrace(); +// } } @Test @@ -226,36 +226,36 @@ public void testProteinSequence() throws Exception { } } - @Test - @Disabled - public void testaddTranscriptTfbstoList() throws Exception { - String attributes = "binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116;stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB"; - String source = null; - String sequenceName = "1"; - String feature = "TF_binding_site"; - int start = 10000; - int end = 100100; - String score = "1.2870005"; - String strand = "+"; - String frame = null; - - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attributes); - Gtf transcript = new Gtf(sequenceName, source, feature, start, end, score, strand, frame, new HashMap<>()); - - List transcriptTfbs = geneParser.addTranscriptTfbstoList(tfbs, transcript,"1", new ArrayList<>()); - - assertEquals(1, transcriptTfbs.size()); - TranscriptTfbs result = transcriptTfbs.get(0); - - assertEquals(sequenceName, result.getChromosome()); - assertEquals(feature, result.getType()); - assertEquals(start, result.getStart()); - assertEquals(end, result.getEnd()); - assertEquals(score, String.valueOf(result.getScore())); - assertEquals("ENSPFM0542", result.getPfmId()); - assertEquals("ENSM00208374688", result.getId()); - assertEquals(2, 
result.getTranscriptionFactors().size()); - } +// @Test +// @Disabled +// public void testaddTranscriptTfbstoList() throws Exception { +// String attributes = "binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116;stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB"; +// String source = null; +// String sequenceName = "1"; +// String feature = "TF_binding_site"; +// int start = 10000; +// int end = 100100; +// String score = "1.2870005"; +// String strand = "+"; +// String frame = null; +// +// Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attributes); +// Gtf transcript = new Gtf(sequenceName, source, feature, start, end, score, strand, frame, new HashMap<>()); +// +// List transcriptTfbs = geneParser.addTranscriptTfbstoList(tfbs, transcript,"1", new ArrayList<>()); +// +// assertEquals(1, transcriptTfbs.size()); +// TranscriptTfbs result = transcriptTfbs.get(0); +// +// assertEquals(sequenceName, result.getChromosome()); +// assertEquals(feature, result.getType()); +// assertEquals(start, result.getStart()); +// assertEquals(end, result.getEnd()); +// assertEquals(score, String.valueOf(result.getScore())); +// assertEquals("ENSPFM0542", result.getPfmId()); +// assertEquals("ENSM00208374688", result.getId()); +// assertEquals(2, result.getTranscriptionFactors().size()); +// } private List loadSerializedGenes(String fileName) { List geneList = new ArrayList(); From a25b9c1d3cd7802250732dd3c377a60684e869de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 8 May 2024 09:52:40 +0200 Subject: [PATCH 103/107] core: fix PGS section in the configuration file, #TASK-5406, #TASK-5387 --- cellbase-core/src/main/resources/configuration.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml 
index 8edbedb090..3585b9da27 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -132,6 +132,11 @@ download: host: http://geneontology.org/ files: GO_ANNOTATION: gene-associations/goa_human.gaf.gz + pgs: + host: https://www.pgscatalog.org/ + version: "Dec. 15, 2023" + files: + PGS_METADATA: https://ftp.ebi.ac.uk/pub/databases/spot/pgs/metadata/pgs_all_metadata_scores.csv ## Regulation mirbase: @@ -288,11 +293,6 @@ download: CLINICAL_VARIANTS: clinicalVariants.zip DRUG_LABELS: drugLabels.zip RELATIONSHIPS: relationships.zip -pgs: - host: https://www.pgscatalog.org/ - version: "Dec. 15, 2023" - files: - PGS_METADATA: https://ftp.ebi.ac.uk/pub/databases/spot/pgs/metadata/pgs_all_metadata_scores.csv species: vertebrates: - id: hsapiens From df05c915591ad0b960fe3e8671153810faa4f6d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 8 May 2024 09:55:19 +0200 Subject: [PATCH 104/107] app: add PGS_DATA (polygenic scores) as valid data in the CellBase builder, #TASK-5407, #TASK-5387 --- .../app/cli/admin/executors/BuildCommandExecutor.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 540c1c7b04..1a6085dd0e 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -66,7 +66,7 @@ public class BuildCommandExecutor extends CommandExecutor { private static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANTS_DATA, REPEATS_DATA, - 
ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); + ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA, PGS_DATA); public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -168,7 +168,7 @@ public void execute() throws CellBaseException { case PHARMACOGENOMICS_DATA: parser = buildPharmacogenomics(); break; - case EtlCommons.PGS_DATA: + case PGS_DATA: parser = buildPolygenicScores(); break; default: @@ -470,6 +470,7 @@ private List checkDataSources() { case SPLICE_SCORE_DATA: case PUBMED_DATA: case PHARMACOGENOMICS_DATA: + case PGS_DATA: break; default: throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " From e7c238511fa41ed4763126dfd6512f72cb1f73f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 10 May 2024 11:07:35 +0200 Subject: [PATCH 105/107] lib: update clinical variant downloader by moving the split ClinVar file to the build step, adding log messages, fixing Sonar issues,...
#TASK-5575, #TASK-5564 --- .../executors/DownloadCommandExecutor.java | 6 +- .../core/config/DownloadProperties.java | 39 ----- .../org/opencb/cellbase/lib/EtlCommons.java | 40 +++-- .../lib/download/AbstractDownloadManager.java | 16 +- .../lib/download/ClinicalDownloadManager.java | 156 ++++++------------ 5 files changed, 87 insertions(+), 170 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index 8da49800df..5a0fb00877 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -41,7 +41,7 @@ public class DownloadCommandExecutor extends CommandExecutor { private Path outputDirectory; private static final List VALID_SOURCES_TO_DOWNLOAD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, - MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANTS_DATA, REPEATS_DATA, + MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, ONTOLOGY_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions) { @@ -86,7 +86,7 @@ public void execute() throws CellBaseException { case CONSERVATION_DATA: downloadFiles.addAll(downloader.downloadConservation()); break; - case CLINICAL_VARIANTS_DATA: + case CLINICAL_VARIANT_DATA: downloadFiles.addAll(downloader.downloadClinicalVariants()); break; case REPEATS_DATA: @@ -132,7 +132,7 @@ private List checkDataSources() { case REGULATION_DATA: case PROTEIN_DATA: case CONSERVATION_DATA: - case CLINICAL_VARIANTS_DATA: + case CLINICAL_VARIANT_DATA: case REPEATS_DATA: case ONTOLOGY_DATA: case PUBMED_DATA: diff 
--git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index bb44f91138..a52e7ce544 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -44,10 +44,6 @@ public class DownloadProperties { private URLProperties phylop; private URLProperties gerp; private URLProperties clinvar; - private URLProperties clinvarVariation; - private URLProperties clinvarSummary; - private URLProperties clinvarVariationAllele; - private URLProperties clinvarEfoTerms; private URLProperties cosmic; private URLProperties hgmd; private URLProperties dgv; @@ -225,41 +221,6 @@ public DownloadProperties setClinvar(URLProperties clinvar) { return this; } - public URLProperties getClinvarVariation() { - return clinvarVariation; - } - - public DownloadProperties setClinvarVariation(URLProperties clinvarVariation) { - this.clinvarVariation = clinvarVariation; - return this; - } - - public URLProperties getClinvarSummary() { - return clinvarSummary; - } - - public DownloadProperties setClinvarSummary(URLProperties clinvarSummary) { - this.clinvarSummary = clinvarSummary; - return this; - } - - public URLProperties getClinvarVariationAllele() { - return clinvarVariationAllele; - } - - public void setClinvarVariationAllele(URLProperties clinvarVariationAllele) { - this.clinvarVariationAllele = clinvarVariationAllele; - } - - public URLProperties getClinvarEfoTerms() { - return clinvarEfoTerms; - } - - public DownloadProperties setClinvarEfoTerms(URLProperties clinvarEfoTerms) { - this.clinvarEfoTerms = clinvarEfoTerms; - return this; - } - public URLProperties getCosmic() { return cosmic; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 
e0a19c7114..57a592bb54 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -47,6 +47,10 @@ public final class EtlCommons { // Commons public static final String XLSX_EXTENSION = ".xlsx"; public static final String CSV_EXTENSION = ".csv"; + public static final String TBI_EXTENSION = ".tbi"; + public static final String FAI_EXTENSION = ".fai"; + + public static final String OK_LOG_MESSAGE = "Ok."; // Ensembl public static final String ENSEMBL_DATA = "ensembl"; @@ -139,7 +143,6 @@ public final class EtlCommons { public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION"; public static final String VARIATION_DATA = "variation"; - public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; // Pharmacogenomics @@ -165,10 +168,10 @@ public final class EtlCommons { public static final String REVEL_FILE_ID = "REVEL"; // Clinical variants data - public static final String CLINICAL_VARIANTS_SUBDIRECTORY = "clinicalVariant"; + public static final String CLINICAL_VARIANT_DATA = "clinical_variant"; + public static final String CLINICAL_VARIANTS_BASENAME = "clinicalVariant"; // ClinVar - public static final String CLINVAR_NAME = "ClinVar"; - public static final String CLINVAR_VERSION_FILENAME = "clinvar" + SUFFIX_VERSION_FILENAME; + public static final String CLINVAR_DATA = "clinvar"; public static final String CLINVAR_CHUNKS_SUBDIRECTORY = "clinvar_chunks"; // Must match the configuration file public static final String CLINVAR_FULL_RELEASE_FILE_ID = "FULL_RELEASE"; @@ -176,20 +179,18 @@ public final class EtlCommons { public static final String CLINVAR_ALLELE_FILE_ID = "ALLELE"; public static final String CLINVAR_EFO_TERMS_FILE_ID = "EFO_TERMS"; // COSMIC - public static final String COSMIC_NAME = "COSMIC"; - public static final String COSMIC_VERSION_FILENAME = "cosmic" + 
SUFFIX_VERSION_FILENAME; + public static final String COSMIC_DATA = "cosmic"; // Must match the configuration file public static final String COSMIC_FILE_ID = "COSMIC"; // HGMD - public static final String HGMD_NAME = "HGMD"; - public static final String HGMD_VERSION_FILENAME = "hgmd" + SUFFIX_VERSION_FILENAME; + public static final String HGMD_DATA = "hgmd"; // Must match the configuration file public static final String HGMD_FILE_ID = "HGMD"; // GWAS - public static final String GWAS_NAME = "GWAS catalog"; - public static final String GWAS_VERSION_FILENAME = "gwas" + SUFFIX_VERSION_FILENAME; + public static final String GWAS_DATA = "gwas"; // Must match the configuration file public static final String GWAS_FILE_ID = "GWAS"; + public static final String GWAS_DBSNP_FILE_ID = "DBSNP"; // Repeats public static final String REPEATS_DATA = "repeats"; @@ -345,7 +346,7 @@ public final class EtlCommons { dataNamesMap.put(GENE_DISEASE_ANNOTATION_DATA, "Gene Disease Annotation"); dataNamesMap.put(HPO_DATA, "HPO"); dataNamesMap.put(DISGENET_DATA, "DisGeNet"); - dataNamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomAD Constraints"); + dataNamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomAD Constraint"); dataNamesMap.put(GO_ANNOTATION_DATA, "EBI Gene Ontology Annotation"); dataNamesMap.put(PROTEIN_DATA, "Protein"); dataNamesMap.put(UNIPROT_DATA, "UniProt"); @@ -372,10 +373,15 @@ public final class EtlCommons { dataNamesMap.put(PUBMED_DATA, "PubMed"); dataNamesMap.put(PHARMACOGENOMICS_DATA, "Pharmacogenomics"); dataNamesMap.put(PHARMGKB_DATA, "PharmGKB"); - dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Scores"); + dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Score"); dataNamesMap.put(CADD_DATA, "CADD"); - dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Scores"); + dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Score"); dataNamesMap.put(REVEL_DATA, "Revel"); + dataNamesMap.put(CLINICAL_VARIANT_DATA, 
"Clinical Variant"); + dataNamesMap.put(CLINVAR_DATA, "ClinVar"); + dataNamesMap.put(COSMIC_DATA, "Cosmic"); + dataNamesMap.put(HGMD_DATA, "HGMD"); + dataNamesMap.put(GWAS_DATA, "GWAS Catalog"); // Populate data categories map dataCategoriesMap.put(ENSEMBL_DATA, "Gene"); @@ -413,6 +419,10 @@ public final class EtlCommons { dataCategoriesMap.put(PHARMGKB_DATA, dataNamesMap.get(PHARMACOGENOMICS_DATA)); dataCategoriesMap.put(CADD_DATA, dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA)); dataCategoriesMap.put(REVEL_DATA, dataNamesMap.get(MISSENSE_VARIATION_SCORE_DATA)); + dataCategoriesMap.put(CLINVAR_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(COSMIC_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(HGMD_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); + dataCategoriesMap.put(GWAS_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA)); // Populate data version filenames Map dataVersionFilenamesMap.put(ENSEMBL_DATA, "ensemblCore" + SUFFIX_VERSION_FILENAME); @@ -450,6 +460,10 @@ public final class EtlCommons { dataVersionFilenamesMap.put(PHARMGKB_DATA, "pharmGKB" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME); dataVersionFilenamesMap.put(REVEL_DATA, "revel" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(CLINVAR_DATA, "clinVar" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(COSMIC_DATA, "cosmic" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(HGMD_DATA, "hgmd" + SUFFIX_VERSION_FILENAME); + dataVersionFilenamesMap.put(GWAS_DATA, "gwas" + SUFFIX_VERSION_FILENAME); } private EtlCommons() { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index 7c4e331f18..7ac8bcf800 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -52,9 +52,9 @@ public abstract class AbstractDownloadManager { protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; - protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {} done!"; + protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Ok. {}"; protected static final String CATEGORY_DOWNLOADING_LOG_MESSAGE = "Downloading {}/{} ..."; - protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Downloading {}/{} done!"; + protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Ok. {}/{}"; protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; protected String species; @@ -195,8 +195,8 @@ protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.Ensem DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath); // Save data source - saveDataSource(data, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), - outPath.resolve(getDataVersionFilename(data))); + saveDataSource(data, "(" + getDataName(ENSEMBL_DATA) + " " + ensemblVersion + ")", getTimeStamp(), + Collections.singletonList(downloadFile.getUrl()), outPath.resolve(getDataVersionFilename(data))); return downloadFile; } @@ -226,7 +226,9 @@ protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props String url = EtlCommons.getUrl(props, fileId, species, assembly, chromosome); File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); - return downloadFile(url, outFile.toString()); + DownloadFile downloadFile = downloadFile(url, outFile.toString()); + logger.info(OK_LOG_MESSAGE); + return downloadFile; } protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, 
Path outPath) @@ -240,7 +242,9 @@ protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblPrope chromosome); File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile(); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile); - return downloadFile(url, outFile.toString()); + DownloadFile downloadFile = downloadFile(url, outFile.toString()); + logger.info(OK_LOG_MESSAGE); + return downloadFile; } protected void saveDataSource(String data, String version, String date, List urls, Path versionFilePath) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index 77f658626a..9fd0e7562c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -20,12 +20,8 @@ import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; -import java.io.BufferedReader; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.PrintWriter; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -50,114 +46,56 @@ public List download() throws IOException, InterruptedException, C } public List downloadClinical() throws IOException, InterruptedException, CellBaseException { - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY).toAbsolutePath(); - Files.createDirectories(clinicalFolder); - logger.info("Downloading clinical information at {} ...", clinicalFolder); - - String url; - List urls; - Path outPath; - DownloadProperties.URLProperties props; - - DownloadFile downloadFile; - List 
downloadFiles = new ArrayList<>(); - - // COSMIC - logger.warn("{} files must be downloaded manually !", COSMIC_NAME); - props = configuration.getDownload().getCosmic(); - urls = Collections.singletonList(props.getHost() + props.getFiles().get(COSMIC_FILE_ID)); - // Save data source - saveDataSource(EtlCommons.CLINICAL_VARIANTS_DATA, COSMIC_NAME, props.getVersion(), getTimeStamp(), urls, - clinicalFolder.resolve(COSMIC_VERSION_FILENAME)); - - // HGMD - logger.warn("{} files must be downloaded manually !", HGMD_NAME); - props = configuration.getDownload().getHgmd(); - urls = Collections.singletonList(props.getHost() + props.getFiles().get(HGMD_FILE_ID)); - // Save data source - saveDataSource(EtlCommons.CLINICAL_VARIANTS_DATA, HGMD_NAME, props.getVersion(), getTimeStamp(), urls, - clinicalFolder.resolve(HGMD_VERSION_FILENAME)); - - // GWAS catalog - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_NAME, - CLINICAL_VARIANTS_DATA, GWAS_VERSION_FILENAME, clinicalFolder); + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + logger.info("{} not supported for the species {}", getDataName(CLINICAL_VARIANT_DATA), + speciesConfiguration.getScientificName()); + return Collections.emptyList(); + } + + // Create clinical directory + Path clinicalPath = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANT_DATA).toAbsolutePath(); + Files.createDirectories(clinicalPath); + + DownloadFile downloadFile; + List downloadFiles = new ArrayList<>(); + + // ClinVar + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINVAR_DATA)); + DownloadProperties.URLProperties props = configuration.getDownload().getClinvar(); + List urls = new ArrayList<>(); + for (String fileId : Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, + CLINVAR_EFO_TERMS_FILE_ID)) { + downloadFile = downloadDataSource(props, 
fileId, clinicalPath); downloadFiles.add(downloadFile); - // ClinVar - logger.info("Downloading {}} files ...", CLINVAR_NAME); - props = configuration.getDownload().getClinvar(); - urls = new ArrayList<>(); - for (String fileId : Arrays.asList(CLINVAR_FULL_RELEASE_FILE_ID, CLINVAR_SUMMARY_FILE_ID, CLINVAR_ALLELE_FILE_ID, - CLINVAR_EFO_TERMS_FILE_ID)) { - url = props.getHost() + props.getFiles().get(fileId); - outPath = clinicalFolder.resolve(getFilenameFromUrl(url)); - logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outPath); - downloadFiles.add(downloadFile(url, outPath.toString())); - urls.add(url); - } - // Save data source - saveDataSource(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, props.getVersion(), getTimeStamp(), urls, - clinicalFolder.resolve(CLINVAR_VERSION_FILENAME)); - - // Prepare CliVar chunk files - Path chunksPath = clinicalFolder.resolve(CLINVAR_CHUNKS_SUBDIRECTORY); - if (Files.notExists(chunksPath)) { - Files.createDirectories(chunksPath); - Path clinvarPath = clinicalFolder.resolve(getFilenameFromUrl( - props.getHost() + props.getFiles().get(CLINVAR_FULL_RELEASE_FILE_ID))); - logger.info("Splitting {} in {} ...", clinvarPath, chunksPath); - splitClinvar(clinvarPath, chunksPath); - } - - return downloadFiles; + // Save URLs to be written in the version file + urls.add(downloadFile.getUrl()); } - return Collections.emptyList(); - } + // Save data source + saveDataSource(CLINVAR_DATA, props.getVersion(), getTimeStamp(), urls, clinicalPath.resolve(getDataVersionFilename(CLINVAR_DATA))); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CLINVAR_DATA)); + + // COSMIC + logger.warn("{} files must be downloaded manually !", getDataName(COSMIC_DATA)); + props = configuration.getDownload().getCosmic(); + String url = props.getHost() + props.getFiles().get(COSMIC_FILE_ID); + saveDataSource(COSMIC_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + clinicalPath.resolve(getDataVersionFilename(COSMIC_DATA))); + + // HGMD 
+ logger.warn("{} files must be downloaded manually !", getDataName(HGMD_DATA)); + props = configuration.getDownload().getHgmd(); + url = props.getHost() + props.getFiles().get(HGMD_FILE_ID); + saveDataSource(HGMD_DATA, props.getVersion(), getTimeStamp(), Collections.singletonList(url), + clinicalPath.resolve(getDataVersionFilename(HGMD_DATA))); + + // GWAS catalog + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GWAS_DATA)); + downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGwasCatalog(), GWAS_FILE_ID, GWAS_DATA, clinicalPath); + downloadFiles.add(downloadFile); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GWAS_DATA)); - private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { - PrintWriter pw = null; - try (BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath)) { - StringBuilder header = new StringBuilder(); - boolean beforeEntry = true; - boolean inEntry = false; - int count = 0; - int chunk = 0; - String line; - while ((line = br.readLine()) != null) { - if (line.trim().startsWith("")) { - inEntry = false; - if (count % 10000 == 0) { - if (pw != null) { - pw.print(""); - pw.close(); - } - chunk++; - } - } - } - if (pw != null) { - pw.print(""); - pw.close(); - } - } + return downloadFiles; } } From f5b7c34d17815b50e4112fde967b9a055a459d49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 10 May 2024 11:09:44 +0200 Subject: [PATCH 106/107] lib: update clinical variant builder by including the split ClinVar file to the build step, adding checks and log messages, fixing sonnar issues,... 
#TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 37 ++-- .../lib/builders/CellBaseBuilder.java | 52 ++++- .../clinical/variant/ClinVarIndexer.java | 4 +- .../clinical/variant/ClinicalIndexer.java | 2 +- .../variant/ClinicalVariantBuilder.java | 206 +++++++++++++----- .../clinical/variant/CosmicIndexer.java | 2 +- .../clinical/variant/HGMDIndexer.java | 2 +- .../variant/ClinicalVariantBuilderTest.java | 6 +- 8 files changed, 222 insertions(+), 89 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 081880ebe3..899e5f52d3 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -39,7 +39,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -65,7 +64,7 @@ public class BuildCommandExecutor extends CommandExecutor { private SpeciesConfiguration speciesConfiguration; private static final List VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, - MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANTS_DATA, REPEATS_DATA, + MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA); public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { @@ -150,7 +149,7 @@ public void execute() throws CellBaseException { case CONSERVATION_DATA: parser = buildConservation(); break; - case CLINICAL_VARIANTS_DATA: + case 
CLINICAL_VARIANT_DATA: parser = buildClinicalVariants(); break; case REPEATS_DATA: @@ -306,30 +305,24 @@ private CellBaseBuilder buildConservation() throws CellBaseException { } private CellBaseBuilder buildClinicalVariants() throws CellBaseException { - Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_SUBDIRECTORY); - - List versionFiles = new ArrayList<>(); - List versionFilenames = Arrays.asList(CLINVAR_VERSION_FILENAME, COSMIC_VERSION_FILENAME, GWAS_VERSION_FILENAME, - HGMD_VERSION_FILENAME); - for (String versionFilename : versionFilenames) { - Path versionFile = clinicalVariantFolder.resolve(versionFilename); - if (!versionFile.toFile().exists()) { - throw new CellBaseException("Could not build clinical variants because of the file " + versionFilename + " does not exist"); - } - versionFiles.add(versionFile); - } - copyVersionFiles(versionFiles); - - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, - EtlCommons.CLINICAL_VARIANTS_JSON_FILE.replace(".json.gz", ""), true); - return new ClinicalVariantBuilder(clinicalVariantFolder, normalize, getFastaReferenceGenome(), + // Sanity check + Path clinicalDownloadPath = downloadFolder.resolve(CLINICAL_VARIANT_DATA); + Path clinicalBuildPath = buildFolder.resolve(CLINICAL_VARIANT_DATA); + copyVersionFiles(Arrays.asList(clinicalDownloadPath.resolve(getDataVersionFilename(CLINVAR_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(COSMIC_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(HGMD_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(GWAS_DATA))), clinicalBuildPath); + + // Create the file serializer and the clinical variants builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(clinicalBuildPath, CLINICAL_VARIANTS_BASENAME, true); + return new ClinicalVariantBuilder(clinicalDownloadPath, normalize, getFastaReferenceGenome(), buildCommandOptions.assembly == null ? 
getDefaultHumanAssembly() : buildCommandOptions.assembly, configuration, serializer); } private String getDefaultHumanAssembly() { for (SpeciesConfiguration species : configuration.getSpecies().getVertebrates()) { - if (species.getId().equals("hsapiens")) { + if (species.getId().equals(HSAPIENS_NAME)) { return species.getAssemblies().get(0).getName(); } } @@ -461,7 +454,7 @@ private List checkDataSources() { case REGULATION_DATA: case PROTEIN_DATA: case CONSERVATION_DATA: - case CLINICAL_VARIANTS_DATA: + case CLINICAL_VARIANT_DATA: case REPEATS_DATA: case ONTOLOGY_DATA: case SPLICE_SCORE_DATA: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java index 26fb2e838b..fe1b5fe648 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java @@ -19,17 +19,21 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; import org.apache.commons.lang3.StringUtils; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; @@ -48,7 +52,7 @@ public abstract class CellBaseBuilder { protected Logger logger; public static final String CHECKING_BEFORE_BUILDING_LOG_MESSAGE = "Checking files before building {} ..."; - public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE 
= "Checking done!"; + public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking {} done!"; public static final String BUILDING_LOG_MESSAGE = "Building {} ..."; public static final String BUILDING_DONE_LOG_MESSAGE = "Building done!"; @@ -59,7 +63,6 @@ public abstract class CellBaseBuilder { public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done!"; - public CellBaseBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); @@ -79,6 +82,24 @@ public void disconnect() { } } + protected File checkFile(String data, DownloadProperties.URLProperties props, String fileId, Path targetPath) throws CellBaseException { + logger.info("Checking file {}/{} ...", getDataName(data), fileId); + if (!props.getFiles().containsKey(fileId)) { + throw new CellBaseException("File ID " + fileId + " does not exist in the configuration file in the section '" + data + "'"); + } + if (!Files.exists(targetPath)) { + throw new CellBaseException("Folder does not exist " + targetPath); + } + + String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString(); + Path filePath = targetPath.resolve(filename); + if (!Files.exists(filePath)) { + throw new CellBaseException(getDataName(data) + " file " + filePath + " does not exist"); + } + logger.info("Ok."); + return filePath.toFile(); + } + protected List checkFiles(String data, Path downloadPath, int expectedFiles) throws CellBaseException, IOException { return checkFiles(getDataName(data), data, downloadPath, expectedFiles); } @@ -94,7 +115,7 @@ protected List checkFiles(String label, String data, Path downloadPath, in } protected List checkFiles(DataSource dataSource, Path targetPath, String name) throws CellBaseException { - logger.info("Checking {} folder and files", name); + logger.info("Checking {} folder and files ...", name); if (!targetPath.toFile().exists()) { throw new 
CellBaseException(name + " folder does not exist " + targetPath); } @@ -110,7 +131,30 @@ protected List checkFiles(DataSource dataSource, Path targetPath, String n files.add(file); } } - + logger.info("Ok."); return files; } + + protected Path getIndexFastaReferenceGenome(Path fastaPath) throws CellBaseException { + Path indexFastaPath = Paths.get(fastaPath + FAI_EXTENSION); + if (!Files.exists(indexFastaPath)) { + // Index FASTA file + logger.info("Indexing FASTA file {} ...", fastaPath); + String errorMsg = "Error executing 'samtools faidx' for FASTA file "; + try { + List params = Arrays.asList("faidx", fastaPath.toString()); + EtlCommons.runCommandLineProcess(null, "samtools", params, null); + } catch (IOException e) { + throw new CellBaseException(errorMsg + fastaPath, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException(errorMsg + fastaPath, e); + } + if (!Files.exists(indexFastaPath)) { + throw new CellBaseException("It could not index the FASTA file " + fastaPath + ". 
Please, try to do it manually!"); + } + } + return indexFastaPath; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java index 7e5baa9e6d..951ea5c530 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java @@ -309,7 +309,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, String variation String mateVariantString, String clinicalHaplotypeString, Map traitsToEfoTermsMap) { - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_NAME, version, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, null); // Create a set to avoid situations like germline;germline;germline List alleleOrigin = null; if (!EtlCommons.isMissing(lineFields[VARIANT_SUMMARY_ORIGIN_COLUMN])) { @@ -390,7 +390,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu throws JsonProcessingException { List additionalProperties = new ArrayList<>(3); - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_NAME, version, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, null); // String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); VariantClassification variantClassification = getVariantClassification( diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java index bbe33017fd..3f6e87b89c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java @@ -83,7 +83,7 @@ public ClinicalIndexer(Path genomeSequenceFilePath) throws IOException { .setDecomposeMNVs(true); if (genomeSequenceFilePath != null) { - logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath.toString()); + logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath); variantNormalizerConfig.enableLeftAlign(genomeSequenceFilePath.toString()); } else { logger.info("Left alignment is NOT enabled."); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java index 41b701fdbe..e3c7ab3ff8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java @@ -23,100 +23,155 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; -import java.io.File; -import java.io.IOException; +import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by fjlopez on 26/09/16. 
*/ public class ClinicalVariantBuilder extends CellBaseBuilder { - private final Path clinicalVariantFolder; + private final Path clinicalVariantPath; private final String assembly; private final Path genomeSequenceFilePath; private boolean normalize; + private Path clinvarFullReleaseFilePath; + private Path clinvarSummaryFilePath; + private Path clinvarVariationAlleleFilePath; + private Path clinvarEFOFilePath; + private Path cosmicFilePath; + private Path hgmdFilePath; + private Path gwasFilePath; + private Path gwasDbSnpFilePath; + private final CellBaseConfiguration configuration; public ClinicalVariantBuilder(Path clinicalVariantFolder, boolean normalize, Path genomeSequenceFilePath, String assembly, CellBaseConfiguration configuration, CellBaseSerializer serializer) { super(serializer); - this.clinicalVariantFolder = clinicalVariantFolder; + this.clinicalVariantPath = clinicalVariantFolder; this.normalize = normalize; this.genomeSequenceFilePath = genomeSequenceFilePath; this.assembly = assembly; this.configuration = configuration; } + public void check() throws CellBaseException, IOException { + if (checked) { + return; + } + + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + + // Sanity check + checkDirectory(clinicalVariantPath, getDataName(CLINICAL_VARIANT_DATA)); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); + } + } + + // Check genome file + logger.info("Checking genome FASTA file ..."); + if (!Files.exists(genomeSequenceFilePath)) { + throw new CellBaseException("Genome file path does not exist " + genomeSequenceFilePath); + } + logger.info(OK_LOG_MESSAGE); + logger.info("Checking index for genome FASTA file ..."); + getIndexFastaReferenceGenome(genomeSequenceFilePath); + logger.info(OK_LOG_MESSAGE); + + // Check ClinVar files + 
clinvarFullReleaseFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_FULL_RELEASE_FILE_ID, + clinicalVariantPath).toPath(); + clinvarSummaryFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_SUMMARY_FILE_ID, + clinicalVariantPath).toPath(); + clinvarVariationAlleleFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_ALLELE_FILE_ID, + clinicalVariantPath).toPath(); + clinvarEFOFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_EFO_TERMS_FILE_ID, + clinicalVariantPath).toPath(); + + // Check COSMIC file + cosmicFilePath = checkFiles(COSMIC_DATA, clinicalVariantPath, 1).get(0).toPath(); + + // Check HGMD file + hgmdFilePath = checkFiles(HGMD_DATA, clinicalVariantPath, 1).get(0).toPath(); + + // Check GWAS files + gwasFilePath = checkFiles(GWAS_DATA, clinicalVariantPath, 1).get(0).toPath(); + String dbSnpFilename = Paths.get(configuration.getDownload().getGwasCatalog().getFiles().get(GWAS_DBSNP_FILE_ID)).getFileName() + .toString(); + gwasDbSnpFilePath = clinicalVariantPath.resolve(dbSnpFilename); + if (!Files.exists(gwasDbSnpFilePath)) { + throw new CellBaseException("Could not build clinical variants: the dbSNP file " + dbSnpFilename + " is missing at " + + clinicalVariantPath); + } + if (!Files.exists(clinicalVariantPath.resolve(dbSnpFilename + TBI_EXTENSION))) { + throw new CellBaseException("Could not build clinical variants: the dbSNP tabix file " + dbSnpFilename + TBI_EXTENSION + + " is missing at " + clinicalVariantPath); + } + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + checked = true; + } + public void parse() throws IOException, RocksDBException, CellBaseException { + check(); + + // Prepare ClinVar chunk files before building (if necessary) + Path chunksPath = serializer.getOutdir().resolve(CLINVAR_CHUNKS_SUBDIRECTORY); + if (Files.notExists(chunksPath)) { + 
Files.createDirectories(chunksPath); + logger.info("Splitting CliVar file {} in {} ...", clinvarFullReleaseFilePath, chunksPath); + splitClinvar(clinvarFullReleaseFilePath, chunksPath); + logger.info(OK_LOG_MESSAGE); + } + RocksDB rdb = null; Options dbOption = null; String dbLocation = null; try { - Object[] dbConnection = getDBConnection(clinicalVariantFolder.toString() + "/integration.idx", true); + Object[] dbConnection = getDBConnection(clinicalVariantPath.toString() + "/integration.idx", true); rdb = (RocksDB) dbConnection[0]; dbOption = (Options) dbConnection[1]; dbLocation = (String) dbConnection[2]; // COSMIC - // IMPORTANT: COSMIC must be indexed first (before ClinVar, IARC TP53, DOCM, HGMD,...)!!! - Path cosmicFile = clinicalVariantFolder.resolve(configuration.getDownload().getCosmic().getFiles().get(0)); - if (cosmicFile != null && Files.exists(cosmicFile)) { - CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFile, configuration.getDownload().getCosmic().getVersion(), - normalize, genomeSequenceFilePath, assembly, rdb); - cosmicIndexer.index(); - } else { - throw new CellBaseException("Could not build clinical variants: the COSMIC file " + cosmicFile + " is missing"); - } + // IMPORTANT: COSMIC must be indexed first (before ClinVar, HGMD,...)!!! 
+ CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFilePath, configuration.getDownload().getCosmic().getVersion(), + normalize, genomeSequenceFilePath, assembly, rdb); + cosmicIndexer.index(); // ClinVar - Path clinvarXMLFile = getPathFromHost(configuration.getDownload().getClinvar().getHost()); - Path clinvarSummaryFile = getPathFromHost(configuration.getDownload().getClinvarSummary().getHost()); - Path clinvarVariationAlleleFile = getPathFromHost(configuration.getDownload().getClinvarVariationAllele().getHost()); - Path clinvarEFOFile = getPathFromHost(configuration.getDownload().getClinvarEfoTerms().getHost()); - ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile, - clinvarVariationAlleleFile, clinvarEFOFile, configuration.getDownload().getClinvar().getVersion(), normalize, - genomeSequenceFilePath, assembly, rdb); + ClinVarIndexer clinvarIndexer = new ClinVarIndexer(serializer.getOutdir().resolve(CLINVAR_CHUNKS_SUBDIRECTORY), + clinvarSummaryFilePath, clinvarVariationAlleleFilePath, clinvarEFOFilePath, configuration.getDownload().getClinvar() + .getVersion(), normalize, genomeSequenceFilePath, assembly, rdb); clinvarIndexer.index(); // HGMD - Path hgmdFile = clinicalVariantFolder.resolve(configuration.getDownload().getHgmd().getFiles().get(0)); - if (hgmdFile != null && Files.exists(hgmdFile)) { - HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFile, configuration.getDownload().getHgmd().getVersion(), normalize, - genomeSequenceFilePath, assembly, rdb); - hgmdIndexer.index(); - } else { - throw new CellBaseException("Could not build clinical variants: the HGMD file " + hgmdFile + " is missing"); - } + HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFilePath, configuration.getDownload().getHgmd().getVersion(), normalize, + genomeSequenceFilePath, assembly, rdb); + hgmdIndexer.index(); // GWAS catalog - Path gwasFile = 
clinicalVariantFolder.resolve(Paths.get(configuration.getDownload().getGwasCatalog().getHost()).getFileName()); - if (gwasFile != null && Files.exists(gwasFile)) { - Path dbsnpFile = clinicalVariantFolder.resolve(configuration.getDownload().getGwasCatalog().getFiles().get(0)); - if (dbsnpFile != null && Files.exists(dbsnpFile)) { - Path tabixFile = Paths.get(dbsnpFile.toAbsolutePath() + ".tbi"); - if (tabixFile != null && Files.exists(tabixFile)) { - GwasIndexer gwasIndexer = new GwasIndexer(gwasFile, dbsnpFile, genomeSequenceFilePath, assembly, rdb); - gwasIndexer.index(); - } else { - throw new CellBaseException("Could not build clinical variants: the dbSNP tabix file " + tabixFile + " is missing"); - } - } else { - throw new CellBaseException("Could not build clinical variants: the dbSNP file " + dbsnpFile + " is missing"); - } - } else { - throw new CellBaseException("Could not build clinical variants: the GWAS catalog file " + gwasFile + " is missing"); - } + GwasIndexer gwasIndexer = new GwasIndexer(gwasFilePath, gwasDbSnpFilePath, genomeSequenceFilePath, assembly, rdb); + gwasIndexer.index(); + // Serialize serializeRDB(rdb); closeIndex(rdb, dbOption, dbLocation); serializer.close(); @@ -127,14 +182,6 @@ public void parse() throws IOException, RocksDBException, CellBaseException { } } - private Path getPathFromHost(String host) throws CellBaseException { - Path path = clinicalVariantFolder.resolve(Paths.get(host).getFileName()); - if (!Files.exists(path)) { - throw new CellBaseException("Could not build clinical variants. 
The file " + path + " is missing"); - } - return path; - } - private void serializeRDB(RocksDB rdb) throws IOException { // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's // named "iterator" @@ -169,7 +216,7 @@ private Variant parseVariantFromVariantId(String variantId) { return new Variant(parts[0].trim(), Integer.parseInt(parts[1].trim()), parts[2], parts[3]); } } catch (Exception e) { - logger.warn(e.getMessage() + ". Impossible to create the variant object from the variant ID: " + variantId); + logger.warn("{}. Impossible to create the variant object from the variant ID: {}", e.getMessage(), variantId); return null; } } @@ -221,4 +268,53 @@ private Object[] getDBConnection(String dbLocation, boolean forceCreate) { } + private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { + PrintWriter pw = null; + try (BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath)) { + StringBuilder header = new StringBuilder(); + boolean beforeEntry = true; + boolean inEntry = false; + int count = 0; + int chunk = 0; + String line; + while ((line = br.readLine()) != null) { + if (line.trim().startsWith("")) { + inEntry = false; + if (count % 10000 == 0) { + if (pw != null) { + pw.print(""); + pw.close(); + } + chunk++; + } + } + } + if (pw != null) { + pw.print(""); + pw.close(); + } + } finally { + if (pw != null) { + pw.close(); + } + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java index c772501738..51be2b6f31 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java @@ -471,7 +471,7 @@ private EvidenceEntry buildCosmic(String[] fields) { String id = 
fields[ID_COLUMN]; String url = "https://cancer.sanger.ac.uk/cosmic/search?q=" + id; - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_NAME, version, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_DATA, version, null); SomaticInformation somaticInformation = getSomaticInformation(fields); List genomicFeatureList = getGenomicFeature(fields); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java index 2c0d2b3d27..f132f4b9e8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java @@ -95,7 +95,7 @@ private void parseHgmdInfo(Variant variant) { } // Source - entry.setSource(new EvidenceSource(EtlCommons.HGMD_NAME, version, null)); + entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, version, null)); // Assembly entry.setAssembly(assembly); diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java index fc5df3af35..aea3b9e7fe 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java @@ -89,7 +89,7 @@ public void noNormaliseTest() throws Exception { .getResource("/variant/annotation/clinicalVariant/ClinVarFullRelease_2020-02.xml.gz").toURI()).toFile(), clinicalVariantChunksFolder.resolve("ClinVarFullRelease_2020-02.xml.gz").toFile()); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANTS_DATA, true); + 
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANT_DATA, true); (new ClinicalVariantBuilder(clinicalVariantFolder, false, genomeSequenceFilePath, "GRCh37", null, serializer)).parse(); List parsedVariantList = loadSerializedVariants("/tmp/" + EtlCommons.CLINICAL_VARIANTS_JSON_FILE); @@ -145,7 +145,7 @@ public void parseMNVTest() throws Exception { Path genomeSequenceFilePath = clinicalVariantFolder.resolve("Homo_sapiens.GRCh37.75.dna.primary_assembly.chr17.fa.gz"); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANTS_DATA, true); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANT_DATA, true); (new ClinicalVariantBuilder(clinicalVariantFolder, true, genomeSequenceFilePath, "GRCh37", null, serializer)).parse(); List parsedVariantList = loadSerializedVariants("/tmp/" + EtlCommons.CLINICAL_VARIANTS_JSON_FILE); @@ -230,7 +230,7 @@ public void parse() throws Exception { .getResource("/variant/annotation/clinicalVariant/ClinVarFullRelease_2020-02.xml.gz").toURI()).toFile(), clinicalVariantChunksFolder.resolve("ClinVarFullRelease_2020-02.xml.gz").toFile()); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANTS_DATA, true); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANT_DATA, true); (new ClinicalVariantBuilder(clinicalVariantFolder, true, genomeSequenceFilePath, "GRCh37", null, serializer)).parse(); List parsedVariantList = loadSerializedVariants("/tmp/" + EtlCommons.CLINICAL_VARIANTS_JSON_FILE); From a4fca6bb6b3c82b8d82422226322fa8af06c41b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 10 May 2024 11:10:36 +0200 Subject: [PATCH 107/107] lib: update code to the last changes, #TASK-5564 --- 
.../cellbase/app/cli/admin/AdminCliOptionsParser.java | 4 ++-- .../app/cli/admin/executors/ExportCommandExecutor.java | 6 +++--- .../app/cli/admin/executors/LoadCommandExecutor.java | 6 +++--- .../opencb/cellbase/lib/builders/EnsemblGeneBuilder.java | 2 +- .../opencb/cellbase/lib/builders/RefSeqGeneBuilder.java | 2 +- .../cellbase/lib/download/GenomeDownloadManager.java | 7 +++++++ .../cellbase/lib/download/PharmGKBDownloadManager.java | 1 + .../cellbase/lib/download/PubMedDownloadManager.java | 1 + 8 files changed, 19 insertions(+), 10 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 1bda7d2793..15396663a4 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -91,7 +91,7 @@ public class DownloadCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: " + GENOME_DATA + "," + GENE_DATA + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA - + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANTS_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + PUBMED_DATA + + "," + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to download everything", required = true, arity = 1) public String data; @@ -108,7 +108,7 @@ public class BuildCommandOptions { @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: " + GENOME_DATA + "," + GENE_DATA + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + MISSENSE_VARIATION_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + "," - + CONSERVATION_DATA + "," + 
CLINICAL_VARIANTS_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA + + CONSERVATION_DATA + "," + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + "; or use 'all' to build everything", required = true, arity = 1) public String data; diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 85446fac1f..4fba479a36 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -85,7 +85,7 @@ public ExportCommandExecutor(AdminCliOptionsParser.ExportCommandOptions exportCo this.dataToExport = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, + EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANT_DATA, EtlCommons.REPEATS_DATA, ONTOLOGY_DATA, MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { this.dataToExport = exportCommandOptions.data.split(","); @@ -293,7 +293,7 @@ public void execute() throws CellBaseException { counterMsg = counter + " protein functional predictions"; break; } - case EtlCommons.CLINICAL_VARIANTS_DATA: { + case EtlCommons.CLINICAL_VARIANT_DATA: { counter = exportClinicalVariantData(regions); counterMsg = counter + " clinical variants"; break; @@ -424,7 +424,7 @@ private String exportPharmacogenomicsData(List genes) private int exportClinicalVariantData(List 
regions) throws CellBaseException, QueryException, IllegalAccessException, IOException { - String baseFilename = CLINICAL_VARIANTS_DATA + ".full"; + String baseFilename = CLINICAL_VARIANT_DATA + ".full"; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, baseFilename); ClinicalManager clinicalManager = managerFactory.getClinicalManager(species, assembly); ClinicalVariantQuery query = new ClinicalVariantQuery(); diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 166c4e7a6f..0eb53b4ad4 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -81,7 +81,7 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO loadOptions = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, + EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANT_DATA, EtlCommons.REPEATS_DATA, EtlCommons.ONTOLOGY_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { @@ -257,7 +257,7 @@ public void execute() throws CellBaseException { loadProteinFunctionalPrediction(); break; } - case EtlCommons.CLINICAL_VARIANTS_DATA: { + case EtlCommons.CLINICAL_VARIANT_DATA: { // Load data, create index and update release loadClinical(); break; @@ -461,7 +461,7 @@ private void loadClinical() throws FileNotFoundException { input.resolve("cosmicVersion.json"), 
input.resolve("gwasVersion.json") )); - dataReleaseManager.update(dataRelease, "clinical_variants", EtlCommons.CLINICAL_VARIANTS_DATA, sources); + dataReleaseManager.update(dataRelease, "clinical_variants", EtlCommons.CLINICAL_VARIANT_DATA, sources); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException e) { logger.error(e.toString()); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index a7e6b9f1cf..d6b935fa52 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -183,7 +183,7 @@ public void check() throws Exception { // Check genome fasta file genomeSequenceFilePath = checkFiles(GENOME_DATA, downloadPath.getParent().getParent().resolve(GENOME_DATA), 1).get(0).toPath(); - logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE); + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); checked = true; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 56e1edd6ff..8f03a801f2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -137,7 +137,7 @@ public void check() throws Exception { throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist"); } - logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE); + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); checked = true; } diff 
--git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 289ec23258..9b967eb052 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -102,6 +102,7 @@ public List downloadConservation() throws IOException, Interrupted outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phastConsUrl, outputPath); downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); phastconsUrls.add(phastConsUrl); // PhyloP @@ -112,6 +113,7 @@ public List downloadConservation() throws IOException, Interrupted outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, phyloPUrl, outputPath); downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); phyloPUrls.add(phyloPUrl); } @@ -123,6 +125,8 @@ public List downloadConservation() throws IOException, Interrupted outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, gerpUrl, outputPath); downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); + // Save data version saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, @@ -162,6 +166,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep Path outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); saveDataSource(TRF_DATA, configuration.getDownload().getSimpleRepeats().getVersion(), 
getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(TRF_DATA))); @@ -171,6 +176,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); saveDataSource(GSD_DATA, configuration.getDownload().getGenomicSuperDups().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(GSD_DATA))); @@ -181,6 +187,7 @@ public List downloadRepeats() throws IOException, InterruptedExcep outputPath = repeatsFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outputPath); downloadFiles.add(downloadFile(url, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); saveDataSource(WM_DATA, configuration.getDownload().getWindowMasker().getVersion(), getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(getDataVersionFilename(WM_DATA))); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java index 2eeac8415f..25ad390650 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -55,6 +55,7 @@ public List download() throws IOException, InterruptedException, C Path downloadedFilePath = pharmgkbDownloadFolder.resolve(getFilenameFromUrl(url)); logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, downloadedFilePath); DownloadFile downloadFile = downloadFile(url, downloadedFilePath.toString()); + logger.info(OK_LOG_MESSAGE); downloadFiles.add(downloadFile); } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java index 6451fd76aa..9006be7a7d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PubMedDownloadManager.java @@ -51,6 +51,7 @@ public List download() throws IOException, InterruptedException, C String url = host + filename; logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, pubmedDownloadFolder.resolve(filename)); downloadFiles.add(downloadFile(url, pubmedDownloadFolder.resolve(filename).toString())); + logger.info(OK_LOG_MESSAGE); } // Save data source