From 003d12a57a9c89c388e25c40de2be0d019ae2849 Mon Sep 17 00:00:00 2001
From: Joe Flack
Date: Mon, 28 Oct 2024 18:37:48 -0400
Subject: [PATCH 1/3] ChEBI subset

- Add: makefile: To add goals for creating these outputs.
- Update: .gitignore: To include folders for these inputs/outputs.
---
 .gitignore |  5 ++++
 makefile   | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 makefile

diff --git a/.gitignore b/.gitignore
index 59adb45..607ba9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,3 +158,8 @@ test/input/*
 !test/input/.keep
 test/output/*
 !test/output/.keep
+
+# input
+input/*
+!input/owl-files/
+!input/data/
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..76145a9
--- /dev/null
+++ b/makefile
@@ -0,0 +1,83 @@
+.PHONY: all chebi-subsets
+
+# All ------------------------------------------------------------------------------------------------------------------
+all: chebi-subsets
+
+# Analysis -------------------------------------------------------------------------------------------------------------
+input/analysis/:
+	mkdir -p $@
+
+output/analysis/:
+	mkdir -p $@
+
+# - ChEBI subsets
+PART_MAPPINGS=loinc_release/Loinc_2.78/AccessoryFiles/PartFile/PartRelatedCodeMapping.csv
+CHEBI_OWL=input/analysis/chebi.owl
+CHEBI_MODULE=output/analysis/chebi_module.txt
+CHEBI_OUT_BOT=output/analysis/chebi-subset-BOT.owl
+CHEBI_OUT_MIREOT=output/analysis/chebi-subset-MIREOT.owl
+
+chebi-subsets: $(CHEBI_OUT_BOT) $(CHEBI_OUT_MIREOT)
+
+input/analysis/chebi.owl.gz: | input/analysis/
+	wget -O $@ ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz
+
+input/analysis/chebi.owl: input/analysis/chebi.owl.gz
+	gunzip -c $< > $@
+.INTERMEDIATE: input/analysis/chebi.owl.gz  # make deletes it after use; its absence alone does not force a re-download
+
+# Emits '<IRI> # <label>' per ChEBI row. split() on '"' puts each ',' separator at i+1, so the label is parts[i+2].
+$(CHEBI_MODULE): $(PART_MAPPINGS) | output/analysis/
+	awk -F'",' '/ebi\.ac\.uk\/chebi/ { \
+		split($$0, parts, "\""); \
+		for (i=1; i<=NF; i++) { \
+			if (parts[i] ~ /CHEBI:/) { \
+				id = parts[i]; \
+				gsub(".*CHEBI:", "http://purl.obolibrary.org/obo/CHEBI_", id); \
+				gsub(",.*", "", id); \
+				print id " # " parts[i+2] \
+			} \
+		} \
+	}' $< > $@
+
+# BOT: use the SLME (Syntactic Locality Module Extractor) to extract a bottom module
+# - Source: https://robot.obolibrary.org/extract
+# - The BOT, or BOTTOM, -module contains mainly the terms in the seed, plus all their super-classes and the
+# inter-relations between them. The module is called BOT (or BOTTOM) because it takes a view from the BOTTOM of the
+# class-hierarchy upwards. Modules of this type are typically of a medium size and should be used if there is a need to
+# include all super-classes in the module. This is the most widely used module type - when in doubt, use this one.
+$(CHEBI_OUT_BOT): $(CHEBI_OWL) $(CHEBI_MODULE)
+	robot extract --method BOT \
+		--input $(CHEBI_OWL) \
+		--term-file $(CHEBI_MODULE) \
+		--output $@
+
+# MIREOT: Minimum Information to Reference an External Ontology Term
+# - Source: https://robot.obolibrary.org/extract
+# - To specify upper and lower term files, use --upper-terms and --lower-terms. The upper terms are the upper boundaries
+# of what will be extracted. If no upper term is specified, all terms up to the root (owl:Thing) will be returned. The
+# lower term (or terms) is required; this is the limit to what will be extracted, e.g. no descendants of the lower term
+# will be included in the result.
+$(CHEBI_OUT_MIREOT): $(CHEBI_OWL) $(CHEBI_MODULE)
+	robot extract --method MIREOT \
+		--input $(CHEBI_OWL) \
+		--lower-terms $(CHEBI_MODULE) \
+		--output $@
+
+# todo: SPARQL TSV subset: Just the classes themselves?
+# - might require a jinja SPARQL or something like it, populated by the module
+#output/analysis/chebi-terms.tsv: $(CHEBI_OWL) | output/analysis/
+#	robot query -i $< -q src/sparql/mappings.sparql $@
+
+# todo: Subset?
+# - Source: https://robot.obolibrary.org/extract
+# - The subset method extracts a sub-ontology that contains only the seed terms (that you specify with --term and
+# --term-file options) and the relations between them. This method uses the relation-graph to materialize the
+# existential relations among the seed terms. Procedurally, the subset method materializes the input ontology and adds
+# the inferred axioms to the input ontology. Then filters the ontology with the given seed terms. Finally, it reduces
+# the filtered ontology to remove redundant subClassOf axioms.
+#output/analysis/chebi-subset-subsetOnly.owl: $(CHEBI_OWL) $(CHEBI_MODULE)
+#	robot extract --method subset \
+#		--input $(CHEBI_OWL) \
+#		--term-file $(CHEBI_MODULE) \
+#		--output $@

From 9ffec163f4d03f189d4bebff32d37d9b1ffedc61 Mon Sep 17 00:00:00 2001
From: Joe Flack
Date: Sun, 10 Nov 2024 17:07:27 -0500
Subject: [PATCH 2/3] ChEBI subset

- Delete: Alternative, commented out, variations of subsetting ChEBI.
---
 makefile | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/makefile b/makefile
index 76145a9..1fb1511 100644
--- a/makefile
+++ b/makefile
@@ -63,21 +63,3 @@ $(CHEBI_OUT_MIREOT): $(CHEBI_OWL) $(CHEBI_MODULE)
 		--input $(CHEBI_OWL) \
 		--lower-terms $(CHEBI_MODULE) \
 		--output $@
-
-# todo: SPARQL TSV subset: Just the classes themselves?
-# - might require a jinja SPARQL or something like it, populated by the module
-#output/analysis/chebi-terms.tsv: $(CHEBI_OWL) | output/analysis/
-#	robot query -i $< -q src/sparql/mappings.sparql $@
-
-# todo: Subset?
-# - Source: https://robot.obolibrary.org/extract
-# - The subset method extracts a sub-ontology that contains only the seed terms (that you specify with --term and
-# --term-file options) and the relations between them. This method uses the relation-graph to materialize the
-# existential relations among the seed terms. Procedurally, the subset method materializes the input ontology and adds
-# the inferred axioms to the input ontology. Then filters the ontology with the given seed terms. Finally, it reduces
-# the filtered ontology to remove redundant subClassOf axioms.
-#output/analysis/chebi-subset-subsetOnly.owl: $(CHEBI_OWL) $(CHEBI_MODULE)
-#	robot extract --method subset \
-#		--input $(CHEBI_OWL) \
-#		--term-file $(CHEBI_MODULE) \
-#		--output $@

From b450ad6f7d2a1b59a8febff220bc62958430a9f4 Mon Sep 17 00:00:00 2001
From: Joe Flack
Date: Tue, 12 Nov 2024 15:49:39 -0500
Subject: [PATCH 3/3] ChEBI Subset

- Update: Download URI: Changed to PURL
---
 makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/makefile b/makefile
index 1fb1511..9b897fc 100644
--- a/makefile
+++ b/makefile
@@ -12,6 +12,7 @@ output/analysis/:
 
 # - ChEBI subsets
 PART_MAPPINGS=loinc_release/Loinc_2.78/AccessoryFiles/PartFile/PartRelatedCodeMapping.csv
+CHEBI_URI=http://purl.obolibrary.org/obo/chebi/chebi.owl.gz
 CHEBI_OWL=input/analysis/chebi.owl
 CHEBI_MODULE=output/analysis/chebi_module.txt
 CHEBI_OUT_BOT=output/analysis/chebi-subset-BOT.owl
@@ -20,7 +21,7 @@ CHEBI_OUT_MIREOT=output/analysis/chebi-subset-MIREOT.owl
 chebi-subsets: $(CHEBI_OUT_BOT) $(CHEBI_OUT_MIREOT)
 
 input/analysis/chebi.owl.gz: | input/analysis/
-	wget -O $@ ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz
+	wget -O $@ $(CHEBI_URI)
 
 input/analysis/chebi.owl: input/analysis/chebi.owl.gz
 	gunzip -c $< > $@