loinc · joeflack4 · Oct 28, 2024 · Nov 10, 2024 · Nov 12, 2024 · joeflack4
diff --git a/.gitignore b/.gitignore
@@ -158,3 +158,8 @@ test/input/*
 !test/input/.keep
 test/output/*
 !test/output/.keep
+
+# input: ignore everything directly under input/ except the owl-files/ and data/ directories
+input/*
+!input/owl-files/
+!input/data/
diff --git a/makefile b/makefile
@@ -0,0 +1,65 @@
+.PHONY: all chebi-subsets  # both are command names, not files; `.PHONY=` would only set a variable
+
+# All ------------------------------------------------------------------------------------------------------------------
+all: chebi-subsets
+
+# Analysis -------------------------------------------------------------------------------------------------------------
+# Directories that hold the analysis inputs and outputs; each is created on demand.
+# Rules below list them after `|` (order-only), so they only need to exist and never force a rebuild.
+# One shared recipe serves both targets; `$@` is whichever directory is being made.
+input/analysis/ output/analysis/:
+	mkdir -p $@
+
+# - ChEBI subsets: paths of the LOINC part mappings, the ChEBI download, the term list and both extracted subsets
+CHEBI_OWL := input/analysis/chebi.owl
+PART_MAPPINGS := loinc_release/Loinc_2.78/AccessoryFiles/PartFile/PartRelatedCodeMapping.csv
+CHEBI_MODULE := output/analysis/chebi_module.txt
+CHEBI_OUT_BOT := output/analysis/chebi-subset-BOT.owl
+CHEBI_OUT_MIREOT := output/analysis/chebi-subset-MIREOT.owl
+
+chebi-subsets: $(CHEBI_OUT_BOT) $(CHEBI_OUT_MIREOT)
+
+# The archive is .INTERMEDIATE: make deletes it once chebi.owl is built and does not re-download it while chebi.owl exists.
+.INTERMEDIATE: input/analysis/chebi.owl.gz
+input/analysis/chebi.owl.gz: | input/analysis/
+	wget -O $@ ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz
+input/analysis/chebi.owl: input/analysis/chebi.owl.gz
+	gunzip -c $< > $@
+
+# Term list: for each ChEBI row of the part mappings, print "<CHEBI IRI> # <label>"; label = quoted field after the id.
+$(CHEBI_MODULE): $(PART_MAPPINGS) | output/analysis/
+	awk -F'",' '/ebi\.ac\.uk\/chebi/ { \
+		n = split($$0, parts, "\""); \
+		for (i=1; i<=n; i++) { \
+			if (parts[i] ~ /CHEBI:/) { \
+				id = parts[i]; \
+				gsub(".*CHEBI:", "http://purl.obolibrary.org/obo/CHEBI_", id); \
+				gsub(",.*", "", id); \
+				print id " # " parts[i+2] \
+			} \
+		} \
+	}' $< > $@
+
+# BOT: bottom module, extracted with the SLME (Syntactic Locality Module Extractor)
+# - Source: https://robot.obolibrary.org/extract
+# - Contents: mainly the seed terms, all of their super-classes, and the inter-relations between them. The name comes
+# from viewing the class hierarchy from the BOTTOM upwards.
+# - Size / use: typically medium-sized; pick it when every super-class must be present. It is the most widely used
+# module type and the one to use when in doubt.
+# - Here the seed terms are read from $(CHEBI_MODULE); the ontology being cut down is the first prerequisite (`$<`).
+$(CHEBI_OUT_BOT): $(CHEBI_OWL) $(CHEBI_MODULE)
+	robot extract --method BOT --input $< \
+    --term-file $(CHEBI_MODULE) \
+    --output $@
+
+# MIREOT: Minimum Information to Reference an External Ontology Term
+# - Source: https://robot.obolibrary.org/extract
+# - Boundaries are supplied as term files: --upper-terms sets the upper limit of what is extracted and --lower-terms
+# the lower limit.
+# - When no upper term is given, all terms up to the root (owl:Thing) are returned.
+# - A lower term (or terms) is required; extraction stops there, so no descendants of a lower term are included.
+# - Only lower terms are passed here, from $(CHEBI_MODULE); the ontology read is the first prerequisite (`$<`).
+$(CHEBI_OUT_MIREOT): $(CHEBI_OWL) $(CHEBI_MODULE)
+	robot extract --method MIREOT --input $< \
+    --lower-terms $(CHEBI_MODULE) \
+    --output $@