From fe916148944afe828f619a57c8f22179fc7577de Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Wed, 2 Nov 2022 21:01:03 +0100 Subject: [PATCH] feat: convert clades.json tree into a graph Here I add a script to "flatten" the object hierarchy in `clades.json` into 2 lists: `nodes` and `edges` that represent the same hierarchy in a different way. Every node now has an `id` (an arbitrary string). Every edge is directional and refers to its `source` and `target` nodes by the node `id`. This representation will allow us to build arbitrary directed graphs instead of just trees (in particular, to allow showing recombinant relationships between clades). I also split the composite Nextstrain name strings into the clade, lineages, WHO variant and other components. This should allow a richer display. --- package.json | 4 + src/clades.json | 20 +-- src/clades2.json | 454 +++++++++++++++++++++++++++++++++++++++++++++++ tools/flatten.ts | 164 +++++++++++++++++ tsconfig.json | 8 + yarn.lock | 42 ++++- 6 files changed, 678 insertions(+), 14 deletions(-) create mode 100644 src/clades2.json create mode 100644 tools/flatten.ts diff --git a/package.json b/package.json index 8d70bb9..a610890 100644 --- a/package.json +++ b/package.json @@ -15,6 +15,7 @@ "d3-flextree": "2.1.2", "d3-hierarchy": "3.1.2", "file-saver": "2.0.5", + "lodash": "4.17.21", "next": "12.0.7", "react": "17.0.2", "react-dom": "17.0.2", @@ -25,6 +26,8 @@ "devDependencies": { "@types/d3": "7.1.0", "@types/file-saver": "2.0.4", + "@types/fs-extra": "9.0.13", + "@types/lodash": "4.14.187", "@types/node": "17.0.0", "@types/react": "17.0.37", "@types/react-dom": "17.0.11", @@ -32,6 +35,7 @@ "eslint": "8.4.1", "eslint-config-next": "12.0.7", "eslint-config-prettier": "8.3.0", + "fs-extra": "10.1.0", "prettier": "2.5.1", "ts-node": "10.4.0", "typescript": "4.5.4" diff --git a/src/clades.json b/src/clades.json index 98e6c89..d98cc19 100644 --- a/src/clades.json +++ b/src/clades.json @@ -1,22 +1,18 @@ { "color": "#C8C8C8", 
"name": "19A (B)", - "lineages": ["B"], "children": [ { "color": "#B8B8B8", - "name": "20A", - "lineages": ["B.1"], + "name": "20A (B.1)", "children": [ - { + { "color": "#B0B0B0", - "name": "20B", - "lineages": ["B.1.1"], + "name": "20B (B.1.1)", "children": [ { "color": "#D3B23F", - "name": "21M (Omicron)", - "lineages": ["B.1.1.529"], + "name": "21M (Omicron, B.1.1.529)", "children": [ { "color": "#C5B945", @@ -29,11 +25,10 @@ { "color": "#E59638", "name": "22B (Omicron, BA.5)", - "children": [ + "children":[ { "color": "#DF4628", - "name": "22E (Omicron)", - "lineages": ["BQ.1"] + "name": "22E (Omicron, BQ.1)" } ] }, @@ -141,8 +136,7 @@ }, { "color": "#C0C0C0", - "name": "19B", - "lineages": ["A"] + "name": "19B (A)" } ] } diff --git a/src/clades2.json b/src/clades2.json new file mode 100644 index 0000000..66bf5f2 --- /dev/null +++ b/src/clades2.json @@ -0,0 +1,454 @@ +{ + "nodes": [ + { + "id": "19A (B)", + "color": "#C8C8C8", + "clade": "19A", + "lineages": [ + "B" + ], + "otherNames": [] + }, + { + "id": "19B (A)", + "color": "#C0C0C0", + "clade": "19B", + "lineages": [ + "A" + ], + "otherNames": [] + }, + { + "id": "20A (B.1)", + "color": "#B8B8B8", + "clade": "20A", + "lineages": [ + "B.1" + ], + "otherNames": [] + }, + { + "id": "20B (B.1.1)", + "color": "#B0B0B0", + "clade": "20B", + "lineages": [ + "B.1.1" + ], + "otherNames": [] + }, + { + "id": "20C", + "color": "#A8A8A8", + "clade": "20C" + }, + { + "id": "20D (B.1.1.1)", + "color": "#A0A0A0", + "clade": "20D", + "lineages": [ + "B.1.1.1" + ], + "otherNames": [] + }, + { + "id": "20E (EU1, B.1.177)", + "color": "#989898", + "clade": "20E", + "lineages": [ + "B.1.177" + ], + "otherNames": [ + "EU1" + ] + }, + { + "id": "20F (D.2)", + "color": "#909090", + "clade": "20F", + "lineages": [ + "D.2" + ], + "otherNames": [] + }, + { + "id": "20G (B.1.2)", + "color": "#888888", + "clade": "20G", + "lineages": [ + "B.1.2" + ], + "otherNames": [] + }, + { + "id": "20H (Beta, V2, B.1.351)", + "color": 
"#5E1D9D", + "clade": "20H", + "lineages": [ + "B.1.351" + ], + "who": "Beta", + "version": "V2", + "otherNames": [] + }, + { + "id": "20I (Alpha, V1, B.1.1.7)", + "color": "#4A28B3", + "clade": "20I", + "lineages": [ + "B.1.1.7" + ], + "who": "Alpha", + "version": "V1", + "otherNames": [] + }, + { + "id": "20J (Gamma, V3, P.1)", + "color": "#403DC5", + "clade": "20J", + "lineages": [ + "P.1" + ], + "who": "Gamma", + "version": "V3", + "otherNames": [] + }, + { + "id": "21A (Delta, B.1.617.2)", + "color": "#3F56CE", + "clade": "21A", + "lineages": [ + "B.1.617.2" + ], + "who": "Delta", + "otherNames": [] + }, + { + "id": "21B (Kappa, B.1.617.1)", + "color": "#4E95BD", + "clade": "21B", + "lineages": [ + "B.1.617.1" + ], + "who": "Kappa", + "otherNames": [] + }, + { + "id": "21C (Epsilon, B.1.427/429)", + "color": "#58A2AC", + "clade": "21C", + "lineages": [ + "B.1.427/429" + ], + "who": "Epsilon", + "otherNames": [] + }, + { + "id": "21D (Eta, B.1.525)", + "color": "#64AC99", + "clade": "21D", + "lineages": [ + "B.1.525" + ], + "who": "Eta", + "otherNames": [] + }, + { + "id": "21E (Theta, P.3)", + "color": "#71B486", + "clade": "21E", + "lineages": [ + "P.3" + ], + "who": "Theta", + "otherNames": [] + }, + { + "id": "21F (Iota, B.1.526)", + "color": "#80B973", + "clade": "21F", + "lineages": [ + "B.1.526" + ], + "who": "Iota", + "otherNames": [] + }, + { + "id": "21G (Lambda, C.37)", + "color": "#91BC64", + "clade": "21G", + "lineages": [ + "C.37" + ], + "who": "Lambda", + "otherNames": [] + }, + { + "id": "21H (Mu, B.1.621)", + "color": "#A3BE57", + "clade": "21H", + "lineages": [ + "B.1.621" + ], + "who": "Mu", + "otherNames": [] + }, + { + "id": "21I (Delta)", + "color": "#416DCE", + "clade": "21I", + "lineages": [], + "who": "Delta", + "otherNames": [] + }, + { + "id": "21J (Delta)", + "color": "#4683C8", + "clade": "21J", + "lineages": [], + "who": "Delta", + "otherNames": [] + }, + { + "id": "21K (Omicron, BA.1)", + "color": "#B5BD4C", + "clade": "21K", + 
"lineages": [ + "BA.1" + ], + "who": "Omicron", + "otherNames": [] + }, + { + "id": "21L (Omicron, ~BA.2)", + "color": "#C5B945", + "clade": "21L", + "lineages": [ + "~BA.2" + ], + "who": "Omicron", + "otherNames": [] + }, + { + "id": "21M (Omicron, B.1.1.529)", + "color": "#D3B23F", + "clade": "21M", + "lineages": [ + "B.1.1.529" + ], + "who": "Omicron", + "otherNames": [] + }, + { + "id": "22A (Omicron, BA.4)", + "color": "#DEA63B", + "clade": "22A", + "lineages": [ + "BA.4" + ], + "who": "Omicron", + "otherNames": [] + }, + { + "id": "22B (Omicron, BA.5)", + "color": "#E59638", + "clade": "22B", + "lineages": [ + "BA.5" + ], + "who": "Omicron", + "otherNames": [] + }, + { + "id": "22C (Omicron, BA.2.12.1)", + "color": "#E67F33", + "clade": "22C", + "lineages": [ + "BA.2.12.1" + ], + "who": "Omicron", + "otherNames": [] + }, + { + "id": "22D (Omicron, BA.2.75)", + "color": "#E4642E", + "clade": "22D", + "lineages": [ + "BA.2.75" + ], + "who": "Omicron", + "otherNames": [] + }, + { + "id": "22E (Omicron, BQ.1)", + "color": "#DF4628", + "clade": "22E", + "lineages": [ + "BQ.1" + ], + "who": "Omicron", + "otherNames": [] + }, + { + "id": "22F (Omicron, XBB)", + "color": "#DB2823", + "clade": "22F", + "lineages": [ + "XBB" + ], + "who": "Omicron", + "otherNames": [] + } + ], + "edges": [ + { + "id": "0", + "source": "19A (B)", + "target": "20A (B.1)" + }, + { + "id": "1", + "source": "20A (B.1)", + "target": "20B (B.1.1)" + }, + { + "id": "2", + "source": "20B (B.1.1)", + "target": "21M (Omicron, B.1.1.529)" + }, + { + "id": "3", + "source": "21M (Omicron, B.1.1.529)", + "target": "21L (Omicron, ~BA.2)" + }, + { + "id": "4", + "source": "21L (Omicron, ~BA.2)", + "target": "22A (Omicron, BA.4)" + }, + { + "id": "5", + "source": "21L (Omicron, ~BA.2)", + "target": "22B (Omicron, BA.5)" + }, + { + "id": "6", + "source": "22B (Omicron, BA.5)", + "target": "22E (Omicron, BQ.1)" + }, + { + "id": "7", + "source": "21L (Omicron, ~BA.2)", + "target": "22C (Omicron, 
BA.2.12.1)" + }, + { + "id": "8", + "source": "21L (Omicron, ~BA.2)", + "target": "22D (Omicron, BA.2.75)" + }, + { + "id": "9", + "source": "21L (Omicron, ~BA.2)", + "target": "22F (Omicron, XBB)" + }, + { + "id": "10", + "source": "21M (Omicron, B.1.1.529)", + "target": "21K (Omicron, BA.1)" + }, + { + "id": "11", + "source": "20B (B.1.1)", + "target": "21E (Theta, P.3)" + }, + { + "id": "12", + "source": "20B (B.1.1)", + "target": "20J (Gamma, V3, P.1)" + }, + { + "id": "13", + "source": "20B (B.1.1)", + "target": "20I (Alpha, V1, B.1.1.7)" + }, + { + "id": "14", + "source": "20B (B.1.1)", + "target": "20F (D.2)" + }, + { + "id": "15", + "source": "20B (B.1.1)", + "target": "20D (B.1.1.1)" + }, + { + "id": "16", + "source": "20D (B.1.1.1)", + "target": "21G (Lambda, C.37)" + }, + { + "id": "17", + "source": "20A (B.1)", + "target": "21A (Delta, B.1.617.2)" + }, + { + "id": "18", + "source": "21A (Delta, B.1.617.2)", + "target": "21I (Delta)" + }, + { + "id": "19", + "source": "21A (Delta, B.1.617.2)", + "target": "21J (Delta)" + }, + { + "id": "20", + "source": "20A (B.1)", + "target": "20C" + }, + { + "id": "21", + "source": "20C", + "target": "21F (Iota, B.1.526)" + }, + { + "id": "22", + "source": "20C", + "target": "21C (Epsilon, B.1.427/429)" + }, + { + "id": "23", + "source": "20C", + "target": "20H (Beta, V2, B.1.351)" + }, + { + "id": "24", + "source": "20C", + "target": "20G (B.1.2)" + }, + { + "id": "25", + "source": "20A (B.1)", + "target": "21H (Mu, B.1.621)" + }, + { + "id": "26", + "source": "20A (B.1)", + "target": "21D (Eta, B.1.525)" + }, + { + "id": "27", + "source": "20A (B.1)", + "target": "21B (Kappa, B.1.617.1)" + }, + { + "id": "28", + "source": "20A (B.1)", + "target": "20E (EU1, B.1.177)" + }, + { + "id": "29", + "source": "19A (B)", + "target": "19B (A)" + } + ] +} diff --git a/tools/flatten.ts b/tools/flatten.ts new file mode 100644 index 0000000..72e1ba5 --- /dev/null +++ b/tools/flatten.ts @@ -0,0 +1,164 @@ +import { omit, sortBy } 
from 'lodash' +import fs from 'fs-extra' + +main().catch(console.error) + +export interface OldNode { + color: string + name: string + children?: OldNode[] +} + +export interface GraphNode extends Omit { + id: string + clade: string + lineages?: string[] + who?: string + version?: string + otherNames?: string[] +} + +export interface GraphEdge { + id: string + source: string + target: string +} + +export async function main() { + const cladesJson = await fs.readJSON('src/clades.json') + + let nodes: GraphNode[] = [] + let edges: GraphEdge[] = [] + flattenCladeTree(cladesJson, nodes, edges) + nodes = sortBy(nodes, (node) => node.clade) + + verifyGraph(nodes, edges) + + console.log(require('util').inspect({ nodes, edges }, { colors: true, depth: null, maxArrayLength: null })) + + fs.writeJson('src/clades2.json', { nodes, edges }, { spaces: 2 }) +} + +// Convert tree node hierarhy into flat lists of nodes and edges +function flattenCladeTree(node: OldNode, nodes: GraphNode[], edges: GraphEdge[]) { + const { clade, lineages, who, version, otherNames } = splitName(node.name) + + const id = node.name + + nodes.push({ + id, + color: node.color, + clade, + lineages, + who, + version, + otherNames, + }) + + const children = node.children ?? [] + + children.forEach((child) => { + edges.push({ id: edges.length.toString(), source: node.name, target: child.name }) + flattenCladeTree(child, nodes, edges) + }) +} + +function splitName(name: string) { + // Extract clade (outside parentheses) and details string (the thing in parenteses) + const matches = name.match(/^(?.*?)( \((?
[^)]*)\))?$/) + if (!matches) { + throw new Error(`Unable to parse name '${name}'`) + } + + const clade = matches.groups?.clade + if (!clade) { + throw new Error(`Unable to extract clade from name '${name}'`) + } + + const details = matches.groups?.details + if (!details) { + return { clade } + } + + // Decompose details string (the thing in parenteses), + // for example "(B.1)", "(Omicron, B.1.1.529), "(Beta, V2, B.1.351)", "(EU1)"" + const components = details.split(',').map((component) => component.trim()) + if (components.length === 0) { + throw new Error(`Unable to make sense of the name '${name}': details string is present but is empty`) + } + + const who = findExactlyOne(components, isWhoVariant) + const version = findExactlyOne(components, isVersion) + const lineages = components.filter((c) => isPangoLineage(c)) + const otherNames = components.filter((c) => c != who && c != version && !lineages.includes(c)) + return { clade, lineages, who, version, otherNames } + + throw new Error( + `Unable to make sense of the name '${name}': details string does not follow any known format: '${details}'`, + ) +} + +function isPangoLineage(s: string) { + return s.match(/^(\~)?(A|B|C|D|BA|BQ|P|XBB|)(\.\d+?(\/\d+)?)*$/) !== null +} + +function isWhoVariant(s: string) { + return !WHO_EXCEPTIONS.includes(s) && GREEK_LETTERS.includes(s) +} + +function isVersion(s: string) { + return s.match(/^V\d$/) !== null +} + +const GREEK_LETTERS = [ + 'Alpha', + 'Beta', + 'Gamma', + 'Delta', + 'Epsilon', + 'Zeta', + 'Eta', + 'Theta', + 'Iota', + 'Kappa', + 'Lambda', + 'Mu', + 'Nu', + 'Xi', + 'Omicron', + 'Pi', + 'Rho', + 'Sigma', + 'Tau', + 'Upsilon', + 'Phi', + 'Chi', + 'Psi', + 'Omega', +] + +const WHO_EXCEPTIONS = ['EU1'] + +function findExactlyOne(components: string[], predicate: (s: string) => boolean) { + const candidates = components.filter(predicate) + if (candidates.length > 1) { + throw new Error(`Expected to find exactly one element, but found: ${candidates.map((s) => 
`"${s}"`).join(',')}`) + } + return candidates[0] +} + +function verifyGraph(nodes: GraphNode[], edges: GraphEdge[]) { + edges.forEach((edge) => { + if (edge.target == edge.source) { + throw new Error(`Graph is invalid: invalid edge: the target node '${edge.target}' is the same as source node`) + } + + if (!nodes.find((node) => edge.target == node.id)) { + throw new Error(`Graph is invalid: invalid edge: the target node '${edge.target}' does not exist`) + } + + if (!nodes.find((node) => edge.source == node.id)) { + throw new Error(`Graph is invalid: invalid edge: the source node '${edge.source}' does not exist`) + } + }) +} diff --git a/tsconfig.json b/tsconfig.json index 56b6ea1..427f8f2 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,4 +1,12 @@ { + "ts-node": { + "extends": "ts-node/node16/tsconfig.json", + "transpileOnly": true, + "files": true, + "compilerOptions": { + "module": "commonjs" + } + }, "compilerOptions": { "baseUrl": "./", "paths": { diff --git a/yarn.lock b/yarn.lock index 1aa697c..d684461 100644 --- a/yarn.lock +++ b/yarn.lock @@ -649,6 +649,13 @@ resolved "https://registry.yarnpkg.com/@types/file-saver/-/file-saver-2.0.4.tgz#aaf9b96296150d737b2fefa535ced05ed8013d84" integrity sha512-sPZYQEIF/SOnLAvaz9lTuydniP+afBMtElRTdYkeV1QtEgvtJ7qolCPjly6O32QI8CbEmP5O/fztMXEDWfEcrg== +"@types/fs-extra@9.0.13": + version "9.0.13" + resolved "https://registry.yarnpkg.com/@types/fs-extra/-/fs-extra-9.0.13.tgz#7594fbae04fe7f1918ce8b3d213f74ff44ac1f45" + integrity sha512-nEnwB++1u5lVDM2UI4c1+5R+FYaKfaAzS4OococimjVm3nQw3TuzH5UNsocrcTBbhnerblyHj4A49qXbIiZdpA== + dependencies: + "@types/node" "*" + "@types/geojson@*": version "7946.0.8" resolved "https://registry.yarnpkg.com/@types/geojson/-/geojson-7946.0.8.tgz#30744afdb385e2945e22f3b033f897f76b1f12ca" @@ -667,6 +674,11 @@ resolved "https://registry.yarnpkg.com/@types/json5/-/json5-0.0.29.tgz#ee28707ae94e11d2b827bcbe5270bcea7f3e71ee" integrity sha1-7ihweulOEdK4J7y+UnC86n8+ce4= +"@types/lodash@4.14.187": + 
version "4.14.187" + resolved "https://registry.yarnpkg.com/@types/lodash/-/lodash-4.14.187.tgz#122ff0a7192115b4c1a19444ab4482caa77e2c9d" + integrity sha512-MrO/xLXCaUgZy3y96C/iOsaIqZSeupyTImKClHunL5GrmaiII2VwvWmLBu2hwa0Kp0sV19CsyjtrTc/Fx8rg/A== + "@types/node@*": version "17.0.2" resolved "https://registry.yarnpkg.com/@types/node/-/node-17.0.2.tgz#a4c07d47ff737e8ee7e586fe636ff0e1ddff070a" @@ -2128,6 +2140,15 @@ foreach@^2.0.5: resolved "https://registry.yarnpkg.com/foreach/-/foreach-2.0.5.tgz#0bee005018aeb260d0a3af3ae658dd0136ec1b99" integrity sha1-C+4AUBiusmDQo6865ljdATbsG5k= +fs-extra@10.1.0: + version "10.1.0" + resolved "https://registry.yarnpkg.com/fs-extra/-/fs-extra-10.1.0.tgz#02873cfbc4084dde127eaa5f9905eef2325d1abf" + integrity sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ== + dependencies: + graceful-fs "^4.2.0" + jsonfile "^6.0.1" + universalify "^2.0.0" + fs.realpath@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f" @@ -2244,6 +2265,11 @@ graceful-fs@^4.1.2: resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.8.tgz#e412b8d33f5e006593cbd3cee6df9f2cebbe802a" integrity sha512-qkIilPUYcNhJpd33n0GBXTB1MMPp14TxEsEs0pTrsSVucApsYzW5V+Q8Qxhik6KU3evy+qkAAowTByymK0avdg== +graceful-fs@^4.1.6, graceful-fs@^4.2.0: + version "4.2.10" + resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.10.tgz#147d3a006da4ca3ce14728c7aefc287c367d7a6c" + integrity sha512-9ByhssR2fPVsNZj478qUUbKfmL0+t5BDVyjShtyZZLiK7ZDAArFFfopyOTj0M05wE2tJPisA4iTnnXl2YoPvOA== + has-bigints@^1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/has-bigints/-/has-bigints-1.0.1.tgz#64fe6acb020673e3b78db035a5af69aa9d07b113" @@ -2594,6 +2620,15 @@ json5@^1.0.1: dependencies: minimist "^1.2.0" +jsonfile@^6.0.1: + version "6.1.0" + resolved 
"https://registry.yarnpkg.com/jsonfile/-/jsonfile-6.1.0.tgz#bc55b2634793c679ec6403094eb13698a6ec0aae" + integrity sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ== + dependencies: + universalify "^2.0.0" + optionalDependencies: + graceful-fs "^4.1.6" + "jsx-ast-utils@^2.4.1 || ^3.0.0", jsx-ast-utils@^3.2.1: version "3.2.1" resolved "https://registry.yarnpkg.com/jsx-ast-utils/-/jsx-ast-utils-3.2.1.tgz#720b97bfe7d901b927d87c3773637ae8ea48781b" @@ -2656,7 +2691,7 @@ lodash.sortby@^4.7.0: resolved "https://registry.yarnpkg.com/lodash.sortby/-/lodash.sortby-4.7.0.tgz#edd14c824e2cc9c1e0b0a1b42bb5210516a42438" integrity sha1-7dFMgk4sycHgsKG0K7UhBRakJDg= -lodash@^4.17.11, lodash@^4.17.21: +lodash@4.17.21, lodash@^4.17.11, lodash@^4.17.21: version "4.17.21" resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c" integrity sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg== @@ -3752,6 +3787,11 @@ unbox-primitive@^1.0.1: has-symbols "^1.0.2" which-boxed-primitive "^1.0.2" +universalify@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/universalify/-/universalify-2.0.0.tgz#75a4984efedc4b08975c5aeb73f530d02df25717" + integrity sha512-hAZsKq7Yy11Zu1DE0OzWjw7nnLZmJZYTDZZyEFHZdUhV8FkH5MCfoU1XMaxXovpyW5nq5scPqq0ZDP9Zyl04oQ== + unpipe@1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/unpipe/-/unpipe-1.0.0.tgz#b2bf4ee8514aae6165b4817829d21b2ef49904ec"