Migrating usgs-nggdpp/mdeditor-keywords repository

adiwg · Jan 22, 2025 · 2d62f4f · 2d62f4f
1 parent c489f1a
commit 2d62f4f
Show file tree

Hide file tree

Showing 55 changed files with 7,171 additions and 2,476 deletions.
diff --git a/.babelrc b/.babelrc
@@ -0,0 +1,4 @@
+{
+  "presets": ["@babel/preset-env"],
+  "plugins": ["@babel/plugin-syntax-import-attributes"]
+}
diff --git a/.eslintrc b/.eslintrc
@@ -0,0 +1,18 @@
+{
+  "env": {
+    "node": true,
+    "es2021": true
+  },
+  "extends": [
+    "eslint:recommended",
+    "plugin:import/recommended",
+    "plugin:prettier/recommended"
+  ],
+  "plugins": ["prettier"],
+  "parserOptions": {
+    "ecmaVersion": 2021
+  },
+  "rules": {
+    "prettier/prettier": ["error"]
+  }
+}
diff --git a/.github/workflows/test-resources.yml b/.github/workflows/test-resources.yml
@@ -0,0 +1,21 @@
+name: test-resources
+run-name: Testing keywords and thesaurus configuration files
+on:
+  push:
+jobs:
+  test-resources:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v3
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v3
+        with:
+          node-version: '20'
+
+      - name: Install dependencies
+        run: yarn install
+
+      - name: Run tests
+        run: yarn test
diff --git a/.gitignore b/.gitignore
@@ -62,3 +62,6 @@ typings/
 #generated files
 /lib/js
 /dist
+
+# gnis data
+harvesters/gnis/data/
diff --git a/.prettierrc b/.prettierrc
@@ -0,0 +1,10 @@
+{
+  "printWidth": 80,
+  "tabWidth": 2,
+  "useTabs": false,
+  "semi": true,
+  "singleQuote": true,
+  "trailingComma": "none",
+  "bracketSpacing": true,
+  "arrowParens": "avoid"
+}
diff --git a/eslint.config.js b/eslint.config.js
@@ -0,0 +1,40 @@
+import pluginJs from '@eslint/js';
+import importPlugin from 'eslint-plugin-import';
+import jsonPlugin from 'eslint-plugin-json';
+import prettierPlugin from 'eslint-plugin-prettier';
+import globals from 'globals';
+
+export default [
+  {
+    languageOptions: {
+      globals: globals.browser,
+      ecmaVersion: 2023,
+      sourceType: 'module',
+      parserOptions: {
+        requireConfigFile: false,
+        babelOptions: {
+          presets: ['@babel/preset-env']
+        }
+      }
+    },
+    plugins: {
+      prettier: prettierPlugin,
+      json: jsonPlugin,
+      import: importPlugin
+    },
+    rules: {
+      'prettier/prettier': ['error']
+    }
+  },
+  pluginJs.configs.recommended,
+  prettierPlugin.configs.recommended,
+  jsonPlugin.configs.recommended,
+  importPlugin.configs.recommended,
+  {
+    settings: {
+      jest: {
+        version: 29
+      }
+    }
+  }
+];
diff --git a/harvesters/README.md b/harvesters/README.md
@@ -0,0 +1,40 @@
+# USGS-NGGDPP mdEditor-keywords Harvesters
+
+## Overview
+
+This directory contains harvesters designed to automate the collection of keyword data from authoritative sources. Each harvester extracts keywords from a specific data source or system and processes them into two types of files:
+
+1. **Keyword files** – Saved in `resources/keywords/`.
+2. **Thesaurus files** – Saved in `resources/thesaurus/`.
+
+These harvesters ensure that keyword data remains current, structured, and formatted for integration into **mdEditor**, improving the consistency and accuracy of metadata.
+
+## Available Harvesters
+
+### GNIS (Geographic Names Information System)
+
+The GNIS Harvester collects geographic names and feature data for the United States and its territories. Harvested data includes names for natural and cultural features, administrative boundaries, and other spatial data.
+
+### NALT (National Agricultural Library Thesaurus)
+
+The NALT Harvester gathers keywords related to agricultural sciences, economics, human nutrition, and more, aiding metadata management for agricultural datasets.
+
+### ScienceBase
+
+The ScienceBase Harvester collects keywords for conservation, collection management, and scientific data cataloging, ensuring consistency in metadata for collections and specimens.
+
+### USGS (United States Geological Survey)
+
+The USGS Harvester retrieves thesauri related to geographic, geologic, and environmental data, standardizing metadata terms for scientific datasets.
+
+## Configuration
+
+Each harvester has specific configuration requirements detailed in its individual README. General prerequisites include:
+
+- **Node.js**: Required to run the harvesters.
+- **API Access**: Ensure access to APIs for the relevant harvesters (e.g., GNIS, ScienceBase).
+- **Output Directories**: Set up `resources/keywords/` and `resources/thesaurus/`.
+
+## Usage
+
+Harvesters are executed from the repository's root directory using the commands specified in their respective README files (for example, `yarn gcmd` runs the GCMD harvester). Refer to the root README for further instructions.
diff --git a/harvesters/gcmd/README.md b/harvesters/gcmd/README.md
@@ -0,0 +1,91 @@
+# GCMD Harvester
+
+## Purpose
+
+The **GCMD Harvester** gathers keywords and thesauri from the Global Change Master Directory (GCMD). These resources provide standardized terminology for Earth science data, climate research, and environmental monitoring. The harvester generates hierarchical JSON files that can be integrated into **mdEditor**, supporting metadata workflows in the **USGS NGGDPP** project.
+
+## Vocabularies Processed
+
+The GCMD Harvester generates thesauri and keyword files for the following categories:
+
+- Chronostratigraphic Units
+- Platforms
+- Disciplines
+- IDN Nodes
+- ISO Topic Categories
+- Horizontal Resolution Ranges
+- Vertical Resolution Ranges
+- Temporal Resolution Ranges
+- Instruments
+- Projects
+- Persistent Identifier
+- Private
+- Phone Type
+- Product Flag
+- Dataset Progress
+- Dataset Language
+- Metadata Association Type
+- Organization Personnel Role
+- Personnel Role
+- Organization Type
+- Duration Unit
+- Platform Type
+- Collection Data Type
+- Coordinate System
+- Granule Spatial Representation
+- Product Level Id
+- Spatial Coverage Type
+- Multimedia Format
+- Metadata Language
+- Contact Type
+- Mime Type
+- Distribution Size Unit
+- Related URL Content Types
+- Data Format
+- Measurement Name
+- Projection Name
+- Projection Authority
+- Chained Operations
+- Operations
+- Projection Datum Names
+- Science Keywords
+
+## Input and Output
+
+### Input File
+
+- **Path:** `harvesters/gcmd/vocabularies.json`
+- **Description:** JSON file containing a list of GCMD vocabulary IDs and names to process.
+
+### Output Directory
+
+- **Path:** `resources/`
+- **Description:** Directory where the generated JSON files are saved.
+  - `resources/keywords/`: Contains JSON files for each keyword hierarchy.
+  - `resources/thesaurus/`: Contains thesaurus configuration JSON files.
+
+## How It Works
+
+1. **Load Vocabulary Data**:
+   - Reads vocabulary data from `harvesters/gcmd/vocabularies.json`.
+2. **Fetch Metadata**:
+   - Retrieves concept and keyword data for each vocabulary from the GCMD Keyword Management Service (KMS) API at `https://gcmd.earthdata.nasa.gov/kms/`.
+3. **Build Hierarchies**:
+   - Constructs hierarchical keyword structures based on broader and narrower relationships.
+4. **Generate JSON Files**:
+   - Writes one `gcmd-<vocabulary-id>.json` file per vocabulary to each of `resources/keywords/` and `resources/thesaurus/`.
+
+## Running the GCMD Harvester
+
+To run the harvester, execute the following command from the root directory:
+
+`yarn gcmd`
+
+**Important Notes:**
+
+- Ensure all dependencies are installed by running `yarn install` before execution.
+- The command must be run from the repository’s root directory.
+
+## Validation and Testing
+
+The GCMD harvester generates files that are validated against schemas. Refer to the root README for detailed instructions on validation.
diff --git a/harvesters/gcmd/gcmd.js b/harvesters/gcmd/gcmd.js
@@ -0,0 +1,137 @@
+import axios from 'axios';
+import { sleep, writeToLocalFile } from '../utils';
+import dayjs from 'dayjs';
+
+import vocabularies from './vocabularies.json';
+
+// Root of the GCMD KMS API; endpoint paths are appended to it.
+const baseUrl = 'https://gcmd.earthdata.nasa.gov/kms/';
+// Output locations for the generated thesaurus configs and keyword trees.
+// NOTE(review): these are relative paths — presumably resolved against the
+// working directory by writeToLocalFile; confirm.
+const thesaurusPath = 'resources/thesaurus/';
+const keywordsPath = 'resources/keywords/';
+
+async function fetchConceptById(id) {
+  try {
+    const response = await axios.get(`${baseUrl}concept/${id}?format=json`);
+    return response.data;
+  } catch (error) {
+    console.error('Error fetching concept data', id, error);
+    return null;
+  }
+}
+
+async function fetchKeywordById(id) {
+  try {
+    const response = await axios.get(`${baseUrl}keyword/${id}?format=json`);
+    return response.data;
+  } catch (error) {
+    console.error('Error fetching keyword data', id, error);
+    return null;
+  }
+}
+
+const loadMetadata = async vocabulary => {
+  await sleep(50);
+  const { id } = vocabulary;
+  const conceptData = await fetchConceptById(id);
+  const keywordData = await fetchKeywordById(id);
+  return { ...vocabulary, conceptData, keywordData };
+};
+
+function generateNode(metadata) {
+  return {
+    uuid: metadata.conceptData.uuid,
+    label: metadata.conceptData.prefLabel,
+    parentId: metadata?.conceptData?.broader[0]?.uuid || null,
+    definition:
+      metadata?.conceptData?.definitions[0]?.text ||
+      metadata?.keywordData?.definition ||
+      '',
+    children: []
+  };
+}
+
+async function buildChildren(metadata) {
+  if (metadata.conceptData.isLeaf) {
+    return [];
+  }
+  const narrowerList = metadata.conceptData.narrower;
+  const children = [];
+  for (let i = 0; i < narrowerList.length; i++) {
+    const narrower = narrowerList[i];
+    const narrowerMetadata = await loadMetadata({
+      id: narrower.uuid,
+      isLeaf: narrower.isLeaf,
+      name: narrower.prefLabel
+    });
+    const narrowerChildren = await buildChildren(narrowerMetadata);
+    const node = generateNode(narrowerMetadata);
+    node.children = narrowerChildren;
+    children.push(node);
+  }
+  return children;
+}
+
+const buildKeywordTree = async metadata => {
+  const node = generateNode(metadata);
+  // if node.parentId is null remove the key from the object
+  if (node.parentId === null) {
+    delete node.parentId;
+  }
+  node.children = await buildChildren(metadata);
+  return [node];
+};
+
+function buildConfig(metadata) {
+  return {
+    citation: {
+      date: [
+        {
+          date: dayjs(metadata.conceptData.schemeVersion).format(
+            'YYYY-MM-DDTHH:mm:ss'
+          ),
+          dateType: 'revision'
+        }
+      ],
+      description: metadata.description || 'No description available.',
+      title: `Global Change Master Directory (GCMD) ${metadata.conceptData.prefLabel}`,
+      edition: `Version ${metadata.conceptData.keywordVersion}`,
+      onlineResource: [
+        {
+          uri: `${baseUrl}concept/${metadata.conceptData.uuid}?format=json`
+        }
+      ],
+      identifier: [
+        {
+          identifier: metadata.conceptData.uuid
+        }
+      ]
+    },
+    label: metadata.conceptData.prefLabel,
+    keywordsUrl: `https://cdn.jsdelivr.net/gh/USGS-NGGDPP/mdEditor-keywords@main/resources/keywords/gcmd-${metadata.id}.json`
+  };
+}
+
+async function generateKeywords(vocabulary) {
+  const metadata = await loadMetadata(vocabulary);
+  const keywordsJson = await buildKeywordTree(metadata);
+  return keywordsJson;
+}
+
+async function generateThesaurusConfig(vocabulary) {
+  const metadata = await loadMetadata(vocabulary);
+  const config = buildConfig(metadata);
+  return config;
+}
+
+export default async function main() {
+  for (let i = 0; i < vocabularies.length; i++) {
+    const vocabulary = vocabularies[i];
+    console.log('processing vocabulary', vocabulary.id);
+    const thesaurusConfig = await generateThesaurusConfig(vocabulary);
+    const keywords = await generateKeywords(vocabulary);
+    writeToLocalFile(
+      thesaurusConfig,
+      `${thesaurusPath}gcmd-${vocabulary.id}.json`
+    );
+    writeToLocalFile(keywords, `${keywordsPath}gcmd-${vocabulary.id}.json`);
+  }
+}
diff --git a/harvesters/gcmd/index.js b/harvesters/gcmd/index.js
@@ -0,0 +1,7 @@
+import main from './gcmd';
+
+async function run() {
+  await main();
+}
+
+export default { run };