Commit
Merge pull request #74 from MathewBiddle/mbon_stats
starting mbon_stats
laurabrenskelle authored May 23, 2024
2 parents b131d12 + 94024f3 commit fb0aa66
Showing 4 changed files with 76 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
uses: mamba-org/setup-micromamba@v1
with:
init-shell: bash
- environment-file: conda-lock.yml
+ environment-file: environment.yml
environment-name: TEST

- name: Tests
1 change: 1 addition & 0 deletions environment.yml
@@ -16,5 +16,6 @@ dependencies:
- pdfminer.six
- pyarrow
- pytest
- pyobis
- requests
- suds
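
The pyobis dependency added here is what drives the OBIS dataset lookup introduced in ioos_metrics.py below. A minimal sanity-check sketch, mirroring the call used there (the institution id 23070 comes from the new function; printing the raw results is just for illustration):

import pandas as pd
import pyobis

# Search OBIS for datasets affiliated with institution 23070 (MBON), as mbon_stats() does below.
query = pyobis.dataset.search(instituteid=23070)
df = pd.DataFrame(query.execute())

# Each entry in the "results" column describes one OBIS dataset.
print(df["results"].head())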
53 changes: 53 additions & 0 deletions ioos_metrics/ioos_metrics.py
@@ -554,6 +554,59 @@ def hf_radar_installations():
    # This is a hardcoded number at the moment!
    return 165

@functools.lru_cache(maxsize=128)
def mbon_stats():
    """
    Collect download and citation statistics about MBON affiliated datasets shared with the Ocean Biodiversity
    Information System (OBIS) and the Global Biodiversity Information Facility (GBIF).
    Returns a dataframe with one row per paper citing a dataset.
    """
    import urllib.parse

    import pyobis

    # collect dataset information from OBIS
    institution_id = 23070
    query = pyobis.dataset.search(instituteid=institution_id)
    df = pd.DataFrame(query.execute())
    df_obis = pd.DataFrame.from_records(df["results"])
    df_obis.columns = [f'obis_{col}' for col in df_obis.columns]

    # iterate through each OBIS dataset to gather its uuid from GBIF
    # and build a mapping table between the two systems
    df_mapping = pd.DataFrame()
    base_url = 'https://api.gbif.org'
    for title in df_obis['obis_title']:
        query = f'{base_url}/v1/dataset/search?q={urllib.parse.quote(title)}'
        df = pd.read_json(query, orient='index').T

        # build a DataFrame with the info we need more accessible
        df_mapping = pd.concat([df_mapping, pd.DataFrame({
            'gbif_uuid': [df['results'].values[0][0]['key']],
            'title': [df['results'].values[0][0]['title']],
            'obis_id': [df_obis.loc[df_obis['obis_title'] == title, 'obis_id'].to_string(index=False)],
            'doi': [df['results'].values[0][0]['doi']],
        })], ignore_index=True)

    # collect literature cited information from GBIF for each dataset
    df_gbif = pd.DataFrame()
    for key in df_mapping['gbif_uuid']:
        url = f'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={key}'
        df2 = pd.read_csv(url)
        df2.columns = ['literature_' + str(col) for col in df2.columns]
        df2['gbif_uuid'] = key

        df_gbif = pd.concat([df2, df_gbif], ignore_index=True)

    # merge the OBIS and GBIF data frames together
    df_obis = df_obis.merge(df_mapping, on='obis_id')

    df_out = df_gbif.merge(df_obis, on='gbif_uuid')

    return df_out




def update_metrics(*, debug=False):
"""Load previous metrics and update the spreadsheet."""
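With the new helper in place, the merged OBIS/GBIF table can be queried directly. A minimal usage sketch, assuming the module is importable the same way the test below uses it; the per-dataset citation count is illustrative, not part of this change:

import pandas as pd

from ioos_metrics import ioos_metrics

# One row per paper citing an MBON-affiliated dataset.
df = ioos_metrics.mbon_stats()

# Illustrative summary: number of citing papers per dataset title.
citations_per_dataset = (
    df.groupby("title")["literature_title"]
    .count()
    .sort_values(ascending=False)
)
print(citations_per_dataset.head())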
21 changes: 21 additions & 0 deletions tests/test_metrics.py
@@ -73,3 +73,24 @@ def test_update_metrics():
"""Runs update_metrics in debug to log any possibles issues with the scrapping."""
df = update_metrics(debug=True)
df.to_csv("updated_metrics.csv")

def test_mbon_stats():
    df = ioos_metrics.mbon_stats()
    columns = ['literature_title', 'literature_authors', 'literature_source',
               'literature_discovered', 'literature_published',
               'literature_open_access', 'literature_peer_review',
               'literature_citation_type', 'literature_countries_of_coverage',
               'literature_countries_of_researcher', 'literature_keywords',
               'literature_literature_type', 'literature_websites',
               'literature_identifiers', 'literature_id', 'literature_abstract',
               'literature_topics', 'literature_added', 'literature_gbif_download_key',
               'gbif_uuid', 'obis_id', 'obis_url', 'obis_archive', 'obis_published',
               'obis_created', 'obis_updated', 'obis_core', 'obis_extensions',
               'obis_statistics', 'obis_extent', 'obis_title', 'obis_citation',
               'obis_citation_id', 'obis_abstract', 'obis_intellectualrights',
               'obis_feed', 'obis_institutes', 'obis_contacts', 'obis_nodes',
               'obis_keywords', 'obis_downloads', 'obis_records', 'title', 'doi']

    assert isinstance(df, pd.DataFrame)
    assert all([col in df.columns for col in columns])
    assert not df.empty
