diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 02b7242..513a73c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: uses: mamba-org/setup-micromamba@v1 with: init-shell: bash - environment-file: conda-lock.yml + environment-file: environment.yml environment-name: TEST - name: Tests diff --git a/environment.yml b/environment.yml index 3ef0816..2ccd39b 100644 --- a/environment.yml +++ b/environment.yml @@ -16,5 +16,6 @@ dependencies: - pdfminer.six - pyarrow - pytest + - pyobis - requests - suds diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index f6ca4f2..9464f34 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -554,6 +554,59 @@ def hf_radar_installations(): # This is a hardcoded number at the moment! return 165 +@functools.lru_cache(maxsize=128) +def mbon_stats(): + """ + This function collects download statistics about MBON-affiliated datasets shared with the Ocean Biodiversity + Information System (OBIS) and the Global Biodiversity Information Facility (GBIF). The function returns a + dataframe with rows corresponding to each paper citing a dataset.
+ """ + import pyobis + import urllib.parse + + # collect dataset information from OBIS + institution_id = 23070 + query = pyobis.dataset.search(instituteid=institution_id) + df = pd.DataFrame(query.execute()) + df_obis = pd.DataFrame.from_records(df["results"]) + df_obis.columns = [f'obis_{col}' for col in df_obis.columns] + + df_mapping = pd.DataFrame() + base_url = 'https://api.gbif.org' + # iterate through each OBIS dataset to gather uuid from GBIF + # create a mapping table + for title in df_obis['obis_title']: + string = title + query = f'{base_url}/v1/dataset/search?q={urllib.parse.quote(string)}' + df = pd.read_json(query, orient='index').T + + # build a DataFrame with the info we need in a more accessible form + df_mapping = pd.concat([df_mapping, pd.DataFrame({ + 'gbif_uuid': df['results'].values[0][0]['key'], + 'title': [df['results'].values[0][0]['title']], + 'obis_id': [df_obis.loc[df_obis['obis_title']==title,'obis_id'].to_string(index=False)], + 'doi': [df['results'].values[0][0]['doi']] + })], ignore_index=True) + + df_gbif = pd.DataFrame() + for key in df_mapping['gbif_uuid']: + + url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key) + df2 = pd.read_csv(url) # collect literature cited information + df2.columns = ['literature_' + str(col) for col in df2.columns] + df2['gbif_uuid'] = key + + df_gbif = pd.concat([df2,df_gbif], ignore_index=True) + + # merge the OBIS and GBIF data frames together + df_obis = df_obis.merge(df_mapping, on='obis_id') + + df_out = df_gbif.merge(df_obis, on='gbif_uuid') + + return df_out + + + def update_metrics(*, debug=False): """Load previous metrics and update the spreadsheet.""" diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 00b8cb2..8d373e7 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -73,3 +73,24 @@ def test_update_metrics(): """Runs update_metrics in debug to log any possibles issues with the scrapping.""" df = update_metrics(debug=True)
df.to_csv("updated_metrics.csv") + +def test_mbon_stats(): + df = ioos_metrics.mbon_stats() + columns = ['literature_title', 'literature_authors', 'literature_source', + 'literature_discovered', 'literature_published', + 'literature_open_access', 'literature_peer_review', + 'literature_citation_type', 'literature_countries_of_coverage', + 'literature_countries_of_researcher', 'literature_keywords', + 'literature_literature_type', 'literature_websites', + 'literature_identifiers', 'literature_id', 'literature_abstract', + 'literature_topics', 'literature_added', 'literature_gbif_download_key', + 'gbif_uuid', 'obis_id', 'obis_url', 'obis_archive', 'obis_published', + 'obis_created', 'obis_updated', 'obis_core', 'obis_extensions', + 'obis_statistics', 'obis_extent', 'obis_title', 'obis_citation', + 'obis_citation_id', 'obis_abstract', 'obis_intellectualrights', + 'obis_feed', 'obis_institutes', 'obis_contacts', 'obis_nodes', + 'obis_keywords', 'obis_downloads', 'obis_records', 'title', 'doi'] + + assert isinstance(df, pd.DataFrame) + assert all([col in df.columns for col in columns]) + assert not df.empty \ No newline at end of file