From 17525f644133c8dd9b0cb6e99c44e6a5cc1901f2 Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Tue, 21 May 2024 15:16:06 -0400 Subject: [PATCH 01/11] starting mbon_stats --- environment.yml | 1 + ioos_metrics/ioos_metrics.py | 62 ++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/environment.yml b/environment.yml index 3ef0816..2ccd39b 100644 --- a/environment.yml +++ b/environment.yml @@ -16,5 +16,6 @@ dependencies: - pdfminer.six - pyarrow - pytest + - pyobis - requests - suds diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index f6ca4f2..dd2b133 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -554,6 +554,68 @@ def hf_radar_installations(): # This is a hardcoded number at the moment! return 165 +def mbon_stats(): + + import pyobis + institution_id = 23070 + + query = pyobis.dataset.search(instituteid=institution_id) + + df = pd.DataFrame(query.execute()) + + df_meta = pd.DataFrame.from_records(df["results"]) + + import numpy as np + + df_downloads = pd.DataFrame.from_records(df_meta['downloads']) + + # df_downloads.rename(columns={'index':'year'}, inplace=True) + + df_downloads.fillna(value=np.nan) + + import urllib.parse + + df_gbif = pd.DataFrame() + + for title in df_meta['title']: + string = title + query = '{}/v1/dataset/search?q={}'.format(base_url, urllib.parse.quote(string)) + df = pd.read_json(query, orient='index').T + + key = df['results'].values[0][0]['key'] + + # build a DataFrame with the info we need more accessible + df_gbif = pd.concat([df_gbif, pd.DataFrame({ + 'key': df['results'].values[0][0]['key'], + 'title': [df['results'].values[0][0]['title']], + 'doi': [df['results'].values[0][0]['doi']] + })], ignore_index=True) + + topics = [] + + for key in df_gbif['key']: + url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key) + print(url) + df2 = pd.read_csv(url) # summary of citations + df_gbif.loc[df_gbif['key'] == key, 'number_of_citations'] = df2.shape[0] + + # df_gbif.loc[df_gbif['key']==key,'topics'] = df_gbif.loc[df_gbif['key']==key,'topics'].astype('O') + # df_gbif.loc[df_gbif['key']==key,'topics'] = df2['topics'].to_list() + + topics.append(df2['topics'].tolist()) + + flat_list = [ + x + for xs in topics + for x in xs + ] + + unique_topics = sorted(set(flat_list)) + + return df_gbif + + + def update_metrics(*, debug=False): """Load previous metrics and update the spreadsheet.""" From d305554cc1ac77b494dd5e0def00dfa71b2ef202 Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Tue, 21 May 2024 15:24:19 -0400 Subject: [PATCH 02/11] adding missing url --- ioos_metrics/ioos_metrics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index dd2b133..fdecadd 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -577,6 +577,8 @@ def mbon_stats(): df_gbif = pd.DataFrame() + base_url = 'https://api.gbif.org' + for title in df_meta['title']: string = title query = '{}/v1/dataset/search?q={}'.format(base_url, urllib.parse.quote(string)) From 46c3050a242a1d82cb2015e0485ab0a65a8dee0e Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Wed, 22 May 2024 11:12:28 -0400 Subject: [PATCH 03/11] saving data to one big dictionary --- ioos_metrics/ioos_metrics.py | 49 ++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index fdecadd..5c4ff82 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -555,25 +555,27 @@ def hf_radar_installations(): return 165 def mbon_stats(): - + """ + This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity Information + System (OBIS) and the Global Biodiversity Information Framework (GBIF). + :return: + """ import pyobis - institution_id = 23070 + import numpy as np + import urllib.parse + institution_id = 23070 query = pyobis.dataset.search(instituteid=institution_id) - df = pd.DataFrame(query.execute()) df_meta = pd.DataFrame.from_records(df["results"]) - import numpy as np - - df_downloads = pd.DataFrame.from_records(df_meta['downloads']) + df_meta.rename(columns={'id':'obis_uuid'},inplace=True) + #df_downloads = pd.DataFrame.from_records(df_meta['downloads']) # df_downloads.rename(columns={'index':'year'}, inplace=True) - df_downloads.fillna(value=np.nan) - - import urllib.parse + #df_downloads.fillna(value=np.nan) df_gbif = pd.DataFrame() @@ -584,7 +586,7 @@ def mbon_stats(): query = '{}/v1/dataset/search?q={}'.format(base_url, urllib.parse.quote(string)) df = pd.read_json(query, orient='index').T - key = df['results'].values[0][0]['key'] + gbif_key = df['results'].values[0][0]['key'] # build a DataFrame with the info we need more accessible df_gbif = pd.concat([df_gbif, pd.DataFrame({ @@ -595,26 +597,29 @@ def mbon_stats(): topics = [] + dict_out = {} + + for i in df_gbif['key'].tolist(): + dict_out[i] = {} + for key in df_gbif['key']: url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key) print(url) - df2 = pd.read_csv(url) # summary of citations - df_gbif.loc[df_gbif['key'] == key, 'number_of_citations'] = df2.shape[0] - # df_gbif.loc[df_gbif['key']==key,'topics'] = df_gbif.loc[df_gbif['key']==key,'topics'].astype('O') - # df_gbif.loc[df_gbif['key']==key,'topics'] = df2['topics'].to_list() + df2 = pd.read_csv(url) # count number of citations + df2['number_of_citations'] = df2.shape[0] - topics.append(df2['topics'].tolist()) + dict_out[key]['liturature'] = df2 + dict_out[key]['number_of_citations'] = df2.shape[0] + dict_out[key]['title'] = df_gbif.loc[df_gbif['key'] == key, 'title'].to_string() + dict_out[key]['doi'] = df_gbif.loc[df_gbif['key'] == key, 'doi'].to_string() - flat_list = [ - x - for xs in topics - for x in xs - ] + df_gbif.loc[df_gbif['key'] == key, 'number_of_citations'] = df2.shape[0] - unique_topics = sorted(set(flat_list)) + # df_gbif.loc[df_gbif['key']==key,'topics'] = df_gbif.loc[df_gbif['key']==key,'topics'].astype('O') + # df_gbif.loc[df_gbif['key']==key,'topics'] = df2['topics'].to_list() - return df_gbif + return df_gbif, df2, dict_out From de83ed0474dfcc41b80c0a79ab3bc00b2a07b984 Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Wed, 22 May 2024 14:53:52 -0400 Subject: [PATCH 04/11] mbon stats to dataframe --- ioos_metrics/ioos_metrics.py | 61 ++++++++++++++---------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index 5c4ff82..7a4c970 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -556,70 +556,57 @@ def hf_radar_installations(): def mbon_stats(): """ - This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity Information - System (OBIS) and the Global Biodiversity Information Framework (GBIF). + This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity + Information System (OBIS) and the Global Biodiversity Information Framework (GBIF). The function returns a + dataframe with rows corresponding to each dataset. :return: """ import pyobis import numpy as np import urllib.parse + # collect dataset information from OBIS institution_id = 23070 query = pyobis.dataset.search(instituteid=institution_id) df = pd.DataFrame(query.execute()) + df_obis = pd.DataFrame.from_records(df["results"]) + df_obis.columns = ['obis_' + str(col) for col in df_obis.columns] - df_meta = pd.DataFrame.from_records(df["results"]) - - df_meta.rename(columns={'id':'obis_uuid'},inplace=True) - #df_downloads = pd.DataFrame.from_records(df_meta['downloads']) - - # df_downloads.rename(columns={'index':'year'}, inplace=True) - - #df_downloads.fillna(value=np.nan) - - df_gbif = pd.DataFrame() - + df_mapping = pd.DataFrame() base_url = 'https://api.gbif.org' - - for title in df_meta['title']: + # iterate through each OBIS dataset to gather uuid from GBIF + # create a mapping table + for title in df_obis['obis_title']: string = title query = '{}/v1/dataset/search?q={}'.format(base_url, urllib.parse.quote(string)) df = pd.read_json(query, orient='index').T - gbif_key = df['results'].values[0][0]['key'] - # build a DataFrame with the info we need more accessible - df_gbif = pd.concat([df_gbif, pd.DataFrame({ - 'key': df['results'].values[0][0]['key'], + df_mapping = pd.concat([df_mapping, pd.DataFrame({ + 'gbif_uuid': df['results'].values[0][0]['key'], 'title': [df['results'].values[0][0]['title']], + 'obis_id': [df_obis.loc[df_obis['obis_title']==title,'obis_id'].to_string(index=False)], 'doi': [df['results'].values[0][0]['doi']] })], ignore_index=True) - topics = [] - - dict_out = {} - - for i in df_gbif['key'].tolist(): - dict_out[i] = {} - - for key in df_gbif['key']: + df_gbif = pd.DataFrame() + for key in df_mapping['gbif_uuid']: url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key) print(url) - df2 = pd.read_csv(url) # count number of citations - df2['number_of_citations'] = df2.shape[0] + df2 = pd.read_csv(url) # collect liturature cited information + + df2.columns = ['literature_' + str(col) for col in df2.columns] + df2['gbif_uuid'] = key - dict_out[key]['liturature'] = df2 - dict_out[key]['number_of_citations'] = df2.shape[0] - dict_out[key]['title'] = df_gbif.loc[df_gbif['key'] == key, 'title'].to_string() - dict_out[key]['doi'] = df_gbif.loc[df_gbif['key'] == key, 'doi'].to_string() + df_gbif = pd.concat([df2,df_gbif], ignore_index=True) - df_gbif.loc[df_gbif['key'] == key, 'number_of_citations'] = df2.shape[0] + # merge the OBIS and GBIF data frames together + df_obis = df_obis.merge(df_mapping, on='obis_id') - # df_gbif.loc[df_gbif['key']==key,'topics'] = df_gbif.loc[df_gbif['key']==key,'topics'].astype('O') - # df_gbif.loc[df_gbif['key']==key,'topics'] = df2['topics'].to_list() + df_out = df_gbif.merge(df_obis, on='gbif_uuid') - return df_gbif, df2, dict_out + return df_out From 86e46e2298f66cea3ee2467e6486389cbc6bb0a8 Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Wed, 22 May 2024 15:02:44 -0400 Subject: [PATCH 05/11] removing print statement --- ioos_metrics/ioos_metrics.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index 7a4c970..d02c414 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -562,7 +562,6 @@ def mbon_stats(): :return: """ import pyobis - import numpy as np import urllib.parse # collect dataset information from OBIS @@ -591,11 +590,9 @@ def mbon_stats(): df_gbif = pd.DataFrame() for key in df_mapping['gbif_uuid']: - url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key) - print(url) + url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key) df2 = pd.read_csv(url) # collect liturature cited information - df2.columns = ['literature_' + str(col) for col in df2.columns] df2['gbif_uuid'] = key From 1d78317a38cefc2d0ff88158787ee95e551198db Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Wed, 22 May 2024 15:25:05 -0400 Subject: [PATCH 06/11] Update ioos_metrics/ioos_metrics.py Co-authored-by: Filipe --- ioos_metrics/ioos_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index d02c414..e7ed833 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -559,7 +559,6 @@ def mbon_stats(): This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity Information System (OBIS) and the Global Biodiversity Information Framework (GBIF). The function returns a dataframe with rows corresponding to each dataset. - :return: """ import pyobis import urllib.parse From 8fef5719f4688c5745c81796204f02281e50c1a2 Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Wed, 22 May 2024 15:25:15 -0400 Subject: [PATCH 07/11] Update ioos_metrics/ioos_metrics.py Co-authored-by: Filipe --- ioos_metrics/ioos_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index e7ed833..0c8f0d2 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -576,7 +576,7 @@ def mbon_stats(): # create a mapping table for title in df_obis['obis_title']: string = title - query = '{}/v1/dataset/search?q={}'.format(base_url, urllib.parse.quote(string)) + query = f'{base_url}/v1/dataset/search?q={urllib.parse.quote(string)}' df = pd.read_json(query, orient='index').T # build a DataFrame with the info we need more accessible From c36c9b7c85d3ea384458783c50986fb1476c955b Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Wed, 22 May 2024 15:25:24 -0400 Subject: [PATCH 08/11] Update ioos_metrics/ioos_metrics.py Co-authored-by: Filipe --- ioos_metrics/ioos_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index 0c8f0d2..5973a42 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -568,7 +568,7 @@ def mbon_stats(): query = pyobis.dataset.search(instituteid=institution_id) df = pd.DataFrame(query.execute()) df_obis = pd.DataFrame.from_records(df["results"]) - df_obis.columns = ['obis_' + str(col) for col in df_obis.columns] + df_obis.columns = [f'obis_{col}' for col in df_obis.columns] df_mapping = pd.DataFrame() base_url = 'https://api.gbif.org' From 4745891dca52110e053228e44f0bef901298db4f Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Thu, 23 May 2024 10:09:21 -0400 Subject: [PATCH 09/11] adding test for mbon_stats moving test CI to environment.yml --- .github/workflows/tests.yml | 2 +- tests/test_metrics.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 02b7242..513a73c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: uses: mamba-org/setup-micromamba@v1 with: init-shell: bash - environment-file: conda-lock.yml + environment-file: environment.yml environment-name: TEST - name: Tests diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 00b8cb2..4ae2809 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -73,3 +73,8 @@ def test_update_metrics(): """Runs update_metrics in debug to log any possibles issues with the scrapping.""" df = update_metrics(debug=True) df.to_csv("updated_metrics.csv") + +def test_mbon_stats(): + df = ioos_metrics.mbon_stats() + assert isinstance(df, pd.DataFrame) + assert not df.empty \ No newline at end of file From 207dc9fd47783b68b30e13546a04e2b19463aeea Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Thu, 23 May 2024 10:36:30 -0400 Subject: [PATCH 10/11] testing columns clarifying documentation --- ioos_metrics/ioos_metrics.py | 2 +- tests/test_metrics.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index 5973a42..13d11fc 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -558,7 +558,7 @@ def mbon_stats(): """ This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity Information System (OBIS) and the Global Biodiversity Information Framework (GBIF). The function returns a - dataframe with rows corresponding to each dataset. + dataframe with rows corresponding to each paper citing a dataset. """ import pyobis import urllib.parse diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 4ae2809..8d373e7 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -76,5 +76,21 @@ def test_update_metrics(): def test_mbon_stats(): df = ioos_metrics.mbon_stats() + columns = ['literature_title', 'literature_authors', 'literature_source', + 'literature_discovered', 'literature_published', + 'literature_open_access', 'literature_peer_review', + 'literature_citation_type', 'literature_countries_of_coverage', + 'literature_countries_of_researcher', 'literature_keywords', + 'literature_literature_type', 'literature_websites', + 'literature_identifiers', 'literature_id', 'literature_abstract', + 'literature_topics', 'literature_added', 'literature_gbif_download_key', + 'gbif_uuid', 'obis_id', 'obis_url', 'obis_archive', 'obis_published', + 'obis_created', 'obis_updated', 'obis_core', 'obis_extensions', + 'obis_statistics', 'obis_extent', 'obis_title', 'obis_citation', + 'obis_citation_id', 'obis_abstract', 'obis_intellectualrights', + 'obis_feed', 'obis_institutes', 'obis_contacts', 'obis_nodes', + 'obis_keywords', 'obis_downloads', 'obis_records', 'title', 'doi'] + assert isinstance(df, pd.DataFrame) + assert all([col in df.columns for col in columns]) assert not df.empty \ No newline at end of file From 94024f30f11097fcc8c75896eea5b1dc04e9221c Mon Sep 17 00:00:00 2001 From: Mathew Biddle <8480023+MathewBiddle@users.noreply.github.com> Date: Thu, 23 May 2024 10:37:06 -0400 Subject: [PATCH 11/11] Update ioos_metrics/ioos_metrics.py Co-authored-by: Filipe --- ioos_metrics/ioos_metrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py index 13d11fc..9464f34 100644 --- a/ioos_metrics/ioos_metrics.py +++ b/ioos_metrics/ioos_metrics.py @@ -554,6 +554,7 @@ def hf_radar_installations(): # This is a hardcoded number at the moment! return 165 +@functools.lru_cache(maxsize=128) def mbon_stats(): """ This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity