Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

starting mbon_stats #74

Merged
merged 11 commits into from
May 23, 2024
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
uses: mamba-org/setup-micromamba@v1
with:
init-shell: bash
environment-file: conda-lock.yml
environment-file: environment.yml
environment-name: TEST

- name: Tests
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ dependencies:
- pdfminer.six
- pyarrow
- pytest
- pyobis
- requests
- suds
53 changes: 53 additions & 0 deletions ioos_metrics/ioos_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,59 @@ def hf_radar_installations():
# This is a hardcoded number at the moment!
return 165

@functools.lru_cache(maxsize=128)
def mbon_stats():
    """Collect literature citations for MBON datasets shared with OBIS and GBIF.

    Datasets are looked up in the Ocean Biodiversity Information System (OBIS)
    by institution id. Each one is matched to a Global Biodiversity
    Information Facility (GBIF) dataset by searching GBIF for the OBIS title
    and taking the first hit. The GBIF literature export is then downloaded
    for every matched dataset and joined back to the OBIS metadata.

    Returns
    -------
    pandas.DataFrame
        One row per paper citing a dataset. Columns from the GBIF literature
        export are prefixed with ``literature_`` and OBIS dataset columns with
        ``obis_``; ``gbif_uuid``, ``title`` and ``doi`` come from the matched
        GBIF dataset.

    Notes
    -----
    The function is wrapped in ``functools.lru_cache``, so repeated calls
    return the same DataFrame object; copy it before modifying it in place.
    OBIS datasets for which the GBIF search returns no results are skipped
    with a warning instead of raising ``IndexError``.
    """
    import urllib.parse
    import warnings

    import pyobis

    # collect dataset information from OBIS
    institution_id = 23070
    query = pyobis.dataset.search(instituteid=institution_id)
    df = pd.DataFrame(query.execute())
    df_obis = pd.DataFrame.from_records(df["results"])
    df_obis.columns = [f"obis_{col}" for col in df_obis.columns]

    base_url = "https://api.gbif.org"

    # Build the OBIS -> GBIF mapping table. Rows are gathered in a list and
    # turned into a DataFrame once, rather than concatenating inside the loop.
    # Iterating id and title together keeps the real OBIS id for every row,
    # even when two datasets share a title.
    mapping_columns = ["gbif_uuid", "title", "obis_id", "doi"]
    mapping_records = []
    for obis_id, title in zip(df_obis["obis_id"], df_obis["obis_title"]):
        search_url = f"{base_url}/v1/dataset/search?q={urllib.parse.quote(title)}"
        search = pd.read_json(search_url, orient="index").T
        results = search["results"].values[0]

        if len(results) == 0:
            # Nothing to map this dataset to; report it and move on.
            warnings.warn(
                f"No GBIF dataset found for OBIS title {title!r}; skipping.",
                stacklevel=2,
            )
            continue

        # The first search hit is taken as the matching GBIF dataset.
        best_match = results[0]
        mapping_records.append(
            {
                "gbif_uuid": best_match["key"],
                "title": best_match["title"],
                "obis_id": obis_id,
                "doi": best_match.get("doi"),
            },
        )

    # Passing the column names keeps the frame usable when nothing matched.
    df_mapping = pd.DataFrame(mapping_records, columns=mapping_columns)

    # collect literature cited information for each matched GBIF dataset
    literature_frames = []
    for key in df_mapping["gbif_uuid"]:
        url = f"{base_url}/v1/literature/export?format=CSV&gbifDatasetKey={key}"
        df_literature = pd.read_csv(url)
        df_literature.columns = [f"literature_{col}" for col in df_literature.columns]
        df_literature["gbif_uuid"] = key
        literature_frames.append(df_literature)

    if literature_frames:
        # Most recently fetched dataset first, matching the previous row order.
        df_gbif = pd.concat(literature_frames[::-1], ignore_index=True)
    else:
        df_gbif = pd.DataFrame(columns=["gbif_uuid"])

    # merge the OBIS and GBIF data frames together
    df_obis = df_obis.merge(df_mapping, on="obis_id")

    return df_gbif.merge(df_obis, on="gbif_uuid")




def update_metrics(*, debug=False):
"""Load previous metrics and update the spreadsheet."""
Expand Down
21 changes: 21 additions & 0 deletions tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,24 @@ def test_update_metrics():
"""Runs update_metrics in debug to log any possible issues with the scraping."""
df = update_metrics(debug=True)
df.to_csv("updated_metrics.csv")

def test_mbon_stats():
    """Check that mbon_stats returns a non-empty DataFrame with the expected columns."""
    df = ioos_metrics.mbon_stats()
    columns = [
        "literature_title",
        "literature_authors",
        "literature_source",
        "literature_discovered",
        "literature_published",
        "literature_open_access",
        "literature_peer_review",
        "literature_citation_type",
        "literature_countries_of_coverage",
        "literature_countries_of_researcher",
        "literature_keywords",
        "literature_literature_type",
        "literature_websites",
        "literature_identifiers",
        "literature_id",
        "literature_abstract",
        "literature_topics",
        "literature_added",
        "literature_gbif_download_key",
        "gbif_uuid",
        "obis_id",
        "obis_url",
        "obis_archive",
        "obis_published",
        "obis_created",
        "obis_updated",
        "obis_core",
        "obis_extensions",
        "obis_statistics",
        "obis_extent",
        "obis_title",
        "obis_citation",
        "obis_citation_id",
        "obis_abstract",
        "obis_intellectualrights",
        "obis_feed",
        "obis_institutes",
        "obis_contacts",
        "obis_nodes",
        "obis_keywords",
        "obis_downloads",
        "obis_records",
        "title",
        "doi",
    ]

    assert isinstance(df, pd.DataFrame)
    # Name the absent columns so a failure says exactly what changed upstream.
    missing = [col for col in columns if col not in df.columns]
    assert not missing, f"mbon_stats output is missing columns: {missing}"
    assert not df.empty