Commit
Merge pull request #74 from MathewBiddle/mbon_stats
starting mbon_stats
laurabrenskelle authored May 23, 2024
2 parents b131d12 + 94024f3 commit fb0aa66
Showing 4 changed files with 76 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
uses: mamba-org/setup-micromamba@v1
with:
init-shell: bash
- environment-file: conda-lock.yml
+ environment-file: environment.yml
environment-name: TEST

- name: Tests
1 change: 1 addition & 0 deletions environment.yml
@@ -16,5 +16,6 @@ dependencies:
- pdfminer.six
- pyarrow
- pytest
- pyobis
- requests
- suds
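
The pyobis dependency added here is what drives the OBIS dataset lookup introduced in ioos_metrics.py below. A minimal sanity-check sketch, mirroring the call used there (the institution id 23070 comes from the new function; printing the raw results is just for illustration):

import pandas as pd
import pyobis

# Search OBIS for datasets affiliated with institution 23070 (MBON), as mbon_stats() does below.
query = pyobis.dataset.search(instituteid=23070)
df = pd.DataFrame(query.execute())

# Each entry in the "results" column describes one OBIS dataset.
print(df["results"].head())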
53 changes: 53 additions & 0 deletions ioos_metrics/ioos_metrics.py
@@ -554,6 +554,59 @@ def hf_radar_installations():
    # This is a hardcoded number at the moment!
    return 165

@functools.lru_cache(maxsize=128)
def mbon_stats():
    """
    Collect download and citation statistics about MBON affiliated datasets shared with the Ocean Biodiversity
    Information System (OBIS) and the Global Biodiversity Information Facility (GBIF).
    Returns a dataframe with one row per paper citing a dataset.
    """
    import urllib.parse

    import pyobis

    # collect dataset information from OBIS
    institution_id = 23070
    query = pyobis.dataset.search(instituteid=institution_id)
    df = pd.DataFrame(query.execute())
    df_obis = pd.DataFrame.from_records(df["results"])
    df_obis.columns = [f'obis_{col}' for col in df_obis.columns]

    # iterate through each OBIS dataset to gather its uuid from GBIF
    # and build a mapping table between the two systems
    df_mapping = pd.DataFrame()
    base_url = 'https://api.gbif.org'
    for title in df_obis['obis_title']:
        query = f'{base_url}/v1/dataset/search?q={urllib.parse.quote(title)}'
        df = pd.read_json(query, orient='index').T

        # build a DataFrame with the info we need more accessible
        df_mapping = pd.concat([df_mapping, pd.DataFrame({
            'gbif_uuid': [df['results'].values[0][0]['key']],
            'title': [df['results'].values[0][0]['title']],
            'obis_id': [df_obis.loc[df_obis['obis_title'] == title, 'obis_id'].to_string(index=False)],
            'doi': [df['results'].values[0][0]['doi']],
        })], ignore_index=True)

    # collect literature cited information from GBIF for each dataset
    df_gbif = pd.DataFrame()
    for key in df_mapping['gbif_uuid']:
        url = f'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={key}'
        df2 = pd.read_csv(url)
        df2.columns = ['literature_' + str(col) for col in df2.columns]
        df2['gbif_uuid'] = key

        df_gbif = pd.concat([df2, df_gbif], ignore_index=True)

    # merge the OBIS and GBIF data frames together
    df_obis = df_obis.merge(df_mapping, on='obis_id')

    df_out = df_gbif.merge(df_obis, on='gbif_uuid')

    return df_out




def update_metrics(*, debug=False):
"""Load previous metrics and update the spreadsheet."""
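With the new helper in place, the merged OBIS/GBIF table can be queried directly. A minimal usage sketch, assuming the module is importable the same way the test below uses it; the per-dataset citation count is illustrative, not part of this change:

import pandas as pd

from ioos_metrics import ioos_metrics

# One row per paper citing an MBON-affiliated dataset.
df = ioos_metrics.mbon_stats()

# Illustrative summary: number of citing papers per dataset title.
citations_per_dataset = (
    df.groupby("title")["literature_title"]
    .count()
    .sort_values(ascending=False)
)
print(citations_per_dataset.head())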
21 changes: 21 additions & 0 deletions tests/test_metrics.py
@@ -73,3 +73,24 @@ def test_update_metrics():
"""Runs update_metrics in debug to log any possibles issues with the scrapping."""
df = update_metrics(debug=True)
df.to_csv("updated_metrics.csv")

def test_mbon_stats():
    df = ioos_metrics.mbon_stats()
    columns = ['literature_title', 'literature_authors', 'literature_source',
               'literature_discovered', 'literature_published',
               'literature_open_access', 'literature_peer_review',
               'literature_citation_type', 'literature_countries_of_coverage',
               'literature_countries_of_researcher', 'literature_keywords',
               'literature_literature_type', 'literature_websites',
               'literature_identifiers', 'literature_id', 'literature_abstract',
               'literature_topics', 'literature_added', 'literature_gbif_download_key',
               'gbif_uuid', 'obis_id', 'obis_url', 'obis_archive', 'obis_published',
               'obis_created', 'obis_updated', 'obis_core', 'obis_extensions',
               'obis_statistics', 'obis_extent', 'obis_title', 'obis_citation',
               'obis_citation_id', 'obis_abstract', 'obis_intellectualrights',
               'obis_feed', 'obis_institutes', 'obis_contacts', 'obis_nodes',
               'obis_keywords', 'obis_downloads', 'obis_records', 'title', 'doi']

    assert isinstance(df, pd.DataFrame)
    assert all([col in df.columns for col in columns])
    assert not df.empty
