Skip to content

Commit

Permalink
Merge pull request #1 from gfbio/bms_provider_refactoring
Browse files Browse the repository at this point in the history
ABCD Provider Refactoring
  • Loading branch information
ChristianBeilschmidt authored Aug 26, 2019
2 parents 956474c + 146ad18 commit 98416e7
Show file tree
Hide file tree
Showing 31 changed files with 3,019 additions and 1,173 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Settings for debug
settings-example.toml
settings.toml
vat_abcd_crawler.log

# Created by https://www.gitignore.io/api/rust,clion+all
Expand Down
32 changes: 32 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
language: rust

rust:
- stable

services:
- postgresql

addons:
postgresql: "11.2"

cache: cargo

before_install:
- sudo apt-get update
- sudo apt-get --yes remove postgresql\*
- sudo apt-get install -y postgresql-11 postgresql-client-11
- sudo cp /etc/postgresql/{9.6,11}/main/pg_hba.conf
- sudo service postgresql restart 11

before_script:
- psql -c 'create database travis_ci_test;' -U postgres
- touch settings.toml
- echo '[database]' >> settings.toml
- echo 'database = "travis_ci_test"' >> settings.toml
- echo 'tls = false' >> settings.toml
- echo 'user = "postgres"' >> settings.toml
- echo 'password = ""' >> settings.toml

script:
- cargo build --verbose --all
- cargo test --verbose --all
6 changes: 5 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ config = { version = "0.9", features = ["toml"] }
failure = "0.1"
failure_derive = "0.1"
log = "0.4"
postgres = { version = "0.15", features = ['with-openssl'] }
postgres = "0.15"
postgres-openssl = "0.1"
quick-xml = "0.13"
reqwest = "0.9"
serde = { version = "1.0", features = ["derive"] }
Expand All @@ -20,3 +21,6 @@ sha1 = "0.6"
simplelog = "0.5"
tempfile = "3.0"
zip = "0.5"

[dev-dependencies]
mockito = "0.17.1"
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
[![Build Status](https://travis-ci.org/gfbio/vat-abcd-crawler.svg?branch=master)](https://travis-ci.org/gfbio/vat-abcd-crawler)

# VAT ABCD Crawler

This repository contains the ABCD crawler for the VAT system.
Expand Down
21 changes: 13 additions & 8 deletions settings.toml → settings-default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@ dataset_limit = 3

[abcd]
fields_file = "abcd-fields.json"
landing_page_field = "/DataSets/DataSet/Metadata/Description/Representation/URI"
storage_dir = "raw_data"

[bms]
monitor_url = "http://bms.gfbio.org/services/xml-archives/?provider=&dsa="
provider_url = "https://bms.gfbio.org/services/providers/?provider=&name="
landing_page_url = "http://bms.gfbio.org/services/landingpages/?output=json"
[pangaea]
search_url = "https://ws.pangaea.de/es/dataportal-gfbio/pansimple/_search"
scroll_url = "https://ws.pangaea.de/es/_search/scroll"

[terminology_service]
landingpage_url = "https://terminologies.gfbio.org/tools/landingpages/landingpage.php"

[database]
host = "localhost"
Expand All @@ -22,14 +26,15 @@ database = ""
user = ""
password = ""
schema = ""
dataset_table = ""
temp_dataset_table = ""
dataset_table = "abcd_datasets"
temp_dataset_table = "abcd_datasets_temp"
surrogate_key_column = "surrogate_key"
dataset_id_column = "dataset_id"
dataset_path_column = "dataset_path"
dataset_landing_page_column = "dataset_landing_page"
dataset_provider_column = "dataset_provider"
unit_table = ""
temp_unit_table = ""
unit_table = "abcd_units"
temp_unit_table = "abcd_units_temp"
listing_view = "dataset_listing"
unit_indexed_columns = [
"/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LongitudeDecimal",
Expand Down
142 changes: 142 additions & 0 deletions src/abcd/abcd_fields.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
use std::collections::hash_map::Values;
use std::collections::HashMap;
use std::fs::File;
use std::io::BufReader;
use std::path::Path;

use failure::Error;
use serde::{Deserialize, Serialize};

/// This struct reflects a field within the ABCD fields specification file.
///
/// The JSON file uses camelCase keys (e.g. `vatMandatory`), mapped onto the
/// snake_case fields below via the serde rename attribute.
#[derive(Debug, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct AbcdField {
    // Full slash-separated XML path of the field, e.g. `/DataSets/DataSet/DatasetGUID`.
    pub name: String,
    // Whether the field holds numeric values.
    pub numeric: bool,
    // Whether the field is mandatory for the VAT system.
    pub vat_mandatory: bool,
    // Whether the field is mandatory for GFBio.
    pub gfbio_mandatory: bool,
    // NOTE(review): appears to mark dataset-level (as opposed to per-unit)
    // fields, judging by the test fixture below — confirm against the spec.
    pub global_field: bool,
    // Unit of measurement; empty string when the field has none.
    pub unit: String,
}

/// Alias for the byte representation of a field name, used as the map key
/// so lookups can be performed directly with `&[u8]` slices.
type BinaryString = Vec<u8>;

/// The collection of all `AbcdField`s from the specification file,
/// keyed by the bytes of each field's name.
#[derive(Debug)]
pub struct AbcdFields {
    fields: HashMap<BinaryString, AbcdField>,
}

impl AbcdFields {
pub fn from_path(path: &Path) -> Result<Self, Error> {
let file = File::open(path)?;
let reader = BufReader::new(file);

Ok(Self {
fields: Self::fields_to_map(serde_json::from_reader(reader)?),
})
}

/// This function creates a map from binary field name to `AbcdField` from a list of `AbcdField`s.
fn fields_to_map(fields: Vec<AbcdField>) -> HashMap<Vec<u8>, AbcdField> {
let mut map = HashMap::with_capacity(fields.len());
for field in fields {
map.insert(field.name.as_bytes().into(), field);
}
map
}

pub fn value_of(&self, field: &[u8]) -> Option<&AbcdField> {
self.fields.get(field)
}

pub fn len(&self) -> usize {
self.fields.len()
}
}

/// Allows iterating over references to all `AbcdField`s
/// (in arbitrary hash-map order) via `for field in &abcd_fields { … }`.
impl<'a> IntoIterator for &'a AbcdFields {
    type Item = &'a AbcdField;
    type IntoIter = Values<'a, BinaryString, AbcdField>;

    fn into_iter(self) -> Self::IntoIter {
        self.fields.values()
    }
}

#[cfg(test)]
mod tests {
    use tempfile::TempPath;

    use crate::test_utils;

    use super::*;

    /// Both entries of the test file must deserialize with all attributes intact.
    #[test]
    fn simple_file() {
        let path = create_test_file_path();

        let abcd_fields = AbcdFields::from_path(&path).expect("Unable to deserialize input.");

        assert_eq!(abcd_fields.len(), 2);

        // `&[u8; N]` coerces to `&[u8]` at the call site, so the previous
        // `.to_vec()` allocations are unnecessary.
        let field1 = abcd_fields
            .value_of(b"/DataSets/DataSet/DatasetGUID")
            .expect("Field not found");
        assert_eq!(field1.name, "/DataSets/DataSet/DatasetGUID");
        // Plain `assert!` over `assert_eq!(…, bool)` (clippy: bool_assert_comparison).
        assert!(!field1.numeric);
        assert!(!field1.vat_mandatory);
        assert!(!field1.gfbio_mandatory);
        assert!(field1.global_field);
        assert!(field1.unit.is_empty());

        let field2 = abcd_fields
            .value_of(b"/DataSets/DataSet/Units/Unit/SourceInstitutionID")
            .expect("Field not found");
        assert_eq!(
            field2.name,
            "/DataSets/DataSet/Units/Unit/SourceInstitutionID"
        );
        assert!(!field2.numeric);
        assert!(field2.vat_mandatory);
        assert!(field2.gfbio_mandatory);
        assert!(!field2.global_field);
        assert_eq!(field2.unit, "TEST");
    }

    /// Iterating `&AbcdFields` must visit every loaded field exactly once.
    #[test]
    fn iterate_values() {
        let path = create_test_file_path();

        let abcd_fields = AbcdFields::from_path(&path).expect("Unable to deserialize input.");

        // Count through the `IntoIterator` impl instead of a manual counter loop.
        assert_eq!((&abcd_fields).into_iter().count(), 2);
    }

    /// Writes a minimal two-field ABCD spec to a temp file and returns its path.
    fn create_test_file_path() -> TempPath {
        test_utils::create_temp_file(
            r#"[
                {
                    "name": "/DataSets/DataSet/DatasetGUID",
                    "numeric": false,
                    "vatMandatory": false,
                    "gfbioMandatory": false,
                    "globalField": true,
                    "unit": ""
                },
                {
                    "name": "/DataSets/DataSet/Units/Unit/SourceInstitutionID",
                    "numeric": false,
                    "vatMandatory": true,
                    "gfbioMandatory": true,
                    "globalField": false,
                    "unit": "TEST"
                }
            ]"#,
        )
    }
}
Loading

0 comments on commit 98416e7

Please sign in to comment.