From 91b17db41a68635b857601c5cc384a53c3f8ef9d Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Fri, 24 May 2019 15:48:15 +0200 Subject: [PATCH 01/31] added mockito and refactored BmsProvider(s) --- Cargo.toml | 1 + src/bms_providers.rs | 85 +++++++++++++++++++++------ src/main.rs | 137 +++++++++++++++++++++++++++---------------- 3 files changed, 156 insertions(+), 67 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2035e76..1c80e9f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ config = { version = "0.9", features = ["toml"] } failure = "0.1" failure_derive = "0.1" log = "0.4" +mockito = "0.17.1" postgres = { version = "0.15", features = ['with-openssl'] } quick-xml = "0.13" reqwest = "0.9" diff --git a/src/bms_providers.rs b/src/bms_providers.rs index 6d26669..0929f9c 100644 --- a/src/bms_providers.rs +++ b/src/bms_providers.rs @@ -1,6 +1,6 @@ use failure::Error; -use std::collections::HashMap; use serde::Deserialize; +use std::collections::HashMap; /// This struct contains all provider information. /// The identifier is the `url`, strange as it seems. @@ -13,23 +13,74 @@ pub struct BmsProvider { pub biocase_url: String, } -/// This function downloads a list of providers from the BMS. -pub fn load_bms_providers(url: &str) -> Result, Error> { - Ok( - reqwest::Client::new() - .get(url) - .send()? - .json()? - ) +#[derive(Debug)] +pub struct BmsProviders { + providers: HashMap, } -/// This function downloads the BMS providers and provides them -/// as a map from `url`to `BmsProvider`. -pub fn load_bms_providers_as_map(url: &str) -> Result, Error> { - let providers = load_bms_providers(url)?; - Ok( - providers.into_iter() +impl BmsProviders { + pub fn from_url(url: &str) -> Result { + let providers: Vec = reqwest::Client::new().get(url).send()?.json()?; + let provider_map = providers + .into_iter() .map(|provider| (provider.url.clone(), provider)) - .collect() - ) + .collect(); + Ok(Self { + providers: provider_map, + }) + } + + pub fn get(&self, url: &str) -> Option<&BmsProvider> { + self.providers.get(url) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mockito::{mock, Mock}; + + #[test] + fn downloads_providers() { + let _webserver = create_json_webserver(r#" + [ + { + "id": "6", + "shortname": "BGBM", + "name": "Botanic Garden and Botanical Museum Berlin, Freie Universit\u00e4t Berlin", + "url": "www.bgbm.org", + "biocase_url": "https:\/\/ww3.bgbm.org\/biocase\/" + }, + { + "id": "5", + "shortname": "DSMZ", + "name": "Leibniz Institute DSMZ \u2013 German Collection of Microorganisms and Cell Cultures, Braunschweig", + "url": "www.dsmz.de", + "biocase_url": "http:\/\/biocase.dsmz.de\/wrappers\/biocase" + } + ]"# + ); + + let bms_providers = match BmsProviders::from_url(&mockito::server_url()) { + Ok(providers) => providers, + Err(error) => panic!(error), + }; + + let bgbm = bms_providers.get("www.bgbm.org"); + assert!(bgbm.is_some()); + assert_eq!(bgbm.unwrap().id, "6"); + + let dsmz = bms_providers.get("www.dsmz.de"); + assert!(dsmz.is_some()); + assert_eq!(dsmz.unwrap().id, "5"); + + assert!(bms_providers.get("").is_none()); + } + + fn create_json_webserver(json_string: &str) -> Mock { + mock("GET", "/") + .with_header("content-type", "application/json") + .with_body(json_string) + .create() + } } diff --git a/src/main.rs b/src/main.rs index edb4e15..18a5e0d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,21 @@ +use std::fs::File; +use std::path::Path; + +use clap::{crate_authors, crate_description, crate_version, App, Arg}; +use 
failure::Error; +use log::{error, info, trace, warn}; +use simplelog::{CombinedLogger, SharedLogger, TermLogger, WriteLogger}; + +use settings::Settings; + +use crate::abcd_fields::load_abcd_fields; +use crate::abcd_parser::AbcdParser; +use crate::archive_reader::ArchiveReader; +use crate::bms_datasets::download_datasets; +use crate::bms_datasets::load_bms_datasets; +use crate::bms_providers::BmsProviders; +use crate::database_sink::DatabaseSink; + mod abcd_fields; mod abcd_parser; mod abcd_version; @@ -8,42 +26,32 @@ mod database_sink; mod settings; mod vat_type; -use clap::{App, Arg, crate_authors, crate_description, crate_version}; -use crate::abcd_fields::load_abcd_fields; -use crate::abcd_parser::AbcdParser; -use crate::archive_reader::ArchiveReader; -use crate::bms_datasets::download_datasets; -use crate::bms_datasets::load_bms_datasets; -use crate::database_sink::DatabaseSink; -use failure::Error; -use log::{info, trace, warn, error}; -use settings::Settings; -use simplelog::{CombinedLogger, SharedLogger, TermLogger, WriteLogger}; -use std::fs::File; -use std::path::Path; -use crate::bms_providers::load_bms_providers_as_map; - fn main() { let matches = App::new("VAT ABCD Crawler") .version(crate_version!()) .author(crate_authors!()) .about(crate_description!()) - .arg(Arg::with_name("settings") - .index(1) - .short("s") - .long("settings") - .value_name("SETTINGS") - .help("Specify the settings file") - .required(true) - .takes_value(true)) + .arg( + Arg::with_name("settings") + .index(1) + .short("s") + .long("settings") + .value_name("SETTINGS") + .help("Specify the settings file") + .required(true) + .takes_value(true), + ) .get_matches(); let settings_path = Path::new( - matches.value_of("settings").expect("There must be a settings path specified.") + matches + .value_of("settings") + .expect("There must be a settings path specified."), ); let settings = Settings::new(settings_path).expect("Unable to use config file."); - initialize_logger(Path::new(&settings.general.log_file), &settings).expect("Unable to initialize logger."); + initialize_logger(Path::new(&settings.general.log_file), &settings) + .expect("Unable to initialize logger."); let temp_dir = match tempfile::tempdir() { Ok(dir) => dir, @@ -69,7 +77,7 @@ fn main() { } }; - let bms_providers = match load_bms_providers_as_map(&settings.bms.provider_url) { + let bms_providers = match BmsProviders::from_url(&settings.bms.provider_url) { Ok(providers) => providers, Err(e) => { error!("Unable to download providers from BMS: {}", e); @@ -88,8 +96,21 @@ fn main() { let mut abcd_parser = AbcdParser::new(&abcd_fields); for path_result in download_datasets(temp_dir.path(), &bms_datasets) - .skip(settings.debug.dataset_start.filter(|_| settings.general.debug).unwrap_or(std::usize::MIN)) - .take(settings.debug.dataset_limit.filter(|_| settings.general.debug).unwrap_or(std::usize::MAX)) { + .skip( + settings + .debug + .dataset_start + .filter(|_| settings.general.debug) + .unwrap_or(std::usize::MIN), + ) + .take( + settings + .debug + .dataset_limit + .filter(|_| settings.general.debug) + .unwrap_or(std::usize::MAX), + ) + { let download = match path_result { Ok(d) => d, Err(e) => { @@ -98,18 +119,24 @@ fn main() { } }; trace!("Temp file: {}", download.path.display()); - info!("Processing `{}` @ `{}` ({})", - download.dataset.dataset, - download.dataset.provider_datacenter, - download.dataset.get_latest_archive() - .map(|archive| archive.xml_archive.as_str()) - .unwrap_or_else(|_| "-") + info!( + "Processing `{}` @ `{}` ({})", + 
download.dataset.dataset, + download.dataset.provider_datacenter, + download + .dataset + .get_latest_archive() + .map(|archive| archive.xml_archive.as_str()) + .unwrap_or_else(|_| "-") ); let bms_provider = match bms_providers.get(&download.dataset.provider_url) { Some(provider) => provider, None => { - warn!("Unable to retrieve BMS provider from map for {}", download.dataset.provider_url); + warn!( + "Unable to retrieve BMS provider from map for {}", + download.dataset.provider_url + ); continue; } }; @@ -117,12 +144,18 @@ fn main() { let landing_page = match download.dataset.get_landing_page(&settings, &bms_provider) { Ok(landing_page) => landing_page, Err(e) => { - warn!("Unable to generate landing page for {}; {}", download.dataset.dataset, e); + warn!( + "Unable to generate landing page for {}; {}", + download.dataset.dataset, e + ); continue; } }; - for xml_bytes_result in ArchiveReader::from_path(&download.path).unwrap().bytes_iter() { + for xml_bytes_result in ArchiveReader::from_path(&download.path) + .unwrap() + .bytes_iter() + { let xml_bytes = match xml_bytes_result { Ok(bytes) => bytes, Err(e) => { @@ -131,14 +164,16 @@ fn main() { } }; -// let mut string = String::from_utf8(xml_bytes).unwrap(); -// string.truncate(200); -// dbg!(string); + // let mut string = String::from_utf8(xml_bytes).unwrap(); + // string.truncate(200); + // dbg!(string); - let abcd_data = match abcd_parser.parse(&download.url, - &landing_page, - &bms_provider.name, - &xml_bytes) { + let abcd_data = match abcd_parser.parse( + &download.url, + &landing_page, + &bms_provider.name, + &xml_bytes, + ) { Ok(data) => data, Err(e) => { warn!("Unable to retrieve ABCD data: {}", e); @@ -147,9 +182,9 @@ fn main() { }; trace!("{:?}", abcd_data.dataset); -// for unit in abcd_data.units { -// trace!("{:?}", unit); -// } + // for unit in abcd_data.units { + // trace!("{:?}", unit); + // } match database_sink.insert_dataset(&abcd_data) { Ok(_) => (), @@ -179,9 +214,11 @@ fn initialize_logger(file_path: &Path, settings: &Settings) -> Result<(), Error> } if let Ok(file) = File::create(file_path) { - loggers.push( - WriteLogger::new(log_level, simplelog::Config::default(), file) - ); + loggers.push(WriteLogger::new( + log_level, + simplelog::Config::default(), + file, + )); } CombinedLogger::init(loggers)?; From ff15fdedb72f1845c71973163111fbebc065cbc2 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Mon, 27 May 2019 15:21:18 +0200 Subject: [PATCH 02/31] refactoring of get method to value_of --- src/bms_providers.rs | 8 ++++---- src/main.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bms_providers.rs b/src/bms_providers.rs index 0929f9c..f9735cd 100644 --- a/src/bms_providers.rs +++ b/src/bms_providers.rs @@ -30,7 +30,7 @@ impl BmsProviders { }) } - pub fn get(&self, url: &str) -> Option<&BmsProvider> { + pub fn value_of(&self, url: &str) -> Option<&BmsProvider> { self.providers.get(url) } } @@ -66,15 +66,15 @@ mod tests { Err(error) => panic!(error), }; - let bgbm = bms_providers.get("www.bgbm.org"); + let bgbm = bms_providers.value_of("www.bgbm.org"); assert!(bgbm.is_some()); assert_eq!(bgbm.unwrap().id, "6"); - let dsmz = bms_providers.get("www.dsmz.de"); + let dsmz = bms_providers.value_of("www.dsmz.de"); assert!(dsmz.is_some()); assert_eq!(dsmz.unwrap().id, "5"); - assert!(bms_providers.get("").is_none()); + assert!(bms_providers.value_of("").is_none()); } fn create_json_webserver(json_string: &str) -> Mock { diff --git a/src/main.rs b/src/main.rs index 18a5e0d..1a97dc5 100644 
--- a/src/main.rs +++ b/src/main.rs @@ -130,7 +130,7 @@ fn main() { .unwrap_or_else(|_| "-") ); - let bms_provider = match bms_providers.get(&download.dataset.provider_url) { + let bms_provider = match bms_providers.value_of(&download.dataset.provider_url) { Some(provider) => provider, None => { warn!( From 06110e64a8f4e3a4b84bee2b5abb036c86e2bda6 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Mon, 27 May 2019 16:30:15 +0200 Subject: [PATCH 03/31] refactored abcd_fields --- src/abcd_fields.rs | 147 ++++++++++++++++++++--- src/abcd_parser.rs | 75 ++++++------ src/database_sink.rs | 273 ++++++++++++++++++++++++++----------------- src/main.rs | 6 +- 4 files changed, 342 insertions(+), 159 deletions(-) diff --git a/src/abcd_fields.rs b/src/abcd_fields.rs index 9c668ad..f96712f 100644 --- a/src/abcd_fields.rs +++ b/src/abcd_fields.rs @@ -1,9 +1,11 @@ -use failure::Error; -use std::path::Path; +use std::collections::hash_map::Values; +use std::collections::HashMap; use std::fs::File; use std::io::BufReader; +use std::path::Path; + +use failure::Error; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; /// This struct reflect a field within the ABCD fields specification file. #[derive(Debug, Deserialize, Serialize)] @@ -17,19 +19,132 @@ pub struct AbcdField { pub unit: String, } -/// This function loads all `AbcdField`s from a given file path. -/// It returns a map from the binary field name to the `AbcdField`. -pub fn load_abcd_fields(path: &Path) -> Result, AbcdField>, Error> { - let file = File::open(path)?; - let reader = BufReader::new(file); - Ok(fields_to_map(serde_json::from_reader(reader)?)) +type BinaryString = Vec; + +#[derive(Debug)] +pub struct AbcdFields { + fields: HashMap, } -/// This function creates a map from binary field name to `AbcdField` from a list of `AbcdField`s. -fn fields_to_map(fields: Vec) -> HashMap, AbcdField> { - let mut map = HashMap::with_capacity(fields.len()); - for field in fields { - map.insert(field.name.as_bytes().into(), field); +impl AbcdFields { + pub fn from_path(path: &Path) -> Result { + let file = File::open(path)?; + let reader = BufReader::new(file); + + Ok(Self { + fields: Self::fields_to_map(serde_json::from_reader(reader)?), + }) + } + + /// This function creates a map from binary field name to `AbcdField` from a list of `AbcdField`s. 
+ fn fields_to_map(fields: Vec) -> HashMap, AbcdField> { + let mut map = HashMap::with_capacity(fields.len()); + for field in fields { + map.insert(field.name.as_bytes().into(), field); + } + map } - map -} \ No newline at end of file + + pub fn value_of(&self, field: &[u8]) -> Option<&AbcdField> { + self.fields.get(field) + } + + pub fn len(&self) -> usize { + self.fields.len() + } +} + +impl<'a> IntoIterator for &'a AbcdFields { + type Item = &'a AbcdField; + type IntoIter = Values<'a, BinaryString, AbcdField>; + + fn into_iter(self) -> Self::IntoIter { + self.fields.values() + } +} + +#[cfg(test)] +mod tests { + use std::io::Write; + + use tempfile::{NamedTempFile, TempPath}; + + use super::*; + + #[test] + fn simple_file() { + let path = test_file_path(); + + let abcd_fields = AbcdFields::from_path(&path).expect("Unable to deserialize input."); + + assert_eq!(abcd_fields.len(), 2); + + let field1 = abcd_fields + .value_of(&b"/DataSets/DataSet/DatasetGUID".to_vec()) + .expect("Field not found"); + assert_eq!(field1.name, "/DataSets/DataSet/DatasetGUID"); + assert_eq!(field1.numeric, false); + assert_eq!(field1.vat_mandatory, false); + assert_eq!(field1.gfbio_mandatory, false); + assert_eq!(field1.global_field, true); + assert!(field1.unit.is_empty()); + + let field2 = abcd_fields + .value_of(&b"/DataSets/DataSet/Units/Unit/SourceInstitutionID".to_vec()) + .expect("Field not found"); + assert_eq!( + field2.name, + "/DataSets/DataSet/Units/Unit/SourceInstitutionID" + ); + assert_eq!(field2.numeric, false); + assert_eq!(field2.vat_mandatory, true); + assert_eq!(field2.gfbio_mandatory, true); + assert_eq!(field2.global_field, false); + assert_eq!(field2.unit, "TEST"); + } + + #[test] + fn iterate_values() { + let path = test_file_path(); + + let abcd_fields = AbcdFields::from_path(&path).expect("Unable to deserialize input."); + + let mut number_of_fields = 0; + for _field in &abcd_fields { + number_of_fields += 1; + } + + assert_eq!(number_of_fields, 2); + } + + fn test_file_path() -> TempPath { + create_temp_file( + r#"[ + { + "name": "/DataSets/DataSet/DatasetGUID", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": false, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Units/Unit/SourceInstitutionID", + "numeric": false, + "vatMandatory": true, + "gfbioMandatory": true, + "globalField": false, + "unit": "TEST" + } + ]"#, + ) + } + + fn create_temp_file(content: &str) -> TempPath { + let mut file = NamedTempFile::new().expect("Unable to create file to test."); + + write!(file, "{}", content).expect("Unable to write content to test file."); + + file.into_temp_path() + } +} diff --git a/src/abcd_parser.rs b/src/abcd_parser.rs index 6e2fe28..484dea0 100644 --- a/src/abcd_parser.rs +++ b/src/abcd_parser.rs @@ -1,18 +1,20 @@ -use crate::abcd_fields::AbcdField; -use crate::abcd_version::AbcdVersion; -use crate::vat_type::VatType; +use std::collections::HashMap; + use failure::Error; use failure::Fail; use quick_xml::events::Event; use quick_xml::Reader; -use std::collections::HashMap; + +use crate::abcd_fields::AbcdFields; +use crate::abcd_version::AbcdVersion; +use crate::vat_type::VatType; pub type ValueMap = HashMap; /// This parser processes ABCD XML files. #[derive(Debug)] pub struct AbcdParser<'a> { - abcd_fields: &'a HashMap, AbcdField>, + abcd_fields: &'a AbcdFields, abcd_version: AbcdVersion, xml_tag_path: Vec, xml_buffer: Vec, @@ -21,7 +23,7 @@ pub struct AbcdParser<'a> { impl<'a> AbcdParser<'a> { /// Create a new `AbcdParser`. 
- pub fn new(abcd_fields: &'a HashMap, AbcdField>) -> Self { + pub fn new(abcd_fields: &'a AbcdFields) -> Self { Self { abcd_fields, abcd_version: AbcdVersion::Unknown, @@ -32,11 +34,13 @@ impl<'a> AbcdParser<'a> { } /// Parse a binary XML file to `AbcdResult`s. - pub fn parse(&mut self, - dataset_path: &str, - landing_page: &str, - provider_id: &str, - xml_bytes: &[u8]) -> Result { + pub fn parse( + &mut self, + dataset_path: &str, + landing_page: &str, + provider_id: &str, + xml_bytes: &[u8], + ) -> Result { let mut xml_reader = Reader::from_reader(xml_bytes); xml_reader.trim_text(true); @@ -49,7 +53,7 @@ impl<'a> AbcdParser<'a> { self.xml_tag_path.push(b'/'); self.xml_tag_path.extend(Self::strip_tag(e.name())); -// debug!("XML START: {}", String::from_utf8_lossy(&self.xml_tag_path)); + // debug!("XML START: {}", String::from_utf8_lossy(&self.xml_tag_path)); match self.xml_tag_path.as_slice() { b"/DataSets" => { @@ -67,13 +71,13 @@ impl<'a> AbcdParser<'a> { } } -// dbg!(&abcd_version); + // dbg!(&abcd_version); } b"/DataSets/DataSet/Units" => { -// eprintln!("Dataset Metadata:"); -// dbg!(&numeric_values); -// dbg!(&textual_values); -// dbg!(units); + // eprintln!("Dataset Metadata:"); + // dbg!(&numeric_values); + // dbg!(&textual_values); + // dbg!(units); dataset_data = Some(self.finish_map()) } @@ -86,25 +90,24 @@ impl<'a> AbcdParser<'a> { let tag: Vec = Self::strip_tag(e.name()).cloned().collect(); let stripped_name_length = tag.len(); - self.xml_tag_path.truncate(self.xml_tag_path.len() - stripped_name_length - SEPARATOR_LENGTH); + self.xml_tag_path.truncate( + self.xml_tag_path.len() - stripped_name_length - SEPARATOR_LENGTH, + ); if self.xml_tag_path == b"/DataSets/DataSet/Units" && tag == b"Unit" { -// eprintln!("Unit Data:"); -// dbg!(&numeric_values); -// dbg!(&textual_values); + // eprintln!("Unit Data:"); + // dbg!(&numeric_values); + // dbg!(&textual_values); units.push(self.finish_map()); } } Ok(Event::Text(ref e)) => { - if let Some(abcd_field) = self.abcd_fields.get(&self.xml_tag_path) { + if let Some(abcd_field) = self.abcd_fields.value_of(&self.xml_tag_path) { if abcd_field.numeric { let string = String::from_utf8_lossy(e.escaped()); if let Ok(number) = string.parse::() { - self.values.insert( - abcd_field.name.clone(), - number.into(), - ); + self.values.insert(abcd_field.name.clone(), number.into()); } } else { self.values.insert( @@ -115,7 +118,11 @@ impl<'a> AbcdParser<'a> { } } Ok(Event::Eof) => break, // exits the loop when reaching end of file - Err(e) => panic!("Error at position {}: {:?}", xml_reader.buffer_position(), e), + Err(e) => panic!( + "Error at position {}: {:?}", + xml_reader.buffer_position(), + e + ), _ => (), // ignore all other events } @@ -152,7 +159,7 @@ impl<'a> AbcdParser<'a> { } /// Strip the namespace from a tag. - fn strip_tag(tag: &[u8]) -> impl Iterator { + fn strip_tag(tag: &[u8]) -> impl Iterator { let has_colon = tag.iter().any(|&b| b == b':'); tag.iter() .skip_while(move |&&b| has_colon && b != b':') @@ -171,11 +178,13 @@ pub struct AbcdResult { impl AbcdResult { /// This constructor creates a new `AbcdResult` from dataset and unit data. 
- pub fn new(dataset_path: String, - landing_page: String, - provider_id: String, - dataset_data: ValueMap, - units_data: Vec) -> Self { + pub fn new( + dataset_path: String, + landing_page: String, + provider_id: String, + dataset_data: ValueMap, + units_data: Vec, + ) -> Self { AbcdResult { dataset_path, landing_page, diff --git a/src/database_sink.rs b/src/database_sink.rs index e2a46eb..7397f76 100644 --- a/src/database_sink.rs +++ b/src/database_sink.rs @@ -1,19 +1,22 @@ -use crate::abcd_fields::AbcdField; -use crate::abcd_parser::AbcdResult; -use crate::abcd_parser::ValueMap; -use crate::settings; +use std::collections::hash_map::Entry; +use std::collections::HashMap; + +use csv::WriterBuilder; use failure::{Error, Fail}; use log::debug; use postgres::params::ConnectParams; use postgres::params::Host; use postgres::tls::openssl::OpenSsl; -use postgres::{Connection, TlsMode}; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use csv::WriterBuilder; use postgres::transaction::Transaction; +use postgres::{Connection, TlsMode}; -const POSTGRES_CSV_CONFIGURATION: &str = "DELIMITER '\t', NULL '', QUOTE '\"', ESCAPE '\"', FORMAT CSV"; +use crate::abcd_fields::AbcdFields; +use crate::abcd_parser::AbcdResult; +use crate::abcd_parser::ValueMap; +use crate::settings; + +const POSTGRES_CSV_CONFIGURATION: &str = + "DELIMITER '\t', NULL '', QUOTE '\"', ESCAPE '\"', FORMAT CSV"; /// A PostgreSQL database DAO for storing datasets. pub struct DatabaseSink<'s> { @@ -29,8 +32,10 @@ pub struct DatabaseSink<'s> { impl<'s> DatabaseSink<'s> { /// Create a new PostgreSQL database sink (DAO). - pub fn new(database_settings: &'s settings::Database, - abcd_fields: &HashMap, AbcdField>) -> Result { + pub fn new( + database_settings: &'s settings::Database, + abcd_fields: &AbcdFields, + ) -> Result { // create database connection params from the settings, including optional tls let negotiator = if database_settings.tls { Some(OpenSsl::new()?) @@ -49,7 +54,7 @@ impl<'s> DatabaseSink<'s> { let mut unit_fields = Vec::new(); let mut unit_fields_hash = Vec::new(); let mut hasher = sha1::Sha1::new(); - for field in abcd_fields.values() { + for field in abcd_fields { let hash = { hasher.reset(); hasher.update(field.name.as_bytes()); @@ -88,7 +93,7 @@ impl<'s> DatabaseSink<'s> { } /// Initialize the temporary database schema. - fn initialize_temporary_schema(&mut self, abcd_fields: &HashMap, AbcdField>) -> Result<(), Error> { + fn initialize_temporary_schema(&mut self, abcd_fields: &AbcdFields) -> Result<(), Error> { self.drop_temporary_tables()?; self.create_temporary_dataset_table(abcd_fields)?; @@ -103,11 +108,14 @@ impl<'s> DatabaseSink<'s> { /// Create and fill a temporary mapping table from hashes to field names. 
fn create_and_fill_temporary_mapping_table(&mut self) -> Result<(), Error> { // create table - self.connection.execute(&format!( + self.connection.execute( + &format!( "create table {schema}.{table}_translation (name text not null, hash text not null);", schema = self.database_settings.schema, table = self.database_settings.temp_dataset_table - ), &[])?; + ), + &[], + )?; // fill table let statement = self.connection.prepare(&format!( @@ -126,16 +134,22 @@ impl<'s> DatabaseSink<'s> { } /// Create the temporary unit table - fn create_temporary_unit_table(&mut self, abcd_fields: &HashMap, AbcdField>) -> Result<(), Error> { - let mut fields = vec![ - format!("{} int not null", self.database_settings.dataset_id_column), - ]; + fn create_temporary_unit_table(&mut self, abcd_fields: &AbcdFields) -> Result<(), Error> { + let mut fields = vec![format!( + "{} int not null", + self.database_settings.dataset_id_column + )]; for (field, hash) in self.unit_fields.iter().zip(&self.unit_fields_hash) { - let abcd_field = abcd_fields.get(field.as_bytes()) + let abcd_field = abcd_fields + .value_of(field.as_bytes()) .ok_or_else(|| DatabaseSinkError::InconsistentUnitColumns(field.clone()))?; - let data_type_string = if abcd_field.numeric { "double precision" } else { "text" }; + let data_type_string = if abcd_field.numeric { + "double precision" + } else { + "text" + }; // TODO: enforce/filter not null // let null_string = if abcd_field.vat_mandatory { "NOT NULL" } else { "" } @@ -144,30 +158,50 @@ impl<'s> DatabaseSink<'s> { fields.push(format!("\"{}\" {} {}", hash, data_type_string, null_string)); } - self.connection.execute(&format!( - "CREATE TABLE {schema}.{table} ( {fields} );", - schema = &self.database_settings.schema, - table = self.database_settings.temp_unit_table, - fields = fields.join(",") - ), &[])?; + self.connection.execute( + &format!( + "CREATE TABLE {schema}.{table} ( {fields} );", + schema = &self.database_settings.schema, + table = self.database_settings.temp_unit_table, + fields = fields.join(",") + ), + &[], + )?; Ok(()) } /// Create the temporary dataset table - fn create_temporary_dataset_table(&mut self, abcd_fields: &HashMap, AbcdField>) -> Result<(), Error> { + fn create_temporary_dataset_table(&mut self, abcd_fields: &AbcdFields) -> Result<(), Error> { let mut fields = vec![ - format!("{} int primary key", self.database_settings.dataset_id_column), // id - format!("{} text not null", self.database_settings.dataset_path_column), // path - format!("{} text not null", self.database_settings.dataset_landing_page_column), // landing page - format!("{} text not null", self.database_settings.dataset_provider_column), // provider name + format!( + "{} int primary key", + self.database_settings.dataset_id_column + ), // id + format!( + "{} text not null", + self.database_settings.dataset_path_column + ), // path + format!( + "{} text not null", + self.database_settings.dataset_landing_page_column + ), // landing page + format!( + "{} text not null", + self.database_settings.dataset_provider_column + ), // provider name ]; for (field, hash) in self.dataset_fields.iter().zip(&self.dataset_fields_hash) { - let abcd_field = abcd_fields.get(field.as_bytes()) + let abcd_field = abcd_fields + .value_of(field.as_bytes()) .ok_or_else(|| DatabaseSinkError::InconsistentDatasetColumns(field.clone()))?; - let data_type_string = if abcd_field.numeric { "double precision" } else { "text" }; + let data_type_string = if abcd_field.numeric { + "double precision" + } else { + "text" + }; // TODO: 
enforce/filter not null // let null_string = if abcd_field.vat_mandatory { "NOT NULL" } else { "" } @@ -176,12 +210,15 @@ impl<'s> DatabaseSink<'s> { fields.push(format!("\"{}\" {} {}", hash, data_type_string, null_string)); } - self.connection.execute(&format!( - "CREATE TABLE {schema}.{table} ( {fields} );", - schema = &self.database_settings.schema, - table = self.database_settings.temp_dataset_table, - fields = fields.join(",") - ), &[])?; + self.connection.execute( + &format!( + "CREATE TABLE {schema}.{table} ( {fields} );", + schema = &self.database_settings.schema, + table = self.database_settings.temp_dataset_table, + fields = fields.join(",") + ), + &[], + )?; Ok(()) } @@ -190,17 +227,23 @@ impl<'s> DatabaseSink<'s> { fn drop_temporary_tables(&mut self) -> Result<(), Error> { for statement in &[ // unit temp table - format!("DROP TABLE IF EXISTS {schema}.{table};", - schema = &self.database_settings.schema, - table = &self.database_settings.temp_unit_table), + format!( + "DROP TABLE IF EXISTS {schema}.{table};", + schema = &self.database_settings.schema, + table = &self.database_settings.temp_unit_table + ), // dataset temp table - format!("DROP TABLE IF EXISTS {schema}.{table};", - schema = &self.database_settings.schema, - table = &self.database_settings.temp_dataset_table), + format!( + "DROP TABLE IF EXISTS {schema}.{table};", + schema = &self.database_settings.schema, + table = &self.database_settings.temp_dataset_table + ), // translation temp table - format!("DROP TABLE IF EXISTS {schema}.{table}_translation;", - schema = &self.database_settings.schema, - table = &self.database_settings.temp_dataset_table), + format!( + "DROP TABLE IF EXISTS {schema}.{table}_translation;", + schema = &self.database_settings.schema, + table = &self.database_settings.temp_dataset_table + ), ] { self.connection.execute(statement, &[])?; } @@ -216,7 +259,7 @@ impl<'s> DatabaseSink<'s> { let transaction = self.connection.transaction_with( postgres::transaction::Config::new() .isolation_level(postgres::transaction::IsolationLevel::Serializable) - .read_only(false) + .read_only(false), )?; self.drop_old_tables(&transaction)?; @@ -236,21 +279,29 @@ impl<'s> DatabaseSink<'s> { fn drop_old_tables(&self, transaction: &Transaction) -> Result<(), Error> { for statement in &[ // listing view - format!("DROP VIEW IF EXISTS {schema}.{view_name};", - schema = self.database_settings.schema, - view_name = self.database_settings.listing_view), + format!( + "DROP VIEW IF EXISTS {schema}.{view_name};", + schema = self.database_settings.schema, + view_name = self.database_settings.listing_view + ), // unit table - format!("DROP TABLE IF EXISTS {schema}.{table};", - schema = self.database_settings.schema, - table = self.database_settings.unit_table), + format!( + "DROP TABLE IF EXISTS {schema}.{table};", + schema = self.database_settings.schema, + table = self.database_settings.unit_table + ), // dataset table - format!("DROP TABLE IF EXISTS {schema}.{table};", - schema = self.database_settings.schema, - table = self.database_settings.dataset_table), + format!( + "DROP TABLE IF EXISTS {schema}.{table};", + schema = self.database_settings.schema, + table = self.database_settings.dataset_table + ), // translation table - format!("DROP TABLE IF EXISTS {schema}.{table}_translation;", - schema = self.database_settings.schema, - table = self.database_settings.dataset_table), + format!( + "DROP TABLE IF EXISTS {schema}.{table}_translation;", + schema = self.database_settings.schema, + table = 
self.database_settings.dataset_table + ), ] { transaction.execute(statement, &[])?; } @@ -262,20 +313,26 @@ impl<'s> DatabaseSink<'s> { fn rename_temporary_tables(&self, transaction: &Transaction) -> Result<(), Error> { for statement in &[ // unit table - format!("ALTER TABLE {schema}.{temp_table} RENAME TO {table};", - schema = self.database_settings.schema, - temp_table = self.database_settings.temp_unit_table, - table = self.database_settings.unit_table), + format!( + "ALTER TABLE {schema}.{temp_table} RENAME TO {table};", + schema = self.database_settings.schema, + temp_table = self.database_settings.temp_unit_table, + table = self.database_settings.unit_table + ), // dataset table - format!("ALTER TABLE {schema}.{temp_table} RENAME TO {table};", - schema = self.database_settings.schema, - temp_table = self.database_settings.temp_dataset_table, - table = self.database_settings.dataset_table), + format!( + "ALTER TABLE {schema}.{temp_table} RENAME TO {table};", + schema = self.database_settings.schema, + temp_table = self.database_settings.temp_dataset_table, + table = self.database_settings.dataset_table + ), // translation table - format!("ALTER TABLE {schema}.{temp_table}_translation RENAME TO {table}_translation;", - schema = self.database_settings.schema, - temp_table = self.database_settings.temp_dataset_table, - table = self.database_settings.dataset_table), + format!( + "ALTER TABLE {schema}.{temp_table}_translation RENAME TO {table}_translation;", + schema = self.database_settings.schema, + temp_table = self.database_settings.temp_dataset_table, + table = self.database_settings.dataset_table + ), ] { transaction.execute(statement, &[])?; } @@ -287,19 +344,23 @@ impl<'s> DatabaseSink<'s> { fn rename_constraints_and_indexes(&self, transaction: &Transaction) -> Result<(), Error> { for statement in &[ // foreign key - format!("ALTER TABLE {schema}.{table} \ - RENAME CONSTRAINT {temp_prefix}_{temp_suffix}_fk TO {prefix}_{suffix}_fk;", - schema = &self.database_settings.schema, - table = &self.database_settings.unit_table, - temp_prefix = &self.database_settings.temp_unit_table, - temp_suffix = &self.database_settings.dataset_id_column, - prefix = &self.database_settings.unit_table, - suffix = &self.database_settings.dataset_id_column), + format!( + "ALTER TABLE {schema}.{table} \ + RENAME CONSTRAINT {temp_prefix}_{temp_suffix}_fk TO {prefix}_{suffix}_fk;", + schema = &self.database_settings.schema, + table = &self.database_settings.unit_table, + temp_prefix = &self.database_settings.temp_unit_table, + temp_suffix = &self.database_settings.dataset_id_column, + prefix = &self.database_settings.unit_table, + suffix = &self.database_settings.dataset_id_column + ), // index - format!("ALTER INDEX {schema}.{temp_index}_idx RENAME TO {index}_idx;", - schema = &self.database_settings.schema, - temp_index = &self.database_settings.temp_unit_table, - index = &self.database_settings.unit_table), + format!( + "ALTER INDEX {schema}.{temp_index}_idx RENAME TO {index}_idx;", + schema = &self.database_settings.schema, + temp_index = &self.database_settings.temp_unit_table, + index = &self.database_settings.unit_table + ), ] { transaction.execute(statement, &[])?; } @@ -311,8 +372,8 @@ impl<'s> DatabaseSink<'s> { fn create_indexes_and_statistics(&mut self) -> Result<(), Error> { let foreign_key_statement = format!( "ALTER TABLE {schema}.{unit_table} \ - ADD CONSTRAINT {unit_table}_{dataset_id}_fk \ - FOREIGN KEY ({dataset_id}) REFERENCES {schema}.{dataset_table}({dataset_id});", + ADD CONSTRAINT 
{unit_table}_{dataset_id}_fk \ + FOREIGN KEY ({dataset_id}) REFERENCES {schema}.{dataset_table}({dataset_id});", schema = &self.database_settings.schema, unit_table = &self.database_settings.temp_unit_table, dataset_id = &self.database_settings.dataset_id_column, @@ -321,7 +382,10 @@ impl<'s> DatabaseSink<'s> { debug!("{}", &foreign_key_statement); self.connection.execute(&foreign_key_statement, &[])?; let mut hasher = sha1::Sha1::new(); - let indexed_unit_column_names = self.database_settings.unit_indexed_columns.iter() + let indexed_unit_column_names = self + .database_settings + .unit_indexed_columns + .iter() .map(|field| { hasher.reset(); hasher.update(field.as_bytes()); @@ -330,7 +394,7 @@ impl<'s> DatabaseSink<'s> { .collect::>(); let unit_index_statement = format!( "CREATE INDEX {unit_table}_idx ON {schema}.{unit_table} \ - USING btree ({dataset_id}, \"{other}\");", + USING btree ({dataset_id}, \"{other}\");", schema = &self.database_settings.schema, unit_table = &self.database_settings.temp_unit_table, dataset_id = &self.database_settings.dataset_id_column, @@ -364,8 +428,7 @@ impl<'s> DatabaseSink<'s> { } /// Create view that provides a listing view - pub fn create_listing_view(&self, - transaction: &Transaction) -> Result<(), Error> { + pub fn create_listing_view(&self, transaction: &Transaction) -> Result<(), Error> { // TODO: replace full names with settings call let mut hasher = sha1::Sha1::new(); @@ -450,12 +513,14 @@ impl<'s> DatabaseSink<'s> { } /// Insert the dataset metadata into the temporary schema - fn insert_dataset_metadata(database_settings: &settings::Database, - connection: &Connection, - dataset_fields: &[String], - dataset_fields_hash: &[String], - abcd_data: &AbcdResult, - id: u32) -> Result<(), Error> { + fn insert_dataset_metadata( + database_settings: &settings::Database, + connection: &Connection, + dataset_fields: &[String], + dataset_fields_hash: &[String], + abcd_data: &AbcdResult, + id: u32, + ) -> Result<(), Error> { let mut values = WriterBuilder::new() .terminator(csv::Terminator::Any(b'\n')) .delimiter(b'\t') @@ -497,10 +562,7 @@ impl<'s> DatabaseSink<'s> { // dbg!(String::from_utf8_lossy(value_string.as_slice())); let statement = connection.prepare(©_statement)?; - statement.copy_in( - &[], - &mut value_string.as_slice(), - )?; + statement.copy_in(&[], &mut value_string.as_slice())?; Ok(()) } @@ -544,11 +606,8 @@ impl<'s> DatabaseSink<'s> { ); let statement = self.connection.prepare(©_statement)?; -// dbg!(&value_string); - statement.copy_in( - &[], - &mut values.into_inner()?.as_slice(), - )?; + // dbg!(&value_string); + statement.copy_in(&[], &mut values.into_inner()?.as_slice())?; Ok(()) } diff --git a/src/main.rs b/src/main.rs index 1a97dc5..12ff45b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,7 @@ use simplelog::{CombinedLogger, SharedLogger, TermLogger, WriteLogger}; use settings::Settings; -use crate::abcd_fields::load_abcd_fields; +use crate::abcd_fields::AbcdFields; use crate::abcd_parser::AbcdParser; use crate::archive_reader::ArchiveReader; use crate::bms_datasets::download_datasets; @@ -61,8 +61,8 @@ fn main() { } }; - let abcd_fields = match load_abcd_fields(Path::new(&settings.abcd.fields_file)) { - Ok(map) => map, + let abcd_fields = match AbcdFields::from_path(Path::new(&settings.abcd.fields_file)) { + Ok(fields) => fields, Err(e) => { error!("Unable to load ABCD file: {}", e); return; // stop program From a132cf9b2008f10e591ca216c1d55f24da11c659 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Mon, 27 May 2019 
16:30:27 +0200 Subject: [PATCH 04/31] added more default fields to settings.toml --- settings.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/settings.toml b/settings.toml index 77e0223..951707f 100644 --- a/settings.toml +++ b/settings.toml @@ -22,14 +22,14 @@ database = "" user = "" password = "" schema = "" -dataset_table = "" -temp_dataset_table = "" +dataset_table = "abcd_datasets" +temp_dataset_table = "abcd_datasets_temp" dataset_id_column = "dataset_id" dataset_path_column = "dataset_path" dataset_landing_page_column = "dataset_landing_page" dataset_provider_column = "dataset_provider" -unit_table = "" -temp_unit_table = "" +unit_table = "abcd_units" +temp_unit_table = "abcd_units_temp" listing_view = "dataset_listing" unit_indexed_columns = [ "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LongitudeDecimal", From 3dc30a8733d9f0252b877162bf44a6323814b756 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Mon, 27 May 2019 17:21:05 +0200 Subject: [PATCH 05/31] created tests for `ArchiveReader` --- src/archive_reader.rs | 101 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/src/archive_reader.rs b/src/archive_reader.rs index 4025392..14c9558 100644 --- a/src/archive_reader.rs +++ b/src/archive_reader.rs @@ -1,28 +1,23 @@ use failure::Error; use std::fs::File; use std::io::BufReader; -use zip::ZipArchive; use std::io::Read; use std::path::Path; +use zip::ZipArchive; /// This struct provides a reader that processes a stream of XML files in a ZIP archive. pub struct ArchiveReader { archive: ZipArchive>, -// archive_name: String, } impl ArchiveReader { /// Create an `ArchiveReader` from a path to a ZIP archive. pub fn from_path(path: &Path) -> Result { -// let archive_name = path.display().to_string(); - let file = File::open(path)?; let reader = BufReader::new(file); let archive = ZipArchive::new(reader)?; - Ok(Self { - archive, - }) + Ok(Self { archive }) } /// Creates an iterator that traverses over all XML files in the ZIP archive. @@ -35,7 +30,7 @@ impl ArchiveReader { } } -/// This iterator traverses over all XML files in the ZIP archive. +/// This iterator traverses over all files (bytes) in the ZIP archive. pub struct ArchiveReaderBytesIter<'a> { index: usize, end: usize, @@ -64,10 +59,96 @@ impl<'a> Iterator for ArchiveReaderBytesIter<'a> { } } -/// Read the `index`th XML file from a ZIP archive. -fn read_contents_of_file(archive: &mut ZipArchive>, index: usize) -> Result, Error> { +/// Read the `index`th file from a ZIP archive. 
+fn read_contents_of_file( + archive: &mut ZipArchive>, + index: usize, +) -> Result, Error> { let mut inner_file = archive.by_index(index)?; let mut content = Vec::new(); inner_file.read_to_end(&mut content)?; Ok(content) } + +#[cfg(test)] +mod test { + use super::*; + + use std::io::Write; + use tempfile::{NamedTempFile, TempPath}; + + #[test] + fn read_simple_zip_file() { + let path = create_zip_file(&[MockFile { + name: "Test".into(), + content: "Foobar".into(), + }]); + + let mut reader = ArchiveReader::from_path(&path).expect("Cannot read file."); + let mut archive_iter = reader.bytes_iter(); + + let file = archive_iter + .next() + .expect("Missing first file") + .expect("Unable to read first file"); + + assert_eq!(file, b"Foobar"); + + assert!(archive_iter.next().is_none()); + } + + #[test] + fn read_multiple_files_in_zip_file() { + let path = create_zip_file(&[ + MockFile { + name: "Test".into(), + content: "Foo".into(), + }, + MockFile { + name: "Test2".into(), + content: "Bar".into(), + }, + ]); + + let mut reader = ArchiveReader::from_path(&path).expect("Cannot read file."); + let archive_iter = reader.bytes_iter(); + + let mut number_of_files = 0; + let mut contents = Vec::>::new(); + for bytes in archive_iter.map(Result::unwrap) { + number_of_files += 1; + contents.push(bytes); + } + + assert_eq!(number_of_files, 2); + assert_eq!(contents, vec![b"Foo", b"Bar"]); + } + + struct MockFile { + pub name: String, + pub content: String, + } + + fn create_zip_file(files: &[MockFile]) -> TempPath { + let mut file = NamedTempFile::new().expect("Unable to create file to test."); + + { + let mut zip_writer = zip::ZipWriter::new(&mut file); + + let options = zip::write::FileOptions::default() + .compression_method(zip::CompressionMethod::Stored); + for file in files { + zip_writer + .start_file(file.name.as_str(), options) + .expect("Unable to start file in zip archive."); + zip_writer + .write_all(file.content.as_bytes()) + .expect("Unable to write file in zip archive."); + } + + zip_writer.finish().expect("Unable to finish zip archive."); + } + + file.into_temp_path() + } +} From 5ce0df30843fa6f6b1f0982ab64b52bd578d23e6 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Mon, 27 May 2019 17:27:07 +0200 Subject: [PATCH 06/31] refactored DatasetContainsNoFileError to DatasetContainsNoFile because it should indicate that it contains no file and not that it contains no error --- src/bms_datasets.rs | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/src/bms_datasets.rs b/src/bms_datasets.rs index b47473c..e5ec2d3 100644 --- a/src/bms_datasets.rs +++ b/src/bms_datasets.rs @@ -1,12 +1,12 @@ +use crate::bms_providers::BmsProvider; +use crate::settings::Settings; use failure::Error; use failure::Fail; -use std::path::Path; -use std::fs::File; use serde::{Deserialize, Serialize}; -use std::path::PathBuf; +use std::fs::File; use std::io::BufWriter; -use crate::bms_providers::BmsProvider; -use crate::settings::Settings; +use std::path::Path; +use std::path::PathBuf; /// This struct contains dataset information from the BMS #[derive(Debug, Deserialize, Serialize)] @@ -37,14 +37,19 @@ pub struct BmsLandingPage { impl BmsDataset { /// Retrieve the archive with the latest flag from a BMS archive. 
- pub fn get_latest_archive(&self) -> Result<&BmsXmlArchive, DatasetContainsNoFileError> { - self.xml_archives.iter() + pub fn get_latest_archive(&self) -> Result<&BmsXmlArchive, DatasetContainsNoFile> { + self.xml_archives + .iter() .find(|archive| archive.latest) // get latest archive version - .ok_or_else(|| DatasetContainsNoFileError::new(&self.dataset)) + .ok_or_else(|| DatasetContainsNoFile::new(&self.dataset)) } /// Call the landing page generator from the BMS and return the resulting url string. - pub fn get_landing_page(&self, settings: &Settings, providers: &BmsProvider) -> Result { + pub fn get_landing_page( + &self, + settings: &Settings, + providers: &BmsProvider, + ) -> Result { reqwest::Client::new() .get(&format!( "{}&provider={}&dsa={}", @@ -59,12 +64,7 @@ impl BmsDataset { /// This function downloads a list of dataset information from the BMS. pub fn load_bms_datasets(url: &str) -> Result, Error> { - Ok( - reqwest::Client::new() - .get(url) - .send()? - .json()? - ) + Ok(reqwest::Client::new().get(url).send()?.json()?) } /// This struct combines dataset information and a path to the downloaded archive file. @@ -84,7 +84,10 @@ impl<'d> DownloadedBmsDataset<'d> { /// Download all datasets into a given temporary directory. /// This function returns an iterator over `DownloadedBmsDataset`. -pub fn download_datasets<'d, 't>(temp_dir: &'t Path, datasets: &'d [BmsDataset]) -> impl Iterator, Error>> + 'd { +pub fn download_datasets<'d, 't>( + temp_dir: &'t Path, + datasets: &'d [BmsDataset], +) -> impl Iterator, Error>> + 'd { let temp_dir = temp_dir.to_path_buf(); datasets.iter().enumerate().map(move |(i, dataset)| { let url = dataset.get_latest_archive()?.xml_archive.clone(); @@ -96,11 +99,11 @@ pub fn download_datasets<'d, 't>(temp_dir: &'t Path, datasets: &'d [BmsDataset]) /// This error occurs when it is not possible to download a dataset archive. #[derive(Debug, Fail)] #[fail(display = "Dataset {} contains no file to download.", dataset)] -pub struct DatasetContainsNoFileError { +pub struct DatasetContainsNoFile { dataset: String, } -impl DatasetContainsNoFileError { +impl DatasetContainsNoFile { /// Create a new `DatasetContainsNoFileError` from a dataset name. pub fn new(dataset: &str) -> Self { Self { @@ -110,7 +113,11 @@ impl DatasetContainsNoFileError { } /// Download a dataset (the latest) into the given file path. 
-pub fn download_dataset(url: String, download_file_path: PathBuf, dataset: &BmsDataset) -> Result { +pub fn download_dataset( + url: String, + download_file_path: PathBuf, + dataset: &BmsDataset, +) -> Result { let mut response = reqwest::get(&url)?; let output = File::create(&download_file_path)?; From 86d7bc3032748ee27de0f7efa1069947275abe23 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Tue, 28 May 2019 16:06:38 +0200 Subject: [PATCH 07/31] refactored function name --- src/abcd_fields.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/abcd_fields.rs b/src/abcd_fields.rs index f96712f..ba74ddd 100644 --- a/src/abcd_fields.rs +++ b/src/abcd_fields.rs @@ -73,7 +73,7 @@ mod tests { #[test] fn simple_file() { - let path = test_file_path(); + let path = create_test_file_path(); let abcd_fields = AbcdFields::from_path(&path).expect("Unable to deserialize input."); @@ -105,7 +105,7 @@ mod tests { #[test] fn iterate_values() { - let path = test_file_path(); + let path = create_test_file_path(); let abcd_fields = AbcdFields::from_path(&path).expect("Unable to deserialize input."); @@ -117,7 +117,7 @@ mod tests { assert_eq!(number_of_fields, 2); } - fn test_file_path() -> TempPath { + fn create_test_file_path() -> TempPath { create_temp_file( r#"[ { From 874a989470bff8a9a475e86a2f78e6d09d7e32c6 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 29 May 2019 14:22:52 +0200 Subject: [PATCH 08/31] tests for abcd parser --- src/abcd_parser.rs | 173 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/src/abcd_parser.rs b/src/abcd_parser.rs index 484dea0..d471e46 100644 --- a/src/abcd_parser.rs +++ b/src/abcd_parser.rs @@ -199,3 +199,176 @@ impl AbcdResult { #[derive(Debug, Default, Fail)] #[fail(display = "ABCD file contains no dataset metadata.")] struct AbcdContainsNoDatasetMetadata {} + +#[cfg(test)] +mod tests { + use crate::test_utils; + + use super::*; + + const TECHNICAL_CONTACT_NAME: &str = "TECHNICAL CONTACT NAME"; + const DESCRIPTION_TITLE: &str = "DESCRIPTION TITLE"; + const UNIT_ID: &str = "UNIT ID"; + const UNIT_LONGITUDE: f64 = 10.911; + const UNIT_LATITUDE: f64 = 49.911; + const UNIT_SPATIAL_DATUM: &str = "TECHNICAL WGS84 EMAIL"; + + #[test] + fn simple_file() { + let abcd_fields = create_abcd_fields(); + let test_file = create_file_as_bytes(); + + let mut parser = AbcdParser::new(&abcd_fields); + + let dataset_path = "dataset_path"; + let landing_page = "landing_page"; + let provider_id = "provider_id"; + + let result = parser + .parse(dataset_path, landing_page, provider_id, &test_file) + .expect("Unable to parse bytes"); + + assert_eq!(result.dataset_path, dataset_path); + assert_eq!(result.landing_page, landing_page); + assert_eq!(result.provider_id, provider_id); + + assert_eq!( + Some(&VatType::Textual(TECHNICAL_CONTACT_NAME.into())), + result + .dataset + .get("/DataSets/DataSet/TechnicalContacts/TechnicalContact/Name") + ); + assert_eq!( + Some(&VatType::Textual(DESCRIPTION_TITLE.into())), + result + .dataset + .get("/DataSets/DataSet/Metadata/Description/Representation/Title") + ); + + assert_eq!(result.units.len(), 1); + + let unit = result.units.get(0).unwrap(); + + assert_eq!( + Some(&VatType::Textual(UNIT_ID.into())), + unit.get("/DataSets/DataSet/Units/Unit/UnitID") + ); + assert_eq!( + Some(&VatType::Textual(UNIT_SPATIAL_DATUM.into())), + unit.get("/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/SpatialDatum") + ); + + if let 
(Some(&VatType::Numeric(longitude)), Some(&VatType::Numeric(latitude))) = ( + unit.get("/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LongitudeDecimal"), + unit.get("/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LatitudeDecimal") + ) { + assert!(f64::abs(longitude - UNIT_LONGITUDE) < 0.01); + assert!(f64::abs(latitude - UNIT_LATITUDE) < 0.01); + } + } + + fn create_file_as_bytes() -> Vec { + format!( + r#" + + + + + + {TECHNICAL_CONTACT_NAME} + + + + + + {DESCRIPTION_TITLE} + + + + + + {UNIT_ID} + + + + + {UNIT_LONGITUDE} + {UNIT_LATITUDE} + {UNIT_SPATIAL_DATUM} + + + + + + + + + "#, + TECHNICAL_CONTACT_NAME = TECHNICAL_CONTACT_NAME, + DESCRIPTION_TITLE = DESCRIPTION_TITLE, + UNIT_ID = UNIT_ID, + UNIT_LONGITUDE = UNIT_LONGITUDE, + UNIT_LATITUDE = UNIT_LATITUDE, + UNIT_SPATIAL_DATUM = UNIT_SPATIAL_DATUM, + ).into_bytes() + } + + fn create_abcd_fields() -> AbcdFields { + let fields_file = test_utils::create_temp_file( + r#"[ + { + "name": "/DataSets/DataSet/TechnicalContacts/TechnicalContact/Name", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Metadata/Description/Representation/Title", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Units/Unit/UnitID", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LongitudeDecimal", + "numeric": true, + "vatMandatory": true, + "gfbioMandatory": true, + "globalField": false, + "unit": "°" + }, + { + "name": "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LatitudeDecimal", + "numeric": true, + "vatMandatory": true, + "gfbioMandatory": true, + "globalField": false, + "unit": "°" + }, + { + "name": "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/SpatialDatum", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + } + ]"#, + ); + + AbcdFields::from_path(&fields_file).expect("Unable to create ABCD Fields Spec") + } +} From b61399855374628b7db0a120c5bedb7525767706 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 29 May 2019 14:23:31 +0200 Subject: [PATCH 09/31] moved mockito to dev dependencies --- Cargo.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 1c80e9f..33815ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,6 @@ config = { version = "0.9", features = ["toml"] } failure = "0.1" failure_derive = "0.1" log = "0.4" -mockito = "0.17.1" postgres = { version = "0.15", features = ['with-openssl'] } quick-xml = "0.13" reqwest = "0.9" @@ -21,3 +20,6 @@ sha1 = "0.6" simplelog = "0.5" tempfile = "3.0" zip = "0.5" + +[dev-dependencies] +mockito = "0.17.1" From cdc072f00c44d1cc24fc4ed7b6bedb64019717c1 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 29 May 2019 14:25:33 +0200 Subject: [PATCH 10/31] refactored settings --- .gitignore | 2 +- settings.toml => settings-default.toml | 0 src/settings.rs | 35 +++++++++++++++++++++++--- 3 files changed, 32 insertions(+), 5 deletions(-) rename settings.toml => settings-default.toml (100%) diff --git a/.gitignore b/.gitignore index 
9a5685a..82bb985 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ # Settings for debug -settings-example.toml +settings.toml vat_abcd_crawler.log # Created by https://www.gitignore.io/api/rust,clion+all diff --git a/settings.toml b/settings-default.toml similarity index 100% rename from settings.toml rename to settings-default.toml diff --git a/src/settings.rs b/src/settings.rs index b93e459..ae6b3c6 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -1,8 +1,9 @@ +use std::path::Path; + use config::Config; -use config::File; use config::ConfigError; +use config::File; use serde::Deserialize; -use std::path::Path; #[derive(Debug, Deserialize)] pub struct General { @@ -60,10 +61,36 @@ pub struct Settings { } impl Settings { - pub fn new(path: &Path) -> Result { + pub fn new(path: Option<&Path>) -> Result { let mut s = Config::new(); - s.merge(File::from(path))?; + s.merge(File::from(Path::new("settings-default.toml")))?; + s.merge(File::from(Path::new("settings.toml")))?; + if let Some(path) = path { + s.merge(File::from(path))?; + } s.try_into() } } + +#[cfg(test)] +mod test { + use crate::test_utils; + + use super::*; + + #[test] + fn load_file() { + let path = test_utils::create_temp_file_with_suffix( + ".toml", + r#" + [general] + debug = true + "#, + ); + + let settings = Settings::new(Some(&path)).expect("Unable to load settings."); + + assert!(settings.general.debug); + } +} From f6009e0c00a3bb711c8eab15dd2300999a81db7f Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 29 May 2019 14:25:57 +0200 Subject: [PATCH 11/31] added `PartialEq` to `VatType` --- src/vat_type.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vat_type.rs b/src/vat_type.rs index 6a83b8d..890926a 100644 --- a/src/vat_type.rs +++ b/src/vat_type.rs @@ -1,8 +1,8 @@ -use std::fmt; use std::borrow::Cow; +use std::fmt; /// This enum represents the VAT data types. 
-#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub enum VatType { Textual(String), Numeric(f64), From 7b9609692f9e519575f32f6ae3ac0e1f61af91d3 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 29 May 2019 14:26:11 +0200 Subject: [PATCH 12/31] created test utils --- src/test_utils.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 src/test_utils.rs diff --git a/src/test_utils.rs b/src/test_utils.rs new file mode 100644 index 0000000..bfe8b35 --- /dev/null +++ b/src/test_utils.rs @@ -0,0 +1,30 @@ +use std::io::Write; + +use mockito::{mock, Mock}; +use tempfile::TempPath; + +pub fn create_temp_file(content: &str) -> TempPath { + create_temp_file_with_suffix("", content) +} + +pub fn create_temp_file_with_suffix(suffix: &str, content: &str) -> TempPath { + let mut file = tempfile::Builder::new() + .suffix(suffix) + .tempfile() + .expect("Unable to create test file."); + + write!(file, "{}", content).expect("Unable to write content to test file."); + + file.into_temp_path() +} + +pub fn create_json_webserver(json_string: &str) -> Mock { + mock("GET", "/") + .with_header("content-type", "application/json") + .with_body(json_string) + .create() +} + +pub fn webserver_url() -> String { + mockito::server_url() +} From 5ea1f5c5e619b84e07e72aff7be7b49f333853d7 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 29 May 2019 14:26:52 +0200 Subject: [PATCH 13/31] use test_utils in tests --- src/abcd_fields.rs | 14 +++----------- src/{bms_providers.rs => bms/providers.rs} | 16 ++++++---------- 2 files changed, 9 insertions(+), 21 deletions(-) rename src/{bms_providers.rs => bms/providers.rs} (85%) diff --git a/src/abcd_fields.rs b/src/abcd_fields.rs index ba74ddd..68f0fce 100644 --- a/src/abcd_fields.rs +++ b/src/abcd_fields.rs @@ -65,9 +65,9 @@ impl<'a> IntoIterator for &'a AbcdFields { #[cfg(test)] mod tests { - use std::io::Write; + use tempfile::TempPath; - use tempfile::{NamedTempFile, TempPath}; + use crate::test_utils; use super::*; @@ -118,7 +118,7 @@ mod tests { } fn create_test_file_path() -> TempPath { - create_temp_file( + test_utils::create_temp_file( r#"[ { "name": "/DataSets/DataSet/DatasetGUID", @@ -139,12 +139,4 @@ mod tests { ]"#, ) } - - fn create_temp_file(content: &str) -> TempPath { - let mut file = NamedTempFile::new().expect("Unable to create file to test."); - - write!(file, "{}", content).expect("Unable to write content to test file."); - - file.into_temp_path() - } } diff --git a/src/bms_providers.rs b/src/bms/providers.rs similarity index 85% rename from src/bms_providers.rs rename to src/bms/providers.rs index f9735cd..5c8fe99 100644 --- a/src/bms_providers.rs +++ b/src/bms/providers.rs @@ -1,6 +1,7 @@ +use std::collections::HashMap; + use failure::Error; use serde::Deserialize; -use std::collections::HashMap; /// This struct contains all provider information. /// The identifier is the `url`, strange as it seems. 
@@ -37,12 +38,13 @@ impl BmsProviders { #[cfg(test)] mod tests { + use crate::test_utils; + use super::*; - use mockito::{mock, Mock}; #[test] fn downloads_providers() { - let _webserver = create_json_webserver(r#" + let _webserver = test_utils::create_json_webserver(r#" [ { "id": "6", @@ -61,7 +63,7 @@ mod tests { ]"# ); - let bms_providers = match BmsProviders::from_url(&mockito::server_url()) { + let bms_providers = match BmsProviders::from_url(&test_utils::webserver_url()) { Ok(providers) => providers, Err(error) => panic!(error), }; @@ -77,10 +79,4 @@ mod tests { assert!(bms_providers.value_of("").is_none()); } - fn create_json_webserver(json_string: &str) -> Mock { - mock("GET", "/") - .with_header("content-type", "application/json") - .with_body(json_string) - .create() - } } From 425e0abab4c58853c9f61f7c2f92c4e742421cf3 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 29 May 2019 14:27:18 +0200 Subject: [PATCH 14/31] moved bms functions to module --- src/{bms_datasets.rs => bms/datasets.rs} | 2 +- src/bms/mod.rs | 9 +++ src/main.rs | 93 +++++++++++++----------- 3 files changed, 61 insertions(+), 43 deletions(-) rename src/{bms_datasets.rs => bms/datasets.rs} (99%) create mode 100644 src/bms/mod.rs diff --git a/src/bms_datasets.rs b/src/bms/datasets.rs similarity index 99% rename from src/bms_datasets.rs rename to src/bms/datasets.rs index e5ec2d3..52f9859 100644 --- a/src/bms_datasets.rs +++ b/src/bms/datasets.rs @@ -1,4 +1,4 @@ -use crate::bms_providers::BmsProvider; +use crate::bms::BmsProvider; use crate::settings::Settings; use failure::Error; use failure::Fail; diff --git a/src/bms/mod.rs b/src/bms/mod.rs new file mode 100644 index 0000000..b127ad6 --- /dev/null +++ b/src/bms/mod.rs @@ -0,0 +1,9 @@ +mod datasets; +mod downloader; +mod providers; + +pub use self::datasets::{ + download_datasets, load_bms_datasets, BmsDataset, BmsLandingPage, BmsXmlArchive, + DownloadedBmsDataset, +}; +pub use self::providers::{BmsProvider, BmsProviders}; diff --git a/src/main.rs b/src/main.rs index 12ff45b..7f4ab1e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,56 +11,28 @@ use settings::Settings; use crate::abcd_fields::AbcdFields; use crate::abcd_parser::AbcdParser; use crate::archive_reader::ArchiveReader; -use crate::bms_datasets::download_datasets; -use crate::bms_datasets::load_bms_datasets; -use crate::bms_providers::BmsProviders; +use crate::bms::load_bms_datasets; +use crate::bms::BmsProviders; +use crate::bms::{download_datasets, BmsDataset}; use crate::database_sink::DatabaseSink; mod abcd_fields; mod abcd_parser; mod abcd_version; mod archive_reader; -mod bms_datasets; -mod bms_providers; +mod bms; mod database_sink; mod settings; +#[cfg(test)] +mod test_utils; mod vat_type; fn main() { - let matches = App::new("VAT ABCD Crawler") - .version(crate_version!()) - .author(crate_authors!()) - .about(crate_description!()) - .arg( - Arg::with_name("settings") - .index(1) - .short("s") - .long("settings") - .value_name("SETTINGS") - .help("Specify the settings file") - .required(true) - .takes_value(true), - ) - .get_matches(); - - let settings_path = Path::new( - matches - .value_of("settings") - .expect("There must be a settings path specified."), - ); - let settings = Settings::new(settings_path).expect("Unable to use config file."); + let settings = initialize_settings().expect("Unable to load settings file."); initialize_logger(Path::new(&settings.general.log_file), &settings) .expect("Unable to initialize logger."); - let temp_dir = match tempfile::tempdir() { - 
Ok(dir) => dir, - Err(e) => { - error!("Unable to create temporary directory: {}", e); - return; // stop program - } - }; - let abcd_fields = match AbcdFields::from_path(Path::new(&settings.abcd.fields_file)) { Ok(fields) => fields, Err(e) => { @@ -93,6 +65,26 @@ fn main() { } }; + if let Err(e) = process_datasets( + &settings, + &abcd_fields, + &mut database_sink, + bms_providers, + &bms_datasets, + ) { + error!("Error processing datasets: {}", e); + }; +} + +fn process_datasets( + settings: &Settings, + abcd_fields: &AbcdFields, + database_sink: &mut DatabaseSink, + bms_providers: BmsProviders, + bms_datasets: &Vec, +) -> Result<(), Error> { + let temp_dir = tempfile::tempdir()?; + let mut abcd_parser = AbcdParser::new(&abcd_fields); for path_result in download_datasets(temp_dir.path(), &bms_datasets) @@ -164,10 +156,6 @@ fn main() { } }; - // let mut string = String::from_utf8(xml_bytes).unwrap(); - // string.truncate(200); - // dbg!(string); - let abcd_data = match abcd_parser.parse( &download.url, &landing_page, @@ -182,9 +170,6 @@ fn main() { }; trace!("{:?}", abcd_data.dataset); - // for unit in abcd_data.units { - // trace!("{:?}", unit); - // } match database_sink.insert_dataset(&abcd_data) { Ok(_) => (), @@ -197,6 +182,30 @@ fn main() { Ok(_) => info!("Schema migration complete."), Err(e) => warn!("Unable to migrate schema: {}", e), }; + + Ok(()) +} + +fn initialize_settings() -> Result { + let matches = App::new("VAT ABCD Crawler") + .version(crate_version!()) + .author(crate_authors!()) + .about(crate_description!()) + .arg( + Arg::with_name("settings") + .index(1) + .short("s") + .long("settings") + .value_name("SETTINGS") + .help("Specify the settings file") + .required(true) + .takes_value(true), + ) + .get_matches(); + + let settings_path = matches.value_of("settings").map(Path::new); + + Ok(Settings::new(settings_path)?) } /// Initialize the logger. 
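The `process_datasets` loop introduced above windows the dataset iterator with `skip`/`take`, using `Option::filter` so that `debug.dataset_start` and `debug.dataset_limit` only apply while `general.debug` is set. A minimal standalone sketch of that idiom follows; the `window` helper and its integer data are hypothetical stand-ins for the crawler's settings and datasets, not part of the patch series itself.

// Debug-only windowing: `start`/`limit` are ignored unless `debug` is true,
// mirroring the `skip`/`take` calls in `process_datasets`.
fn window(items: &[i32], debug: bool, start: Option<usize>, limit: Option<usize>) -> Vec<i32> {
    items
        .iter()
        .skip(start.filter(|_| debug).unwrap_or(std::usize::MIN))
        .take(limit.filter(|_| debug).unwrap_or(std::usize::MAX))
        .cloned()
        .collect()
}

fn main() {
    let items = [1, 2, 3, 4, 5];
    // With debugging enabled, the [start, start + limit) window is honored.
    assert_eq!(window(&items, true, Some(1), Some(2)), vec![2, 3]);
    // Without debugging, both bounds collapse to `None` and everything is processed.
    assert_eq!(window(&items, false, Some(1), Some(2)), vec![1, 2, 3, 4, 5]);
}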
From 282277bb349c3065a0e1292700c25b2c46f6fe77 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 29 May 2019 14:36:03 +0200 Subject: [PATCH 15/31] moved abcd stuff to module --- src/{ => abcd}/abcd_fields.rs | 0 src/{ => abcd}/abcd_parser.rs | 3 +-- src/{ => abcd}/abcd_version.rs | 0 src/{ => abcd}/archive_reader.rs | 0 src/abcd/mod.rs | 9 +++++++++ src/database_sink.rs | 4 +--- src/main.rs | 13 +++---------- 7 files changed, 14 insertions(+), 15 deletions(-) rename src/{ => abcd}/abcd_fields.rs (100%) rename src/{ => abcd}/abcd_parser.rs (99%) rename src/{ => abcd}/abcd_version.rs (100%) rename src/{ => abcd}/archive_reader.rs (100%) create mode 100644 src/abcd/mod.rs diff --git a/src/abcd_fields.rs b/src/abcd/abcd_fields.rs similarity index 100% rename from src/abcd_fields.rs rename to src/abcd/abcd_fields.rs diff --git a/src/abcd_parser.rs b/src/abcd/abcd_parser.rs similarity index 99% rename from src/abcd_parser.rs rename to src/abcd/abcd_parser.rs index d471e46..874a9eb 100644 --- a/src/abcd_parser.rs +++ b/src/abcd/abcd_parser.rs @@ -5,8 +5,7 @@ use failure::Fail; use quick_xml::events::Event; use quick_xml::Reader; -use crate::abcd_fields::AbcdFields; -use crate::abcd_version::AbcdVersion; +use crate::abcd::{AbcdFields, AbcdVersion}; use crate::vat_type::VatType; pub type ValueMap = HashMap; diff --git a/src/abcd_version.rs b/src/abcd/abcd_version.rs similarity index 100% rename from src/abcd_version.rs rename to src/abcd/abcd_version.rs diff --git a/src/archive_reader.rs b/src/abcd/archive_reader.rs similarity index 100% rename from src/archive_reader.rs rename to src/abcd/archive_reader.rs diff --git a/src/abcd/mod.rs b/src/abcd/mod.rs new file mode 100644 index 0000000..d097e71 --- /dev/null +++ b/src/abcd/mod.rs @@ -0,0 +1,9 @@ +mod abcd_fields; +mod abcd_parser; +mod abcd_version; +mod archive_reader; + +pub use self::abcd_fields::{AbcdField, AbcdFields}; +pub use self::abcd_parser::{AbcdParser, AbcdResult, ValueMap}; +pub use self::abcd_version::AbcdVersion; +pub use self::archive_reader::ArchiveReader; diff --git a/src/database_sink.rs b/src/database_sink.rs index 7397f76..05f3823 100644 --- a/src/database_sink.rs +++ b/src/database_sink.rs @@ -10,9 +10,7 @@ use postgres::tls::openssl::OpenSsl; use postgres::transaction::Transaction; use postgres::{Connection, TlsMode}; -use crate::abcd_fields::AbcdFields; -use crate::abcd_parser::AbcdResult; -use crate::abcd_parser::ValueMap; +use crate::abcd::{AbcdFields, AbcdResult, ValueMap}; use crate::settings; const POSTGRES_CSV_CONFIGURATION: &str = diff --git a/src/main.rs b/src/main.rs index 7f4ab1e..08d9d94 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,18 +8,11 @@ use simplelog::{CombinedLogger, SharedLogger, TermLogger, WriteLogger}; use settings::Settings; -use crate::abcd_fields::AbcdFields; -use crate::abcd_parser::AbcdParser; -use crate::archive_reader::ArchiveReader; -use crate::bms::load_bms_datasets; -use crate::bms::BmsProviders; -use crate::bms::{download_datasets, BmsDataset}; +use crate::abcd::{AbcdFields, AbcdParser, ArchiveReader}; +use crate::bms::{download_datasets, load_bms_datasets, BmsDataset, BmsProviders}; use crate::database_sink::DatabaseSink; -mod abcd_fields; -mod abcd_parser; -mod abcd_version; -mod archive_reader; +mod abcd; mod bms; mod database_sink; mod settings; From 889fb7db2aeae7667c7bc7120c55707ecc2c286c Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Tue, 4 Jun 2019 14:18:18 +0200 Subject: [PATCH 16/31] tests for `BmsDataset`s --- src/bms/datasets.rs | 128 
+++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 7 deletions(-) diff --git a/src/bms/datasets.rs b/src/bms/datasets.rs index 52f9859..7043e24 100644 --- a/src/bms/datasets.rs +++ b/src/bms/datasets.rs @@ -1,15 +1,17 @@ -use crate::bms::BmsProvider; -use crate::settings::Settings; -use failure::Error; -use failure::Fail; -use serde::{Deserialize, Serialize}; use std::fs::File; use std::io::BufWriter; use std::path::Path; use std::path::PathBuf; +use failure::Error; +use failure::Fail; +use serde::{Deserialize, Serialize}; + +use crate::bms::BmsProvider; +use crate::settings::Settings; + /// This struct contains dataset information from the BMS -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Deserialize, Serialize, PartialEq)] pub struct BmsDataset { pub provider_datacenter: String, pub provider_url: String, @@ -19,7 +21,7 @@ pub struct BmsDataset { } /// This struct contains archive download information for a BMS dataset. -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Deserialize, Serialize, PartialEq)] pub struct BmsXmlArchive { pub id: String, pub xml_archive: String, @@ -128,3 +130,115 @@ pub fn download_dataset( Ok(DownloadedBmsDataset::new(dataset, download_file_path, url)) } + +#[cfg(test)] +mod tests { + use std::io::Read; + + use crate::test_utils; + + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn download_dataset_metadata() { + let bms_provider_datacenter = "provider_datacenter"; + let bms_provider_url = "provider_url"; + let bms_dsa = "dsa"; + let bms_dataset = "dataset"; + let xml_archive_id = "xml_archive_id"; + let xml_archive_xml_archive = "xml_archive_xml_archive"; + let xml_archive_latest = true; + + let _webserver = test_utils::create_json_webserver(&format!( + r#" + [ + {{ + "provider_datacenter": "{provider_datacenter}", + "provider_url": "{provider_url}", + "dsa": "{dsa}", + "dataset": "{dataset}", + "xml_archives": [ + {{ + "id": "{xml_archive_id}", + "xml_archive": "{xml_archive_xml_archive}", + "latest": {xml_archive_latest} + }} + ] + }} + ] + "#, + provider_datacenter = bms_provider_datacenter, + provider_url = bms_provider_url, + dsa = bms_dsa, + dataset = bms_dataset, + xml_archive_id = xml_archive_id, + xml_archive_xml_archive = xml_archive_xml_archive, + xml_archive_latest = xml_archive_latest, + )); + + let datasets = load_bms_datasets(&test_utils::webserver_url()).unwrap(); + + assert_eq!(datasets.len(), 1); + + let dataset = datasets.get(0).unwrap(); + + assert_eq!(dataset.provider_datacenter, bms_provider_datacenter); + assert_eq!(dataset.provider_url, bms_provider_url); + assert_eq!(dataset.dsa, bms_dsa); + assert_eq!(dataset.dataset, bms_dataset); + + let latest_archive = dataset.get_latest_archive().unwrap(); + + assert_eq!(latest_archive.id, xml_archive_id); + assert_eq!(latest_archive.xml_archive, xml_archive_xml_archive); + assert_eq!(latest_archive.latest, xml_archive_latest); + } + + #[test] + fn retrieve_a_landing_page() { + unimplemented!(); + } + + #[test] + fn download_a_dataset() { + let bms_provider_datacenter = "provider_datacenter"; + let bms_provider_url = "provider_url"; + let bms_dsa = "dsa"; + let bms_dataset = "dataset"; + let xml_archive_id = "xml_archive_id"; + let xml_archive_xml_archive = "xml_archive_xml_archive"; + let xml_archive_latest = true; + + let test_file = "abcde"; + + let temp_file = NamedTempFile::new().unwrap().into_temp_path(); + + let _webserver = test_utils::create_json_webserver(test_file); + let webserver_url = test_utils::webserver_url(); + + 
let bms_dataset = BmsDataset { + provider_datacenter: bms_provider_datacenter.into(), + provider_url: bms_provider_url.into(), + dsa: bms_dsa.into(), + dataset: bms_dataset.into(), + xml_archives: vec![BmsXmlArchive { + id: xml_archive_id.into(), + xml_archive: xml_archive_xml_archive.into(), + latest: xml_archive_latest, + }], + }; + + let downloaded_dataset = + download_dataset(webserver_url.clone(), temp_file.to_path_buf(), &bms_dataset).unwrap(); + + assert_eq!(downloaded_dataset.dataset, &bms_dataset); + assert_eq!(downloaded_dataset.url, webserver_url); + + let mut bytes = Vec::new(); + let mut file = File::open(downloaded_dataset.path).unwrap(); + file.read_to_end(&mut bytes).unwrap(); + + assert_eq!(String::from_utf8(bytes).unwrap().as_str(), test_file); + } +} From 59c407283f7f91568437d7e92407c8f74aec6bc0 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 5 Jun 2019 09:01:59 +0200 Subject: [PATCH 17/31] deleted bms and added pangaea functionality --- settings-default.toml | 10 +- src/bms/datasets.rs | 244 ----------------------------------- src/bms/mod.rs | 9 -- src/bms/providers.rs | 82 ------------ src/main.rs | 2 +- src/pangaea/downloader.rs | 0 src/pangaea/mod.rs | 2 + src/pangaea/search_result.rs | 243 ++++++++++++++++++++++++++++++++++ 8 files changed, 252 insertions(+), 340 deletions(-) delete mode 100644 src/bms/datasets.rs delete mode 100644 src/bms/mod.rs delete mode 100644 src/bms/providers.rs create mode 100644 src/pangaea/downloader.rs create mode 100644 src/pangaea/mod.rs create mode 100644 src/pangaea/search_result.rs diff --git a/settings-default.toml b/settings-default.toml index 951707f..3e69cab 100644 --- a/settings-default.toml +++ b/settings-default.toml @@ -9,10 +9,12 @@ dataset_limit = 3 [abcd] fields_file = "abcd-fields.json" -[bms] -monitor_url = "http://bms.gfbio.org/services/xml-archives/?provider=&dsa=" -provider_url = "https://bms.gfbio.org/services/providers/?provider=&name=" -landing_page_url = "http://bms.gfbio.org/services/landingpages/?output=json" +[pangaea] +search_url = "https://ws.pangaea.de/es/dataportal-gfbio/pansimple/_search" +scroll_url = "https://ws.pangaea.de/es/dataportal-gfbio/pansimple/_search/scroll" + +[terminology_service] +landingpage_url = "https://terminologies.gfbio.org/tools/landingpages/landingpage.php" [database] host = "localhost" diff --git a/src/bms/datasets.rs b/src/bms/datasets.rs deleted file mode 100644 index 7043e24..0000000 --- a/src/bms/datasets.rs +++ /dev/null @@ -1,244 +0,0 @@ -use std::fs::File; -use std::io::BufWriter; -use std::path::Path; -use std::path::PathBuf; - -use failure::Error; -use failure::Fail; -use serde::{Deserialize, Serialize}; - -use crate::bms::BmsProvider; -use crate::settings::Settings; - -/// This struct contains dataset information from the BMS -#[derive(Debug, Deserialize, Serialize, PartialEq)] -pub struct BmsDataset { - pub provider_datacenter: String, - pub provider_url: String, - pub dsa: String, - pub dataset: String, - pub xml_archives: Vec, -} - -/// This struct contains archive download information for a BMS dataset. -#[derive(Debug, Deserialize, Serialize, PartialEq)] -pub struct BmsXmlArchive { - pub id: String, - pub xml_archive: String, - pub latest: bool, -} - -/// This struct reflects the result of a BMS landing page generator request. 
-#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct BmsLandingPage { - provider: String, - data_set: String, - data_unit: String, -} - -impl BmsDataset { - /// Retrieve the archive with the latest flag from a BMS archive. - pub fn get_latest_archive(&self) -> Result<&BmsXmlArchive, DatasetContainsNoFile> { - self.xml_archives - .iter() - .find(|archive| archive.latest) // get latest archive version - .ok_or_else(|| DatasetContainsNoFile::new(&self.dataset)) - } - - /// Call the landing page generator from the BMS and return the resulting url string. - pub fn get_landing_page( - &self, - settings: &Settings, - providers: &BmsProvider, - ) -> Result { - reqwest::Client::new() - .get(&format!( - "{}&provider={}&dsa={}", - &settings.bms.landing_page_url, providers.id, self.dsa - )) - .send()? - .json::() - .map(|bms_landing_page| bms_landing_page.data_set) - .map_err(|e| e.into()) - } -} - -/// This function downloads a list of dataset information from the BMS. -pub fn load_bms_datasets(url: &str) -> Result, Error> { - Ok(reqwest::Client::new().get(url).send()?.json()?) -} - -/// This struct combines dataset information and a path to the downloaded archive file. -#[derive(Debug)] -pub struct DownloadedBmsDataset<'d> { - pub dataset: &'d BmsDataset, - pub path: PathBuf, - pub url: String, -} - -impl<'d> DownloadedBmsDataset<'d> { - /// Create a new descriptor for a downloaded BMS dataset. - pub fn new(dataset: &'d BmsDataset, path: PathBuf, url: String) -> Self { - Self { dataset, path, url } - } -} - -/// Download all datasets into a given temporary directory. -/// This function returns an iterator over `DownloadedBmsDataset`. -pub fn download_datasets<'d, 't>( - temp_dir: &'t Path, - datasets: &'d [BmsDataset], -) -> impl Iterator, Error>> + 'd { - let temp_dir = temp_dir.to_path_buf(); - datasets.iter().enumerate().map(move |(i, dataset)| { - let url = dataset.get_latest_archive()?.xml_archive.clone(); - let download_file_path = temp_dir.join(Path::new(&format!("{}.zip", i))); - download_dataset(url, download_file_path, dataset) - }) -} - -/// This error occurs when it is not possible to download a dataset archive. -#[derive(Debug, Fail)] -#[fail(display = "Dataset {} contains no file to download.", dataset)] -pub struct DatasetContainsNoFile { - dataset: String, -} - -impl DatasetContainsNoFile { - /// Create a new `DatasetContainsNoFileError` from a dataset name. - pub fn new(dataset: &str) -> Self { - Self { - dataset: dataset.to_string(), - } - } -} - -/// Download a dataset (the latest) into the given file path. 
-pub fn download_dataset( - url: String, - download_file_path: PathBuf, - dataset: &BmsDataset, -) -> Result { - let mut response = reqwest::get(&url)?; - - let output = File::create(&download_file_path)?; - - // copy file to temp path - let mut writer = BufWriter::new(&output); - std::io::copy(&mut response, &mut writer)?; - - Ok(DownloadedBmsDataset::new(dataset, download_file_path, url)) -} - -#[cfg(test)] -mod tests { - use std::io::Read; - - use crate::test_utils; - - use super::*; - use tempfile::NamedTempFile; - - #[test] - fn download_dataset_metadata() { - let bms_provider_datacenter = "provider_datacenter"; - let bms_provider_url = "provider_url"; - let bms_dsa = "dsa"; - let bms_dataset = "dataset"; - let xml_archive_id = "xml_archive_id"; - let xml_archive_xml_archive = "xml_archive_xml_archive"; - let xml_archive_latest = true; - - let _webserver = test_utils::create_json_webserver(&format!( - r#" - [ - {{ - "provider_datacenter": "{provider_datacenter}", - "provider_url": "{provider_url}", - "dsa": "{dsa}", - "dataset": "{dataset}", - "xml_archives": [ - {{ - "id": "{xml_archive_id}", - "xml_archive": "{xml_archive_xml_archive}", - "latest": {xml_archive_latest} - }} - ] - }} - ] - "#, - provider_datacenter = bms_provider_datacenter, - provider_url = bms_provider_url, - dsa = bms_dsa, - dataset = bms_dataset, - xml_archive_id = xml_archive_id, - xml_archive_xml_archive = xml_archive_xml_archive, - xml_archive_latest = xml_archive_latest, - )); - - let datasets = load_bms_datasets(&test_utils::webserver_url()).unwrap(); - - assert_eq!(datasets.len(), 1); - - let dataset = datasets.get(0).unwrap(); - - assert_eq!(dataset.provider_datacenter, bms_provider_datacenter); - assert_eq!(dataset.provider_url, bms_provider_url); - assert_eq!(dataset.dsa, bms_dsa); - assert_eq!(dataset.dataset, bms_dataset); - - let latest_archive = dataset.get_latest_archive().unwrap(); - - assert_eq!(latest_archive.id, xml_archive_id); - assert_eq!(latest_archive.xml_archive, xml_archive_xml_archive); - assert_eq!(latest_archive.latest, xml_archive_latest); - } - - #[test] - fn retrieve_a_landing_page() { - unimplemented!(); - } - - #[test] - fn download_a_dataset() { - let bms_provider_datacenter = "provider_datacenter"; - let bms_provider_url = "provider_url"; - let bms_dsa = "dsa"; - let bms_dataset = "dataset"; - let xml_archive_id = "xml_archive_id"; - let xml_archive_xml_archive = "xml_archive_xml_archive"; - let xml_archive_latest = true; - - let test_file = "abcde"; - - let temp_file = NamedTempFile::new().unwrap().into_temp_path(); - - let _webserver = test_utils::create_json_webserver(test_file); - let webserver_url = test_utils::webserver_url(); - - let bms_dataset = BmsDataset { - provider_datacenter: bms_provider_datacenter.into(), - provider_url: bms_provider_url.into(), - dsa: bms_dsa.into(), - dataset: bms_dataset.into(), - xml_archives: vec![BmsXmlArchive { - id: xml_archive_id.into(), - xml_archive: xml_archive_xml_archive.into(), - latest: xml_archive_latest, - }], - }; - - let downloaded_dataset = - download_dataset(webserver_url.clone(), temp_file.to_path_buf(), &bms_dataset).unwrap(); - - assert_eq!(downloaded_dataset.dataset, &bms_dataset); - assert_eq!(downloaded_dataset.url, webserver_url); - - let mut bytes = Vec::new(); - let mut file = File::open(downloaded_dataset.path).unwrap(); - file.read_to_end(&mut bytes).unwrap(); - - assert_eq!(String::from_utf8(bytes).unwrap().as_str(), test_file); - } -} diff --git a/src/bms/mod.rs b/src/bms/mod.rs deleted file mode 100644 index 
b127ad6..0000000 --- a/src/bms/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -mod datasets; -mod downloader; -mod providers; - -pub use self::datasets::{ - download_datasets, load_bms_datasets, BmsDataset, BmsLandingPage, BmsXmlArchive, - DownloadedBmsDataset, -}; -pub use self::providers::{BmsProvider, BmsProviders}; diff --git a/src/bms/providers.rs b/src/bms/providers.rs deleted file mode 100644 index 5c8fe99..0000000 --- a/src/bms/providers.rs +++ /dev/null @@ -1,82 +0,0 @@ -use std::collections::HashMap; - -use failure::Error; -use serde::Deserialize; - -/// This struct contains all provider information. -/// The identifier is the `url`, strange as it seems. -#[derive(Debug, Deserialize)] -pub struct BmsProvider { - pub id: String, - pub shortname: String, - pub name: String, - pub url: String, - pub biocase_url: String, -} - -#[derive(Debug)] -pub struct BmsProviders { - providers: HashMap, -} - -impl BmsProviders { - pub fn from_url(url: &str) -> Result { - let providers: Vec = reqwest::Client::new().get(url).send()?.json()?; - let provider_map = providers - .into_iter() - .map(|provider| (provider.url.clone(), provider)) - .collect(); - Ok(Self { - providers: provider_map, - }) - } - - pub fn value_of(&self, url: &str) -> Option<&BmsProvider> { - self.providers.get(url) - } -} - -#[cfg(test)] -mod tests { - use crate::test_utils; - - use super::*; - - #[test] - fn downloads_providers() { - let _webserver = test_utils::create_json_webserver(r#" - [ - { - "id": "6", - "shortname": "BGBM", - "name": "Botanic Garden and Botanical Museum Berlin, Freie Universit\u00e4t Berlin", - "url": "www.bgbm.org", - "biocase_url": "https:\/\/ww3.bgbm.org\/biocase\/" - }, - { - "id": "5", - "shortname": "DSMZ", - "name": "Leibniz Institute DSMZ \u2013 German Collection of Microorganisms and Cell Cultures, Braunschweig", - "url": "www.dsmz.de", - "biocase_url": "http:\/\/biocase.dsmz.de\/wrappers\/biocase" - } - ]"# - ); - - let bms_providers = match BmsProviders::from_url(&test_utils::webserver_url()) { - Ok(providers) => providers, - Err(error) => panic!(error), - }; - - let bgbm = bms_providers.value_of("www.bgbm.org"); - assert!(bgbm.is_some()); - assert_eq!(bgbm.unwrap().id, "6"); - - let dsmz = bms_providers.value_of("www.dsmz.de"); - assert!(dsmz.is_some()); - assert_eq!(dsmz.unwrap().id, "5"); - - assert!(bms_providers.value_of("").is_none()); - } - -} diff --git a/src/main.rs b/src/main.rs index 08d9d94..3d4ea10 100644 --- a/src/main.rs +++ b/src/main.rs @@ -13,8 +13,8 @@ use crate::bms::{download_datasets, load_bms_datasets, BmsDataset, BmsProviders} use crate::database_sink::DatabaseSink; mod abcd; -mod bms; mod database_sink; +mod pangaea; mod settings; #[cfg(test)] mod test_utils; diff --git a/src/pangaea/downloader.rs b/src/pangaea/downloader.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/pangaea/mod.rs b/src/pangaea/mod.rs new file mode 100644 index 0000000..b3e98fc --- /dev/null +++ b/src/pangaea/mod.rs @@ -0,0 +1,2 @@ +mod downloader; +mod search_result; diff --git a/src/pangaea/search_result.rs b/src/pangaea/search_result.rs new file mode 100644 index 0000000..adad66e --- /dev/null +++ b/src/pangaea/search_result.rs @@ -0,0 +1,243 @@ +use failure::Error; +use serde::Deserialize; + +#[derive(Clone, Debug, Deserialize, PartialEq)] +pub struct SearchResult { + #[serde(rename = "_scroll_id")] + scroll_id: String, + hits: SearchResultHits, +} + +#[derive(Clone, Debug, Deserialize, PartialEq)] +struct SearchResultHits { + total: u64, + hits: Vec, +} + +#[derive(Clone, Debug, 
Deserialize, PartialEq)] +struct SearchResultEntry { + #[serde(rename = "_id")] + id: String, + #[serde(rename = "_source")] + source: SearchResultEntrySource, +} + +#[derive(Clone, Debug, Deserialize, PartialEq)] +struct SearchResultEntrySource { + citation_publisher: String, + datalink: String, +} + +impl SearchResult { + fn from_url(url: &str) -> Result { + const SCROLL_TIMEOUT: &str = "1m"; + reqwest::Client::new() + .get(&format!( + "{url}?scroll={scroll}", + url = url, + scroll = SCROLL_TIMEOUT, + )) + .json( + r#"{ + "query": { + "bool": { + "filter": [ + { + "term": { + "internal-source": "gfbio-abcd-collections" + } + }, + { + "match_phrase": { + "type": "ABCD_Dataset" + } + }, + { + "term": { + "accessRestricted": false + } + } + ] + } + } + }"#, + ) + .send()? + .json::() + .map_err(|e| e.into()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::test_utils; + + const CITATION_PUBLISHER: &str = "Test Publisher"; + const CITATION_PUBLISHER_2: &str = "Test Publisher"; + const DATALINK: &str = "https://foobar.de"; + const DATALINK_2: &str = "https://foobar2.de"; + const RESULT_ID: &str = "test_id"; + const RESULT_ID_2: &str = "test_id"; + const SEARCH_RESULT_HITS: u64 = 64; + const SCROLL_ID: &str = "SCROLL_ID_SCROLL_ID"; + + const SEARCH_RESULT_ENTRY_SOURCE_JSON: fn() -> String = || { + format!( + r#" + {{ + "citation_publisher": "{citation_publisher}", + "datalink": "{datalink}" + }} + "#, + citation_publisher = CITATION_PUBLISHER, + datalink = DATALINK, + ) + }; + const SEARCH_RESULT_ENTRY_SOURCE_JSON_2: fn() -> String = || { + format!( + r#" + {{ + "citation_publisher": "{citation_publisher}", + "datalink": "{datalink}" + }} + "#, + citation_publisher = CITATION_PUBLISHER_2, + datalink = DATALINK_2, + ) + }; + const SEARCH_RESULT_ENTRY_JSON: fn() -> String = || { + format!( + r#" + {{ + "_id": "{test_id}", + "_source": {source} + }} + "#, + test_id = RESULT_ID, + source = SEARCH_RESULT_ENTRY_SOURCE_JSON(), + ) + }; + const SEARCH_RESULT_ENTRY_JSON_2: fn() -> String = || { + format!( + r#" + {{ + "_id": "{test_id}", + "_source": {source} + }} + "#, + test_id = RESULT_ID_2, + source = SEARCH_RESULT_ENTRY_SOURCE_JSON_2(), + ) + }; + const SEARCH_RESULT_HITS_JSON: fn() -> String = || { + format!( + r#" + {{ + "total": {hits}, + "max_score": 1.0, + "hits": [ + {r1}, + {r2} + ] + }} + "#, + hits = SEARCH_RESULT_HITS, + r1 = SEARCH_RESULT_ENTRY_JSON(), + r2 = SEARCH_RESULT_ENTRY_JSON_2(), + ) + }; + const SEARCH_RESULT_JSON: fn() -> String = || { + format!( + r#" + {{ + "_scroll_id": "{scroll_id}", + "took": 1373, + "hits": {hits} + }} + "#, + scroll_id = SCROLL_ID, + hits = SEARCH_RESULT_HITS_JSON(), + ) + }; + + #[test] + fn parse_search_result_entry_source() { + let search_result_entry_source = + serde_json::from_str::(&SEARCH_RESULT_ENTRY_SOURCE_JSON()) + .unwrap(); + + assert_eq!( + search_result_entry_source, + SearchResultEntrySource { + citation_publisher: CITATION_PUBLISHER.into(), + datalink: DATALINK.into(), + } + ) + } + + #[test] + fn parse_search_result_entry() { + let search_result_entry = + serde_json::from_str::(&SEARCH_RESULT_ENTRY_JSON()).unwrap(); + + assert_eq!( + search_result_entry, + SearchResultEntry { + id: RESULT_ID.to_string(), + source: SearchResultEntrySource { + citation_publisher: CITATION_PUBLISHER.into(), + datalink: DATALINK.into(), + }, + } + ) + } + + #[test] + fn parse_search_result_hits() { + let search_result_hits = + serde_json::from_str::(&SEARCH_RESULT_HITS_JSON()).unwrap(); + + assert_eq!( + search_result_hits, + 
SearchResultHits { + total: SEARCH_RESULT_HITS, + hits: vec![ + SearchResultEntry { + id: RESULT_ID.to_string(), + source: SearchResultEntrySource { + citation_publisher: CITATION_PUBLISHER.into(), + datalink: DATALINK.into(), + }, + }, + SearchResultEntry { + id: RESULT_ID_2.to_string(), + source: SearchResultEntrySource { + citation_publisher: CITATION_PUBLISHER_2.into(), + datalink: DATALINK_2.into(), + }, + }, + ], + } + ) + } + + #[test] + fn parse_search_result() { + let search_result = serde_json::from_str::(&SEARCH_RESULT_JSON()).unwrap(); + + assert_eq!(search_result.scroll_id, SCROLL_ID); + assert_eq!(search_result.hits.hits.len(), 2); + } + + #[test] + fn parse_webserver_result() { + let _webserver = test_utils::create_json_webserver(&SEARCH_RESULT_JSON()); + + let search_result = SearchResult::from_url(&test_utils::webserver_url()).unwrap(); + + assert_eq!(search_result.scroll_id, SCROLL_ID); + assert_eq!(search_result.hits.hits.len(), 2); + } +} From 857564b5d604d000fcde3f7eecd62017cc43039f Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Thu, 6 Jun 2019 11:50:13 +0200 Subject: [PATCH 18/31] added pangaea search tool --- src/pangaea/search_result.rs | 310 ++++++++++++++--------- src/settings.rs | 15 +- src/{test_utils.rs => test_utils/mod.rs} | 16 +- src/test_utils/webserver.rs | 35 +++ tests/learning/mockito.rs | 62 +++++ tests/learning/mod.rs | 1 + tests/learning_tests.rs | 1 + 7 files changed, 309 insertions(+), 131 deletions(-) rename src/{test_utils.rs => test_utils/mod.rs} (63%) create mode 100644 src/test_utils/webserver.rs create mode 100644 tests/learning/mockito.rs create mode 100644 tests/learning/mod.rs create mode 100644 tests/learning_tests.rs diff --git a/src/pangaea/search_result.rs b/src/pangaea/search_result.rs index adad66e..314d29c 100644 --- a/src/pangaea/search_result.rs +++ b/src/pangaea/search_result.rs @@ -1,5 +1,7 @@ use failure::Error; use serde::Deserialize; +use serde_json::json; +use std::collections::HashMap; #[derive(Clone, Debug, Deserialize, PartialEq)] pub struct SearchResult { @@ -15,7 +17,7 @@ struct SearchResultHits { } #[derive(Clone, Debug, Deserialize, PartialEq)] -struct SearchResultEntry { +pub struct SearchResultEntry { #[serde(rename = "_id")] id: String, #[serde(rename = "_source")] @@ -29,144 +31,149 @@ struct SearchResultEntrySource { } impl SearchResult { + const SCROLL_TIMEOUT: &'static str = "1m"; + fn from_url(url: &str) -> Result { - const SCROLL_TIMEOUT: &str = "1m"; + let body = json!({ + "query": { + "bool": { + "filter": [ + { + "term": { + "internal-source": "gfbio-abcd-collections" + } + }, + { + "match_phrase": { + "type": "ABCD_Dataset" + } + }, + { + "term": { + "accessRestricted": false + } + } + ] + } + } + }); + reqwest::Client::new() - .get(&format!( - "{url}?scroll={scroll}", + .post(&format!( + "{url}?scroll={scroll_timeout}", url = url, - scroll = SCROLL_TIMEOUT, + scroll_timeout = Self::SCROLL_TIMEOUT, )) - .json( - r#"{ - "query": { - "bool": { - "filter": [ - { - "term": { - "internal-source": "gfbio-abcd-collections" - } - }, - { - "match_phrase": { - "type": "ABCD_Dataset" - } - }, - { - "term": { - "accessRestricted": false - } - } - ] - } - } - }"#, - ) + .json(&body) + .send()? + .json::() + .map_err(|e| e.into()) + } + + fn from_scroll_url(url: &str, scroll_id: &str) -> Result { + let mut body = HashMap::new(); + body.insert("scroll", Self::SCROLL_TIMEOUT); + body.insert("scroll_id", scroll_id); + + reqwest::Client::new() + .post(url) + .json(&body) .send()? 
.json::() .map_err(|e| e.into()) } + + pub fn retrieve_all_entries( + query_url: &str, + scroll_url: &str, + ) -> Result, Error> { + let mut entries = Vec::new(); + + eprintln!("Start {}", query_url); + + let mut result = Self::from_url(query_url)?; + + eprintln!("First request {}, Hits: {}", scroll_url, result.hits.total); + + while result.hits.total > 0 { + entries.append(&mut result.hits.hits); + + result = Self::from_scroll_url(scroll_url, &result.scroll_id)?; + + eprintln!("Another request --> Hits: {} --> -->", result.hits.total); + } + + entries.append(&mut result.hits.hits); + + Ok(entries) + } } #[cfg(test)] mod tests { use super::*; - use crate::test_utils; + use crate::test_utils::MockWebserver; + use serde_json::Value as JsonValue; const CITATION_PUBLISHER: &str = "Test Publisher"; - const CITATION_PUBLISHER_2: &str = "Test Publisher"; + const CITATION_PUBLISHER_2: &str = "Test Publisher 2"; const DATALINK: &str = "https://foobar.de"; const DATALINK_2: &str = "https://foobar2.de"; const RESULT_ID: &str = "test_id"; - const RESULT_ID_2: &str = "test_id"; + const RESULT_ID_2: &str = "test_id_2"; const SEARCH_RESULT_HITS: u64 = 64; const SCROLL_ID: &str = "SCROLL_ID_SCROLL_ID"; + const SCROLL_ID_2: &str = "SCROLL_ID_SCROLL_ID_2"; - const SEARCH_RESULT_ENTRY_SOURCE_JSON: fn() -> String = || { - format!( - r#" - {{ - "citation_publisher": "{citation_publisher}", - "datalink": "{datalink}" - }} - "#, - citation_publisher = CITATION_PUBLISHER, - datalink = DATALINK, - ) + const SEARCH_RESULT_ENTRY_SOURCE_JSON: fn() -> JsonValue = || { + json!({ + "citation_publisher": CITATION_PUBLISHER, + "datalink": DATALINK, + }) }; - const SEARCH_RESULT_ENTRY_SOURCE_JSON_2: fn() -> String = || { - format!( - r#" - {{ - "citation_publisher": "{citation_publisher}", - "datalink": "{datalink}" - }} - "#, - citation_publisher = CITATION_PUBLISHER_2, - datalink = DATALINK_2, - ) + const SEARCH_RESULT_ENTRY_SOURCE_JSON_2: fn() -> JsonValue = || { + json!({ + "citation_publisher": CITATION_PUBLISHER_2, + "datalink": DATALINK_2, + }) }; - const SEARCH_RESULT_ENTRY_JSON: fn() -> String = || { - format!( - r#" - {{ - "_id": "{test_id}", - "_source": {source} - }} - "#, - test_id = RESULT_ID, - source = SEARCH_RESULT_ENTRY_SOURCE_JSON(), - ) + const SEARCH_RESULT_ENTRY_JSON: fn() -> JsonValue = || { + json!({ + "_id": RESULT_ID, + "_source": SEARCH_RESULT_ENTRY_SOURCE_JSON(), + }) }; - const SEARCH_RESULT_ENTRY_JSON_2: fn() -> String = || { - format!( - r#" - {{ - "_id": "{test_id}", - "_source": {source} - }} - "#, - test_id = RESULT_ID_2, - source = SEARCH_RESULT_ENTRY_SOURCE_JSON_2(), - ) + const SEARCH_RESULT_ENTRY_JSON_2: fn() -> JsonValue = || { + json!({ + "_id": RESULT_ID_2, + "_source": SEARCH_RESULT_ENTRY_SOURCE_JSON_2(), + }) }; - const SEARCH_RESULT_HITS_JSON: fn() -> String = || { - format!( - r#" - {{ - "total": {hits}, - "max_score": 1.0, - "hits": [ - {r1}, - {r2} - ] - }} - "#, - hits = SEARCH_RESULT_HITS, - r1 = SEARCH_RESULT_ENTRY_JSON(), - r2 = SEARCH_RESULT_ENTRY_JSON_2(), - ) + const SEARCH_RESULT_HITS_JSON: fn() -> JsonValue = || { + json!({ + "total": SEARCH_RESULT_HITS, + "max_score": 1.0, + "hits": [ + SEARCH_RESULT_ENTRY_JSON(), + SEARCH_RESULT_ENTRY_JSON_2(), + ], + }) }; - const SEARCH_RESULT_JSON: fn() -> String = || { - format!( - r#" - {{ - "_scroll_id": "{scroll_id}", - "took": 1373, - "hits": {hits} - }} - "#, - scroll_id = SCROLL_ID, - hits = SEARCH_RESULT_HITS_JSON(), - ) + const SEARCH_RESULT_JSON: fn() -> JsonValue = || { + json!({ + "_scroll_id": SCROLL_ID, + "took": 
1373, + "hits": SEARCH_RESULT_HITS_JSON(), + }) }; #[test] fn parse_search_result_entry_source() { - let search_result_entry_source = - serde_json::from_str::(&SEARCH_RESULT_ENTRY_SOURCE_JSON()) - .unwrap(); + let search_result_entry_source = serde_json::from_str::( + &SEARCH_RESULT_ENTRY_SOURCE_JSON().to_string(), + ) + .unwrap(); assert_eq!( search_result_entry_source, @@ -180,7 +187,8 @@ mod tests { #[test] fn parse_search_result_entry() { let search_result_entry = - serde_json::from_str::(&SEARCH_RESULT_ENTRY_JSON()).unwrap(); + serde_json::from_str::(&SEARCH_RESULT_ENTRY_JSON().to_string()) + .unwrap(); assert_eq!( search_result_entry, @@ -197,7 +205,8 @@ mod tests { #[test] fn parse_search_result_hits() { let search_result_hits = - serde_json::from_str::(&SEARCH_RESULT_HITS_JSON()).unwrap(); + serde_json::from_str::(&SEARCH_RESULT_HITS_JSON().to_string()) + .unwrap(); assert_eq!( search_result_hits, @@ -220,12 +229,13 @@ mod tests { }, ], } - ) + ); } #[test] fn parse_search_result() { - let search_result = serde_json::from_str::(&SEARCH_RESULT_JSON()).unwrap(); + let search_result = + serde_json::from_str::(&SEARCH_RESULT_JSON().to_string()).unwrap(); assert_eq!(search_result.scroll_id, SCROLL_ID); assert_eq!(search_result.hits.hits.len(), 2); @@ -233,11 +243,83 @@ mod tests { #[test] fn parse_webserver_result() { - let _webserver = test_utils::create_json_webserver(&SEARCH_RESULT_JSON()); + let webserver = MockWebserver::from_json( + &format!("/?scroll={}", SearchResult::SCROLL_TIMEOUT), + "POST", + &SEARCH_RESULT_JSON().to_string(), + ); - let search_result = SearchResult::from_url(&test_utils::webserver_url()).unwrap(); + let search_result = SearchResult::from_url(&webserver.webserver_root_url()).unwrap(); assert_eq!(search_result.scroll_id, SCROLL_ID); assert_eq!(search_result.hits.hits.len(), 2); } + + #[test] + fn parse_scroll_result() { + let webserver = MockWebserver::from_json("/", "POST", &SEARCH_RESULT_JSON().to_string()); + + let search_result = + SearchResult::from_scroll_url(&webserver.webserver_root_url(), SCROLL_ID).unwrap(); + + assert_eq!(search_result.scroll_id, SCROLL_ID); + assert_eq!(search_result.hits.hits.len(), 2); + } + + #[test] + fn collect_multiple_request_data() { + let _m1 = + MockWebserver::from_json("/?scroll=1m", "POST", &SEARCH_RESULT_JSON().to_string()); + let _m2 = MockWebserver::from_json_with_json_condition( + "/scroll", + "POST", + &json!({ + "scroll" : SearchResult::SCROLL_TIMEOUT, + "scroll_id" : SCROLL_ID, + }) + .to_string(), + &json!({ + "_scroll_id": SCROLL_ID_2, + "took": 1373, + "hits": { + "total": SEARCH_RESULT_HITS, // <-- CONTINUE + "hits": [ + SEARCH_RESULT_ENTRY_JSON(), + SEARCH_RESULT_ENTRY_JSON_2(), + ], + }, + }) + .to_string(), + ); + let _m3 = MockWebserver::from_json_with_json_condition( + "/scroll", + "POST", + &json!({ + "scroll" : SearchResult::SCROLL_TIMEOUT, + "scroll_id" : SCROLL_ID_2, + }) + .to_string(), + &json!({ + "_scroll_id": SCROLL_ID_2, + "took": 1373, + "hits": { + "total": 0, // <-- NO CONTINUE + "hits": [ + SEARCH_RESULT_ENTRY_JSON(), + ], + }, + }) + .to_string(), + ); + + assert_eq!(_m2.webserver_root_url(), _m3.webserver_root_url()); + + let entries = SearchResult::retrieve_all_entries( + &_m1.webserver_root_url(), + &format!("{}/scroll", _m2.webserver_root_url()), + ) + .unwrap(); + + assert_eq!(5, entries.len()); + } } diff --git a/src/settings.rs b/src/settings.rs index ae6b3c6..16cfd86 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -17,10 +17,14 @@ pub struct Abcd { } #[derive(Debug, 
Deserialize)] -pub struct Bms { - pub monitor_url: String, - pub provider_url: String, - pub landing_page_url: String, +pub struct Pangaea { + pub search_url: String, + pub scroll_url: String, +} + +#[derive(Debug, Deserialize)] +pub struct TerminologyService { + pub landingpage_url: String, } #[derive(Debug, Deserialize)] @@ -54,7 +58,8 @@ pub struct Debug { #[derive(Debug, Deserialize)] pub struct Settings { pub abcd: Abcd, - pub bms: Bms, + pub pangaea: Pangaea, + pub terminology_service: TerminologyService, pub database: Database, pub debug: Debug, pub general: General, diff --git a/src/test_utils.rs b/src/test_utils/mod.rs similarity index 63% rename from src/test_utils.rs rename to src/test_utils/mod.rs index bfe8b35..8450d94 100644 --- a/src/test_utils.rs +++ b/src/test_utils/mod.rs @@ -1,8 +1,11 @@ +mod webserver; + use std::io::Write; -use mockito::{mock, Mock}; use tempfile::TempPath; +pub use self::webserver::MockWebserver; + pub fn create_temp_file(content: &str) -> TempPath { create_temp_file_with_suffix("", content) } @@ -17,14 +20,3 @@ pub fn create_temp_file_with_suffix(suffix: &str, content: &str) -> TempPath { file.into_temp_path() } - -pub fn create_json_webserver(json_string: &str) -> Mock { - mock("GET", "/") - .with_header("content-type", "application/json") - .with_body(json_string) - .create() -} - -pub fn webserver_url() -> String { - mockito::server_url() -} diff --git a/src/test_utils/webserver.rs b/src/test_utils/webserver.rs new file mode 100644 index 0000000..036be69 --- /dev/null +++ b/src/test_utils/webserver.rs @@ -0,0 +1,35 @@ +use mockito::{mock, Matcher, Mock}; + +pub struct MockWebserver { + _mock: Mock, +} + +impl MockWebserver { + pub fn from_json(path: &str, method: &str, json_string: &str) -> Self { + Self { + _mock: mock(method, path) + .with_header("content-type", "application/json") + .with_body(json_string) + .create(), + } + } + + pub fn from_json_with_json_condition( + path: &str, + method: &str, + json_condition: &str, + json_result: &str, + ) -> Self { + Self { + _mock: mock(method, path) + .match_body(Matcher::JsonString(json_condition.to_string())) + .with_header("content-type", "application/json") + .with_body(json_result) + .create(), + } + } + + pub fn webserver_root_url(&self) -> String { + mockito::server_url() + } +} diff --git a/tests/learning/mockito.rs b/tests/learning/mockito.rs new file mode 100644 index 0000000..2106f29 --- /dev/null +++ b/tests/learning/mockito.rs @@ -0,0 +1,62 @@ +use mockito::{mock, Matcher}; +use reqwest::Client; +use std::collections::HashMap; + +#[test] +fn mockito_expect_body() { + let _webserver = mock("POST", Matcher::Any) + .match_body("FOOBAR") + .with_body("GOTCHA") + .create(); + + let client = Client::new(); + let mut response = client + .post(&mockito::server_url()) + .body("FOOBAR") + .send() + .unwrap(); + + assert_eq!(response.status(), reqwest::StatusCode::OK); + assert_eq!(response.text().unwrap(), "GOTCHA"); +} + +#[test] +fn mockito_expect_json() { + const JSON_STRING: &str = r#"{"foo" : "bar"}"#; + + let _webserver = mock("POST", Matcher::Any) + .match_body(Matcher::JsonString(JSON_STRING.into())) + .with_body("GOTCHA") + .create(); + + let client = Client::new(); + let mut response = client + .post(&mockito::server_url()) + .body(JSON_STRING) + .send() + .unwrap(); + + assert_eq!(response.status(), reqwest::StatusCode::OK); + assert_eq!(response.text().unwrap(), "GOTCHA"); +} + +#[test] +fn mockito_expect_json_from_map() { + let _webserver = mock("POST", Matcher::Any) + 
.match_body(Matcher::JsonString(r#"{"foo" : "bar"}"#.into())) + .with_body("GOTCHA") + .create(); + + let mut map = HashMap::new(); + map.insert("foo", "bar"); + + let client = Client::new(); + let mut response = client + .post(&mockito::server_url()) + .json(&map) + .send() + .unwrap(); + + assert_eq!(response.status(), reqwest::StatusCode::OK); + assert_eq!(response.text().unwrap(), "GOTCHA"); +} diff --git a/tests/learning/mod.rs b/tests/learning/mod.rs new file mode 100644 index 0000000..4c9d171 --- /dev/null +++ b/tests/learning/mod.rs @@ -0,0 +1 @@ +pub mod mockito; diff --git a/tests/learning_tests.rs b/tests/learning_tests.rs new file mode 100644 index 0000000..a1a55fd --- /dev/null +++ b/tests/learning_tests.rs @@ -0,0 +1 @@ +pub mod learning; From 597a581203a5b00158fca14615549398893f3289 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Thu, 6 Jun 2019 11:50:42 +0200 Subject: [PATCH 19/31] removed printlines --- src/pangaea/search_result.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/pangaea/search_result.rs b/src/pangaea/search_result.rs index 314d29c..39c5b62 100644 --- a/src/pangaea/search_result.rs +++ b/src/pangaea/search_result.rs @@ -89,18 +89,12 @@ impl SearchResult { ) -> Result, Error> { let mut entries = Vec::new(); - eprintln!("Start {}", query_url); - let mut result = Self::from_url(query_url)?; - eprintln!("First request {}, Hits: {}", scroll_url, result.hits.total); - while result.hits.total > 0 { entries.append(&mut result.hits.hits); result = Self::from_scroll_url(scroll_url, &result.scroll_id)?; - - eprintln!("Another request --> Hits: {} --> -->", result.hits.total); } entries.append(&mut result.hits.hits); From aa9d3b6615792813a4ebe8548b18d82ee94eb964 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Thu, 6 Jun 2019 13:45:04 +0200 Subject: [PATCH 20/31] added accessor methods to SearchResultEntry --- src/pangaea/search_result.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/pangaea/search_result.rs b/src/pangaea/search_result.rs index 39c5b62..a6ad0fd 100644 --- a/src/pangaea/search_result.rs +++ b/src/pangaea/search_result.rs @@ -103,6 +103,20 @@ impl SearchResult { } } +impl SearchResultEntry { + pub fn id(&self) -> &str { + &self.id + } + + pub fn publisher(&self) -> &str { + &self.source.citation_publisher + } + + pub fn download_url(&self) -> &str { + &self.source.datalink + } +} + #[cfg(test)] mod tests { use super::*; @@ -315,5 +329,10 @@ mod tests { .unwrap(); assert_eq!(5, entries.len()); + + let entry = &entries[0]; + assert_eq!(RESULT_ID, entry.id()); + assert_eq!(DATALINK, entry.download_url()); + assert_eq!(CITATION_PUBLISHER, entry.publisher()); } } From facfff7edb20c4955449eb7f08e1a4b746aa0231 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Thu, 6 Jun 2019 15:00:57 +0200 Subject: [PATCH 21/31] added file downloader, modified pangaea search result name and modified main.rs accordingly --- src/file_downloader.rs | 25 +++++++++ src/main.rs | 103 +++++++++++------------------------ src/pangaea/downloader.rs | 0 src/pangaea/mod.rs | 3 +- src/pangaea/search_result.rs | 76 +++++++++++++------------- 5 files changed, 99 insertions(+), 108 deletions(-) create mode 100644 src/file_downloader.rs delete mode 100644 src/pangaea/downloader.rs diff --git a/src/file_downloader.rs b/src/file_downloader.rs new file mode 100644 index 0000000..ca9f292 --- /dev/null +++ b/src/file_downloader.rs @@ -0,0 +1,25 @@ +use failure::Error; +use std::fs::File; +use std::io::BufWriter; +use 
std::path::{Path, PathBuf}; + +pub struct FileDownloader { + url: String, +} + +impl FileDownloader { + pub fn from_url(url: &str) -> Self { + Self { url: url.into() } + } + + pub fn to_path(&self, path: &Path) -> Result<(), Error> { + let mut response = reqwest::get(&self.url)?; + + let output_file = File::create(&path)?; + + let mut writer = BufWriter::new(&output_file); + std::io::copy(&mut response, &mut writer)?; + + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs index 3d4ea10..748ef0b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,18 +9,20 @@ use simplelog::{CombinedLogger, SharedLogger, TermLogger, WriteLogger}; use settings::Settings; use crate::abcd::{AbcdFields, AbcdParser, ArchiveReader}; -use crate::bms::{download_datasets, load_bms_datasets, BmsDataset, BmsProviders}; use crate::database_sink::DatabaseSink; +use crate::file_downloader::FileDownloader; +use crate::pangaea::{PangaeaSearchResult, PangaeaSearchResultEntry}; mod abcd; mod database_sink; +mod file_downloader; mod pangaea; mod settings; #[cfg(test)] mod test_utils; mod vat_type; -fn main() { +fn main() -> Result<(), Error> { let settings = initialize_settings().expect("Unable to load settings file."); initialize_logger(Path::new(&settings.general.log_file), &settings) @@ -30,7 +32,7 @@ fn main() { Ok(fields) => fields, Err(e) => { error!("Unable to load ABCD file: {}", e); - return; // stop program + return Err(e); // stop program } }; @@ -38,49 +40,37 @@ fn main() { Ok(sink) => sink, Err(e) => { error!("Unable to create database sink: {}", e); - return; // stop program + return Err(e); // stop program } }; - let bms_providers = match BmsProviders::from_url(&settings.bms.provider_url) { - Ok(providers) => providers, + let datasets = match PangaeaSearchResult::retrieve_all_entries(&settings.pangaea) { + Ok(search_entries) => search_entries, Err(e) => { - error!("Unable to download providers from BMS: {}", e); - return; // stop program + error!("Unable to download dataset metadata from Pangaea: {}", e); + return Err(e); // stop program } }; - let bms_datasets = match load_bms_datasets(&settings.bms.monitor_url) { - Ok(datasets) => datasets, - Err(e) => { - error!("Unable to download datasets from BMS: {}", e); - return; // stop program - } - }; - - if let Err(e) = process_datasets( - &settings, - &abcd_fields, - &mut database_sink, - bms_providers, - &bms_datasets, - ) { + if let Err(e) = process_datasets(&settings, &abcd_fields, &mut database_sink, &datasets) { error!("Error processing datasets: {}", e); }; + + Ok(()) } fn process_datasets( settings: &Settings, abcd_fields: &AbcdFields, database_sink: &mut DatabaseSink, - bms_providers: BmsProviders, - bms_datasets: &Vec, + datasets: &[PangaeaSearchResultEntry], ) -> Result<(), Error> { let temp_dir = tempfile::tempdir()?; let mut abcd_parser = AbcdParser::new(&abcd_fields); - for path_result in download_datasets(temp_dir.path(), &bms_datasets) + for dataset in datasets + .iter() .skip( settings .debug @@ -96,51 +86,24 @@ fn process_datasets( .unwrap_or(std::usize::MAX), ) { - let download = match path_result { - Ok(d) => d, - Err(e) => { - warn!("Unable to download file: {}", e); - continue; - } - }; - trace!("Temp file: {}", download.path.display()); + let file_path = temp_dir.path().join(dataset.id()).join(".zip"); + if let Err(e) = FileDownloader::from_url(dataset.download_url()).to_path(&file_path) { + warn!("Unable to download file: {}", e); + continue; + } + + trace!("Temp file: {}", file_path.display()); info!( "Processing `{}` @ `{}` ({})", - 
download.dataset.dataset, - download.dataset.provider_datacenter, - download - .dataset - .get_latest_archive() - .map(|archive| archive.xml_archive.as_str()) - .unwrap_or_else(|_| "-") + dataset.id(), + dataset.publisher(), + dataset.download_url(), ); - let bms_provider = match bms_providers.value_of(&download.dataset.provider_url) { - Some(provider) => provider, - None => { - warn!( - "Unable to retrieve BMS provider from map for {}", - download.dataset.provider_url - ); - continue; - } - }; - - let landing_page = match download.dataset.get_landing_page(&settings, &bms_provider) { - Ok(landing_page) => landing_page, - Err(e) => { - warn!( - "Unable to generate landing page for {}; {}", - download.dataset.dataset, e - ); - continue; - } - }; - - for xml_bytes_result in ArchiveReader::from_path(&download.path) - .unwrap() - .bytes_iter() - { + // TODO: generate landing page url + let landing_page_url: String = String::new(); + + for xml_bytes_result in ArchiveReader::from_path(&file_path).unwrap().bytes_iter() { let xml_bytes = match xml_bytes_result { Ok(bytes) => bytes, Err(e) => { @@ -150,9 +113,9 @@ fn process_datasets( }; let abcd_data = match abcd_parser.parse( - &download.url, - &landing_page, - &bms_provider.name, + dataset.download_url(), + &landing_page_url, + &dataset.publisher(), &xml_bytes, ) { Ok(data) => data, diff --git a/src/pangaea/downloader.rs b/src/pangaea/downloader.rs deleted file mode 100644 index e69de29..0000000 diff --git a/src/pangaea/mod.rs b/src/pangaea/mod.rs index b3e98fc..4b2b49e 100644 --- a/src/pangaea/mod.rs +++ b/src/pangaea/mod.rs @@ -1,2 +1,3 @@ -mod downloader; mod search_result; + +pub use self::search_result::{PangaeaSearchResult, PangaeaSearchResultEntry}; diff --git a/src/pangaea/search_result.rs b/src/pangaea/search_result.rs index a6ad0fd..8871de9 100644 --- a/src/pangaea/search_result.rs +++ b/src/pangaea/search_result.rs @@ -1,36 +1,37 @@ +use crate::settings::Pangaea; use failure::Error; use serde::Deserialize; use serde_json::json; use std::collections::HashMap; #[derive(Clone, Debug, Deserialize, PartialEq)] -pub struct SearchResult { +pub struct PangaeaSearchResult { #[serde(rename = "_scroll_id")] scroll_id: String, - hits: SearchResultHits, + hits: PangaeaSearchResultHits, } #[derive(Clone, Debug, Deserialize, PartialEq)] -struct SearchResultHits { +struct PangaeaSearchResultHits { total: u64, - hits: Vec, + hits: Vec, } #[derive(Clone, Debug, Deserialize, PartialEq)] -pub struct SearchResultEntry { +pub struct PangaeaSearchResultEntry { #[serde(rename = "_id")] id: String, #[serde(rename = "_source")] - source: SearchResultEntrySource, + source: PangaeaSearchResultEntrySource, } #[derive(Clone, Debug, Deserialize, PartialEq)] -struct SearchResultEntrySource { +struct PangaeaSearchResultEntrySource { citation_publisher: String, datalink: String, } -impl SearchResult { +impl PangaeaSearchResult { const SCROLL_TIMEOUT: &'static str = "1m"; fn from_url(url: &str) -> Result { @@ -84,17 +85,16 @@ impl SearchResult { } pub fn retrieve_all_entries( - query_url: &str, - scroll_url: &str, - ) -> Result, Error> { + pangaea_settings: &Pangaea, + ) -> Result, Error> { let mut entries = Vec::new(); - let mut result = Self::from_url(query_url)?; + let mut result = Self::from_url(&pangaea_settings.search_url)?; while result.hits.total > 0 { entries.append(&mut result.hits.hits); - result = Self::from_scroll_url(scroll_url, &result.scroll_id)?; + result = Self::from_scroll_url(&pangaea_settings.scroll_url, &result.scroll_id)?; } entries.append(&mut 
result.hits.hits); @@ -103,7 +103,7 @@ impl SearchResult { } } -impl SearchResultEntry { +impl PangaeaSearchResultEntry { pub fn id(&self) -> &str { &self.id } @@ -178,14 +178,14 @@ mod tests { #[test] fn parse_search_result_entry_source() { - let search_result_entry_source = serde_json::from_str::( + let search_result_entry_source = serde_json::from_str::( &SEARCH_RESULT_ENTRY_SOURCE_JSON().to_string(), ) .unwrap(); assert_eq!( search_result_entry_source, - SearchResultEntrySource { + PangaeaSearchResultEntrySource { citation_publisher: CITATION_PUBLISHER.into(), datalink: DATALINK.into(), } @@ -194,15 +194,16 @@ mod tests { #[test] fn parse_search_result_entry() { - let search_result_entry = - serde_json::from_str::(&SEARCH_RESULT_ENTRY_JSON().to_string()) - .unwrap(); + let search_result_entry = serde_json::from_str::( + &SEARCH_RESULT_ENTRY_JSON().to_string(), + ) + .unwrap(); assert_eq!( search_result_entry, - SearchResultEntry { + PangaeaSearchResultEntry { id: RESULT_ID.to_string(), - source: SearchResultEntrySource { + source: PangaeaSearchResultEntrySource { citation_publisher: CITATION_PUBLISHER.into(), datalink: DATALINK.into(), }, @@ -213,24 +214,24 @@ mod tests { #[test] fn parse_search_result_hits() { let search_result_hits = - serde_json::from_str::(&SEARCH_RESULT_HITS_JSON().to_string()) + serde_json::from_str::(&SEARCH_RESULT_HITS_JSON().to_string()) .unwrap(); assert_eq!( search_result_hits, - SearchResultHits { + PangaeaSearchResultHits { total: SEARCH_RESULT_HITS, hits: vec![ - SearchResultEntry { + PangaeaSearchResultEntry { id: RESULT_ID.to_string(), - source: SearchResultEntrySource { + source: PangaeaSearchResultEntrySource { citation_publisher: CITATION_PUBLISHER.into(), datalink: DATALINK.into(), }, }, - SearchResultEntry { + PangaeaSearchResultEntry { id: RESULT_ID_2.to_string(), - source: SearchResultEntrySource { + source: PangaeaSearchResultEntrySource { citation_publisher: CITATION_PUBLISHER_2.into(), datalink: DATALINK_2.into(), }, @@ -243,7 +244,7 @@ mod tests { #[test] fn parse_search_result() { let search_result = - serde_json::from_str::(&SEARCH_RESULT_JSON().to_string()).unwrap(); + serde_json::from_str::(&SEARCH_RESULT_JSON().to_string()).unwrap(); assert_eq!(search_result.scroll_id, SCROLL_ID); assert_eq!(search_result.hits.hits.len(), 2); @@ -252,12 +253,12 @@ mod tests { #[test] fn parse_webserver_result() { let webserver = MockWebserver::from_json( - &format!("/?scroll={}", SearchResult::SCROLL_TIMEOUT), + &format!("/?scroll={}", PangaeaSearchResult::SCROLL_TIMEOUT), "POST", &SEARCH_RESULT_JSON().to_string(), ); - let search_result = SearchResult::from_url(&webserver.webserver_root_url()).unwrap(); + let search_result = PangaeaSearchResult::from_url(&webserver.webserver_root_url()).unwrap(); assert_eq!(search_result.scroll_id, SCROLL_ID); assert_eq!(search_result.hits.hits.len(), 2); @@ -268,7 +269,8 @@ mod tests { let webserver = MockWebserver::from_json("/", "POST", &SEARCH_RESULT_JSON().to_string()); let search_result = - SearchResult::from_scroll_url(&webserver.webserver_root_url(), SCROLL_ID).unwrap(); + PangaeaSearchResult::from_scroll_url(&webserver.webserver_root_url(), SCROLL_ID) + .unwrap(); assert_eq!(search_result.scroll_id, SCROLL_ID); assert_eq!(search_result.hits.hits.len(), 2); @@ -282,7 +284,7 @@ mod tests { "/scroll", "POST", &json!({ - "scroll" : SearchResult::SCROLL_TIMEOUT, + "scroll" : PangaeaSearchResult::SCROLL_TIMEOUT, "scroll_id" : SCROLL_ID, }) .to_string(), @@ -303,7 +305,7 @@ mod tests { "/scroll", "POST", &json!({ - 
"scroll" : SearchResult::SCROLL_TIMEOUT, + "scroll" : PangaeaSearchResult::SCROLL_TIMEOUT, "scroll_id" : SCROLL_ID_2, }) .to_string(), @@ -322,10 +324,10 @@ mod tests { assert_eq!(_m2.webserver_root_url(), _m3.webserver_root_url()); - let entries = SearchResult::retrieve_all_entries( - &_m1.webserver_root_url(), - &format!("{}/scroll", _m2.webserver_root_url()), - ) + let entries = PangaeaSearchResult::retrieve_all_entries(&Pangaea { + search_url: _m1.webserver_root_url(), + scroll_url: format!("{}/scroll", _m2.webserver_root_url()), + }) .unwrap(); assert_eq!(5, entries.len()); From 849642f9b2a727abce59f602dc48fa2087590a88 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Thu, 6 Jun 2019 16:54:41 +0200 Subject: [PATCH 22/31] tests, renaming of settings components and moving of storage files --- settings-default.toml | 1 + src/file_downloader.rs | 26 ++++++++- src/main.rs | 25 ++++++-- src/pangaea/search_result.rs | 6 +- src/settings.rs | 25 ++++---- src/{ => storage}/database_sink.rs | 92 ++++++++++++++---------------- src/storage/field.rs | 21 +++++++ src/storage/mod.rs | 5 ++ src/test_utils/mod.rs | 7 +++ src/test_utils/webserver.rs | 6 ++ 10 files changed, 142 insertions(+), 72 deletions(-) rename src/{ => storage}/database_sink.rs (89%) create mode 100644 src/storage/field.rs create mode 100644 src/storage/mod.rs diff --git a/settings-default.toml b/settings-default.toml index 3e69cab..b81e2a3 100644 --- a/settings-default.toml +++ b/settings-default.toml @@ -8,6 +8,7 @@ dataset_limit = 3 [abcd] fields_file = "abcd-fields.json" +landing_page_field = "/DataSets/DataSet/Metadata/Description/Representation/URI" [pangaea] search_url = "https://ws.pangaea.de/es/dataportal-gfbio/pansimple/_search" diff --git a/src/file_downloader.rs b/src/file_downloader.rs index ca9f292..89f411b 100644 --- a/src/file_downloader.rs +++ b/src/file_downloader.rs @@ -1,7 +1,7 @@ use failure::Error; use std::fs::File; use std::io::BufWriter; -use std::path::{Path, PathBuf}; +use std::path::Path; pub struct FileDownloader { url: String, @@ -23,3 +23,27 @@ impl FileDownloader { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + + use crate::test_utils::{create_empty_temp_file, MockWebserver}; + use std::fs; + + #[test] + fn download_file() { + const CONTENT: &str = "foobar"; + + let webserver = MockWebserver::from_text("/", "GET", CONTENT); + let download_file = create_empty_temp_file(); + + FileDownloader::from_url(&webserver.webserver_root_url()) + .to_path(&download_file) + .unwrap(); + + let file_content = fs::read_to_string(download_file).unwrap(); + + assert_eq!(CONTENT, file_content); + } +} diff --git a/src/main.rs b/src/main.rs index 748ef0b..3a8d109 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,15 +9,16 @@ use simplelog::{CombinedLogger, SharedLogger, TermLogger, WriteLogger}; use settings::Settings; use crate::abcd::{AbcdFields, AbcdParser, ArchiveReader}; -use crate::database_sink::DatabaseSink; use crate::file_downloader::FileDownloader; use crate::pangaea::{PangaeaSearchResult, PangaeaSearchResultEntry}; +use crate::settings::TerminologyServiceSettings; +use crate::storage::DatabaseSink; mod abcd; -mod database_sink; mod file_downloader; mod pangaea; mod settings; +mod storage; #[cfg(test)] mod test_utils; mod vat_type; @@ -39,7 +40,7 @@ fn main() -> Result<(), Error> { let mut database_sink = match DatabaseSink::new(&settings.database, &abcd_fields) { Ok(sink) => sink, Err(e) => { - error!("Unable to create database sink: {}", e); + error!("Unable to create storage sink: {}", e); 
return Err(e); // stop program } }; @@ -100,8 +101,9 @@ fn process_datasets( dataset.download_url(), ); - // TODO: generate landing page url - let landing_page_url: String = String::new(); + // TODO: update landing page url from field + let landing_page_url: String = + propose_landing_page(&settings.terminology_service, dataset.download_url()); for xml_bytes_result in ArchiveReader::from_path(&file_path).unwrap().bytes_iter() { let xml_bytes = match xml_bytes_result { @@ -129,7 +131,7 @@ fn process_datasets( match database_sink.insert_dataset(&abcd_data) { Ok(_) => (), - Err(e) => warn!("Unable to insert dataset into database: {}", e), + Err(e) => warn!("Unable to insert dataset into storage: {}", e), }; } } @@ -190,3 +192,14 @@ fn initialize_logger(file_path: &Path, settings: &Settings) -> Result<(), Error> Ok(()) } + +fn propose_landing_page( + terminology_service_settings: &TerminologyServiceSettings, + dataset_url: &str, +) -> String { + format!( + "{base_url}?archive={dataset_url}", + base_url = terminology_service_settings.landingpage_url, + dataset_url = dataset_url, + ) +} diff --git a/src/pangaea/search_result.rs b/src/pangaea/search_result.rs index 8871de9..b4e3417 100644 --- a/src/pangaea/search_result.rs +++ b/src/pangaea/search_result.rs @@ -1,4 +1,4 @@ -use crate::settings::Pangaea; +use crate::settings::PangaeaSettings; use failure::Error; use serde::Deserialize; use serde_json::json; @@ -85,7 +85,7 @@ impl PangaeaSearchResult { } pub fn retrieve_all_entries( - pangaea_settings: &Pangaea, + pangaea_settings: &PangaeaSettings, ) -> Result, Error> { let mut entries = Vec::new(); @@ -324,7 +324,7 @@ mod tests { assert_eq!(_m2.webserver_root_url(), _m3.webserver_root_url()); - let entries = PangaeaSearchResult::retrieve_all_entries(&Pangaea { + let entries = PangaeaSearchResult::retrieve_all_entries(&PangaeaSettings { search_url: _m1.webserver_root_url(), scroll_url: format!("{}/scroll", _m2.webserver_root_url()), }) diff --git a/src/settings.rs b/src/settings.rs index 16cfd86..a6a16f8 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -6,29 +6,30 @@ use config::File; use serde::Deserialize; #[derive(Debug, Deserialize)] -pub struct General { +pub struct GeneralSettings { pub log_file: String, pub debug: bool, } #[derive(Debug, Deserialize)] -pub struct Abcd { +pub struct AbcdSettings { pub fields_file: String, + pub landing_page_field: String, } #[derive(Debug, Deserialize)] -pub struct Pangaea { +pub struct PangaeaSettings { pub search_url: String, pub scroll_url: String, } #[derive(Debug, Deserialize)] -pub struct TerminologyService { +pub struct TerminologyServiceSettings { pub landingpage_url: String, } #[derive(Debug, Deserialize)] -pub struct Database { +pub struct DatabaseSettings { pub host: String, pub port: u16, pub tls: bool, @@ -49,7 +50,7 @@ pub struct Database { } #[derive(Debug, Deserialize)] -pub struct Debug { +pub struct DebugSettings { pub dataset_start: Option, pub dataset_limit: Option, } @@ -57,12 +58,12 @@ pub struct Debug { /// This struct stores the program settings. 
#[derive(Debug, Deserialize)] pub struct Settings { - pub abcd: Abcd, - pub pangaea: Pangaea, - pub terminology_service: TerminologyService, - pub database: Database, - pub debug: Debug, - pub general: General, + pub abcd: AbcdSettings, + pub pangaea: PangaeaSettings, + pub terminology_service: TerminologyServiceSettings, + pub database: DatabaseSettings, + pub debug: DebugSettings, + pub general: GeneralSettings, } impl Settings { diff --git a/src/database_sink.rs b/src/storage/database_sink.rs similarity index 89% rename from src/database_sink.rs rename to src/storage/database_sink.rs index 05f3823..b264b9d 100644 --- a/src/database_sink.rs +++ b/src/storage/database_sink.rs @@ -12,29 +12,28 @@ use postgres::{Connection, TlsMode}; use crate::abcd::{AbcdFields, AbcdResult, ValueMap}; use crate::settings; +use crate::storage::Field; const POSTGRES_CSV_CONFIGURATION: &str = "DELIMITER '\t', NULL '', QUOTE '\"', ESCAPE '\"', FORMAT CSV"; -/// A PostgreSQL database DAO for storing datasets. +/// A PostgreSQL storage DAO for storing datasets. pub struct DatabaseSink<'s> { connection: Connection, - database_settings: &'s settings::Database, - dataset_fields: Vec, - dataset_fields_hash: Vec, + database_settings: &'s settings::DatabaseSettings, + dataset_fields: Vec, datasets_to_ids: HashMap, next_dataset_id: u32, - unit_fields: Vec, - unit_fields_hash: Vec, + unit_fields: Vec, } impl<'s> DatabaseSink<'s> { - /// Create a new PostgreSQL database sink (DAO). + /// Create a new PostgreSQL storage sink (DAO). pub fn new( - database_settings: &'s settings::Database, + database_settings: &'s settings::DatabaseSettings, abcd_fields: &AbcdFields, ) -> Result { - // create database connection params from the settings, including optional tls + // create storage connection params from the settings, including optional tls let negotiator = if database_settings.tls { Some(OpenSsl::new()?) } else { @@ -46,24 +45,14 @@ impl<'s> DatabaseSink<'s> { .database(&database_settings.database) .build(Host::Tcp(database_settings.host.clone())); - // fill lists of dataset and unit fields and give them a fixed order for the database inserts + // fill lists of dataset and unit fields and give them a fixed order for the storage inserts let mut dataset_fields = Vec::new(); - let mut dataset_fields_hash = Vec::new(); let mut unit_fields = Vec::new(); - let mut unit_fields_hash = Vec::new(); - let mut hasher = sha1::Sha1::new(); for field in abcd_fields { - let hash = { - hasher.reset(); - hasher.update(field.name.as_bytes()); - hasher.digest().to_string() - }; if field.global_field { - dataset_fields.push(field.name.clone()); - dataset_fields_hash.push(hash); + dataset_fields.push(field.name.as_str().into()); } else { - unit_fields.push(field.name.clone()); - unit_fields_hash.push(hash); + unit_fields.push(field.name.as_str().into()); } } @@ -78,11 +67,9 @@ impl<'s> DatabaseSink<'s> { )?, database_settings, dataset_fields, - dataset_fields_hash, datasets_to_ids: HashMap::new(), next_dataset_id: 1, unit_fields, - unit_fields_hash, }; sink.initialize_temporary_schema(abcd_fields)?; @@ -90,7 +77,7 @@ impl<'s> DatabaseSink<'s> { Ok(sink) } - /// Initialize the temporary database schema. + /// Initialize the temporary storage schema. 
fn initialize_temporary_schema(&mut self, abcd_fields: &AbcdFields) -> Result<(), Error> { self.drop_temporary_tables()?; @@ -121,11 +108,8 @@ impl<'s> DatabaseSink<'s> { schema = self.database_settings.schema, table = self.database_settings.temp_dataset_table ))?; - for (name, hash) in self.dataset_fields.iter().zip(&self.dataset_fields_hash) { - statement.execute(&[name, hash])?; - } - for (name, hash) in self.unit_fields.iter().zip(&self.unit_fields_hash) { - statement.execute(&[name, hash])?; + for field in self.dataset_fields.iter().chain(&self.unit_fields) { + statement.execute(&[&field.name, &field.hash])?; } Ok(()) @@ -138,10 +122,10 @@ impl<'s> DatabaseSink<'s> { self.database_settings.dataset_id_column )]; - for (field, hash) in self.unit_fields.iter().zip(&self.unit_fields_hash) { + for field in &self.unit_fields { let abcd_field = abcd_fields - .value_of(field.as_bytes()) - .ok_or_else(|| DatabaseSinkError::InconsistentUnitColumns(field.clone()))?; + .value_of(field.name.as_bytes()) + .ok_or_else(|| DatabaseSinkError::InconsistentUnitColumns(field.name.clone()))?; let data_type_string = if abcd_field.numeric { "double precision" @@ -153,7 +137,12 @@ impl<'s> DatabaseSink<'s> { // let null_string = if abcd_field.vat_mandatory { "NOT NULL" } else { "" } let null_string = ""; - fields.push(format!("\"{}\" {} {}", hash, data_type_string, null_string)); + fields.push(format!( + "\"{hash}\" {datatype} {nullable}", + hash = field.hash, + datatype = data_type_string, + nullable = null_string, + )); } self.connection.execute( @@ -190,10 +179,10 @@ impl<'s> DatabaseSink<'s> { ), // provider name ]; - for (field, hash) in self.dataset_fields.iter().zip(&self.dataset_fields_hash) { + for field in &self.dataset_fields { let abcd_field = abcd_fields - .value_of(field.as_bytes()) - .ok_or_else(|| DatabaseSinkError::InconsistentDatasetColumns(field.clone()))?; + .value_of(field.name.as_bytes()) + .ok_or_else(|| DatabaseSinkError::InconsistentDatasetColumns(field.name.clone()))?; let data_type_string = if abcd_field.numeric { "double precision" @@ -205,7 +194,12 @@ impl<'s> DatabaseSink<'s> { // let null_string = if abcd_field.vat_mandatory { "NOT NULL" } else { "" } let null_string = ""; - fields.push(format!("\"{}\" {} {}", hash, data_type_string, null_string)); + fields.push(format!( + "\"{hash}\" {datatype} {nullable}", + hash = field.hash, + datatype = data_type_string, + nullable = null_string, + )); } self.connection.execute( @@ -480,7 +474,7 @@ impl<'s> DatabaseSink<'s> { /// Insert a dataset and its units into the temporary tables. 
pub fn insert_dataset(&mut self, abcd_data: &AbcdResult) -> Result<(), Error> { // retrieve the id for the dataset - // if the dataset is not found, it is necessary to create a dataset database entry at first + // if the dataset is not found, it is necessary to create a dataset storage entry at first let dataset_unique_string = self.to_combined_string(&abcd_data.dataset); let dataset_id = match self.datasets_to_ids.entry(dataset_unique_string) { Entry::Occupied(e) => *e.get(), @@ -492,7 +486,6 @@ impl<'s> DatabaseSink<'s> { &self.database_settings, &self.connection, self.dataset_fields.as_slice(), - self.dataset_fields_hash.as_slice(), abcd_data, id, )?; @@ -512,10 +505,9 @@ impl<'s> DatabaseSink<'s> { /// Insert the dataset metadata into the temporary schema fn insert_dataset_metadata( - database_settings: &settings::Database, + database_settings: &settings::DatabaseSettings, connection: &Connection, - dataset_fields: &[String], - dataset_fields_hash: &[String], + dataset_fields: &[Field], abcd_data: &AbcdResult, id: u32, ) -> Result<(), Error> { @@ -536,9 +528,9 @@ impl<'s> DatabaseSink<'s> { values.write_field(abcd_data.dataset_path.clone())?; values.write_field(abcd_data.landing_page.clone())?; values.write_field(abcd_data.provider_id.clone())?; - for (field, hash) in dataset_fields.iter().zip(dataset_fields_hash.iter()) { - columns.push(&hash); - if let Some(value) = abcd_data.dataset.get(field) { + for field in dataset_fields { + columns.push(&field.hash); + if let Some(value) = abcd_data.dataset.get(&field.name) { values.write_field(value.to_string())?; } else { values.write_field("")?; @@ -568,7 +560,7 @@ impl<'s> DatabaseSink<'s> { /// Insert the dataset units into the temporary schema fn insert_units(&mut self, abcd_data: &AbcdResult, dataset_id: u32) -> Result<(), Error> { let mut columns: Vec = vec![self.database_settings.dataset_id_column.clone()]; - columns.extend_from_slice(self.unit_fields_hash.as_slice()); + columns.extend(self.unit_fields.iter().map(|field| field.name.clone())); let dataset_id_string = dataset_id.to_string(); @@ -585,7 +577,7 @@ impl<'s> DatabaseSink<'s> { values.write_field(&dataset_id_string)?; // put id first for field in &self.unit_fields { - if let Some(value) = unit_data.get(field) { + if let Some(value) = unit_data.get(&field.name) { values.write_field(value.to_string())?; } else { values.write_field("")?; @@ -615,7 +607,7 @@ impl<'s> DatabaseSink<'s> { let mut hash = String::new(); for field in &self.dataset_fields { - if let Some(value) = dataset_data.get(field) { + if let Some(value) = dataset_data.get(&field.name) { hash.push_str(&value.to_string()); } } @@ -624,7 +616,7 @@ impl<'s> DatabaseSink<'s> { } } -/// An error enum for different database sink errors. +/// An error enum for different storage sink errors. #[derive(Debug, Fail)] pub enum DatabaseSinkError { /// This error occurs when there is an inconsistency between the ABCD dataset data and the sink's columns. 
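The parallel `dataset_fields`/`dataset_fields_hash` and `unit_fields`/`unit_fields_hash` vectors are folded into the small `Field` helper introduced below in src/storage/field.rs: it pairs an ABCD field path with its SHA-1 hex digest, and the digest doubles as the column name (presumably to stay within PostgreSQL's 63-character identifier limit, which the full ABCD paths would exceed). A minimal usage sketch, assuming it is called from inside the `storage` module since the type is only re-exported with `pub(self)`:

    use crate::storage::Field;

    fn field_demo() {
        // The full ABCD path stays available for value-map lookups, while the
        // 40-character SHA-1 hex digest serves as the fixed-length column name.
        let field = Field::new("/DataSets/DataSet/Units/Unit/UnitID");
        assert_eq!(field.name, "/DataSets/DataSet/Units/Unit/UnitID");
        assert_eq!(field.hash.len(), 40);
    }

The `_translation` table populated above keeps the reverse mapping from digest back to the original field name.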
diff --git a/src/storage/field.rs b/src/storage/field.rs new file mode 100644 index 0000000..baf84f8 --- /dev/null +++ b/src/storage/field.rs @@ -0,0 +1,21 @@ +use sha1::Sha1; + +pub struct Field { + pub name: String, + pub hash: String, +} + +impl Field { + pub fn new(name: &str) -> Self { + Self { + name: name.into(), + hash: Sha1::from(name).digest().to_string(), + } + } +} + +impl From<&str> for Field { + fn from(name: &str) -> Self { + Self::new(name) + } +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs new file mode 100644 index 0000000..5f7f40e --- /dev/null +++ b/src/storage/mod.rs @@ -0,0 +1,5 @@ +mod database_sink; +mod field; + +pub use self::database_sink::DatabaseSink; +pub(self) use self::field::Field; diff --git a/src/test_utils/mod.rs b/src/test_utils/mod.rs index 8450d94..ddb06c8 100644 --- a/src/test_utils/mod.rs +++ b/src/test_utils/mod.rs @@ -20,3 +20,10 @@ pub fn create_temp_file_with_suffix(suffix: &str, content: &str) -> TempPath { file.into_temp_path() } + +pub fn create_empty_temp_file() -> TempPath { + tempfile::Builder::new() + .tempfile() + .expect("Unable to create test file.") + .into_temp_path() +} diff --git a/src/test_utils/webserver.rs b/src/test_utils/webserver.rs index 036be69..bac0d62 100644 --- a/src/test_utils/webserver.rs +++ b/src/test_utils/webserver.rs @@ -5,6 +5,12 @@ pub struct MockWebserver { } impl MockWebserver { + pub fn from_text(path: &str, method: &str, text: &str) -> Self { + Self { + _mock: mock(method, path).with_body(text).create(), + } + } + pub fn from_json(path: &str, method: &str, json_string: &str) -> Self { Self { _mock: mock(method, path) From 560671bf13c6f4d1e01bceb7c3ec1ea80c0931c7 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Thu, 6 Jun 2019 17:21:59 +0200 Subject: [PATCH 23/31] use landing page from archive if available, otherwise use the proposal url from the terminology server --- src/abcd/abcd_parser.rs | 66 +++++++++++++++++++++++++++++------- src/main.rs | 4 +-- src/storage/database_sink.rs | 2 +- 3 files changed, 56 insertions(+), 16 deletions(-) diff --git a/src/abcd/abcd_parser.rs b/src/abcd/abcd_parser.rs index 874a9eb..d7450c2 100644 --- a/src/abcd/abcd_parser.rs +++ b/src/abcd/abcd_parser.rs @@ -6,6 +6,7 @@ use quick_xml::events::Event; use quick_xml::Reader; use crate::abcd::{AbcdFields, AbcdVersion}; +use crate::settings::AbcdSettings; use crate::vat_type::VatType; pub type ValueMap = HashMap; @@ -14,6 +15,7 @@ pub type ValueMap = HashMap; #[derive(Debug)] pub struct AbcdParser<'a> { abcd_fields: &'a AbcdFields, + abcd_settings: &'a AbcdSettings, abcd_version: AbcdVersion, xml_tag_path: Vec, xml_buffer: Vec, @@ -22,8 +24,9 @@ pub struct AbcdParser<'a> { impl<'a> AbcdParser<'a> { /// Create a new `AbcdParser`. - pub fn new(abcd_fields: &'a AbcdFields) -> Self { + pub fn new(abcd_settings: &'a AbcdSettings, abcd_fields: &'a AbcdFields) -> Self { Self { + abcd_settings, abcd_fields, abcd_version: AbcdVersion::Unknown, xml_tag_path: Vec::new(), @@ -35,9 +38,10 @@ impl<'a> AbcdParser<'a> { /// Parse a binary XML file to `AbcdResult`s. 
pub fn parse( &mut self, + dataset_id: &str, dataset_path: &str, - landing_page: &str, - provider_id: &str, + landing_page_proposal: &str, + provider_name: &str, xml_bytes: &[u8], ) -> Result { let mut xml_reader = Reader::from_reader(xml_bytes); @@ -131,10 +135,19 @@ impl<'a> AbcdParser<'a> { self.clear(); // clear resources like buffers if let Some(dataset_data) = dataset_data { + let landing_page = if let Some(VatType::Textual(value)) = + dataset_data.get(&self.abcd_settings.landing_page_field) + { + value + } else { + landing_page_proposal + }; + Ok(AbcdResult::new( + dataset_id.into(), dataset_path.into(), landing_page.into(), - provider_id.into(), + provider_name.into(), dataset_data, units, )) @@ -168,9 +181,10 @@ impl<'a> AbcdParser<'a> { /// This struct reflects the result of a parsed xml file with miscellaneous additional static meta data pub struct AbcdResult { + pub dataset_id: String, pub dataset_path: String, pub landing_page: String, - pub provider_id: String, + pub provider_name: String, pub dataset: ValueMap, pub units: Vec, } @@ -178,16 +192,18 @@ pub struct AbcdResult { impl AbcdResult { /// This constructor creates a new `AbcdResult` from dataset and unit data. pub fn new( + dataset_id: String, dataset_path: String, landing_page: String, - provider_id: String, + provider_name: String, dataset_data: ValueMap, units_data: Vec, ) -> Self { AbcdResult { + dataset_id, dataset_path, landing_page, - provider_id, + provider_name, dataset: dataset_data, units: units_data, } @@ -207,6 +223,7 @@ mod tests { const TECHNICAL_CONTACT_NAME: &str = "TECHNICAL CONTACT NAME"; const DESCRIPTION_TITLE: &str = "DESCRIPTION TITLE"; + const LANDING_PAGE: &str = "http://LANDING-PAGE/"; const UNIT_ID: &str = "UNIT ID"; const UNIT_LONGITUDE: f64 = 10.911; const UNIT_LATITUDE: f64 = 49.911; @@ -215,21 +232,34 @@ mod tests { #[test] fn simple_file() { let abcd_fields = create_abcd_fields(); + let abcd_settings = AbcdSettings { + fields_file: "".into(), + landing_page_field: "/DataSets/DataSet/Metadata/Description/Representation/URI".into(), + }; + let test_file = create_file_as_bytes(); - let mut parser = AbcdParser::new(&abcd_fields); + let mut parser = AbcdParser::new(&abcd_settings, &abcd_fields); + let dataset_id = "dataset_id"; let dataset_path = "dataset_path"; - let landing_page = "landing_page"; - let provider_id = "provider_id"; + let landing_page_proposal = "landing_page proposal"; + let provider_name = "provider_id"; let result = parser - .parse(dataset_path, landing_page, provider_id, &test_file) + .parse( + dataset_id, + dataset_path, + landing_page_proposal, + provider_name, + &test_file, + ) .expect("Unable to parse bytes"); + assert_eq!(result.dataset_id, dataset_id); assert_eq!(result.dataset_path, dataset_path); - assert_eq!(result.landing_page, landing_page); - assert_eq!(result.provider_id, provider_id); + assert_eq!(result.landing_page, LANDING_PAGE); + assert_eq!(result.provider_name, provider_name); assert_eq!( Some(&VatType::Textual(TECHNICAL_CONTACT_NAME.into())), @@ -283,6 +313,7 @@ mod tests { {DESCRIPTION_TITLE} + {LANDING_PAGE} @@ -307,6 +338,7 @@ mod tests { "#, TECHNICAL_CONTACT_NAME = TECHNICAL_CONTACT_NAME, DESCRIPTION_TITLE = DESCRIPTION_TITLE, + LANDING_PAGE = LANDING_PAGE, UNIT_ID = UNIT_ID, UNIT_LONGITUDE = UNIT_LONGITUDE, UNIT_LATITUDE = UNIT_LATITUDE, @@ -333,6 +365,14 @@ mod tests { "globalField": true, "unit": "" }, + { + "name": "/DataSets/DataSet/Metadata/Description/Representation/URI", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": 
true, + "globalField": true, + "unit": "" + }, { "name": "/DataSets/DataSet/Units/Unit/UnitID", "numeric": false, diff --git a/src/main.rs b/src/main.rs index 3a8d109..1201f55 100644 --- a/src/main.rs +++ b/src/main.rs @@ -68,7 +68,7 @@ fn process_datasets( ) -> Result<(), Error> { let temp_dir = tempfile::tempdir()?; - let mut abcd_parser = AbcdParser::new(&abcd_fields); + let mut abcd_parser = AbcdParser::new(&settings.abcd, &abcd_fields); for dataset in datasets .iter() @@ -101,7 +101,6 @@ fn process_datasets( dataset.download_url(), ); - // TODO: update landing page url from field let landing_page_url: String = propose_landing_page(&settings.terminology_service, dataset.download_url()); @@ -115,6 +114,7 @@ fn process_datasets( }; let abcd_data = match abcd_parser.parse( + dataset.id(), dataset.download_url(), &landing_page_url, &dataset.publisher(), diff --git a/src/storage/database_sink.rs b/src/storage/database_sink.rs index b264b9d..6d9a946 100644 --- a/src/storage/database_sink.rs +++ b/src/storage/database_sink.rs @@ -527,7 +527,7 @@ impl<'s> DatabaseSink<'s> { values.write_field(id.to_string())?; values.write_field(abcd_data.dataset_path.clone())?; values.write_field(abcd_data.landing_page.clone())?; - values.write_field(abcd_data.provider_id.clone())?; + values.write_field(abcd_data.provider_name.clone())?; for field in dataset_fields { columns.push(&field.hash); if let Some(value) = abcd_data.dataset.get(&field.name) { From c5f43ac7048d78be625f2dc502e976ebbe8f75c6 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Thu, 6 Jun 2019 17:24:43 +0200 Subject: [PATCH 24/31] field -> hash bytes of string instead of string --- src/storage/field.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/storage/field.rs b/src/storage/field.rs index baf84f8..ba83f49 100644 --- a/src/storage/field.rs +++ b/src/storage/field.rs @@ -9,7 +9,7 @@ impl Field { pub fn new(name: &str) -> Self { Self { name: name.into(), - hash: Sha1::from(name).digest().to_string(), + hash: Sha1::from(name.as_bytes()).digest().to_string(), } } } From 4e41ebdcffc53c2581aed7e2052a8444622cf2be Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Tue, 25 Jun 2019 08:57:55 +0200 Subject: [PATCH 25/31] database sink tests and refactoring --- settings-default.toml | 1 + src/settings.rs | 1 + src/storage/database_sink.rs | 951 +++++++++++++++++++++++++++++++---- src/storage/field.rs | 12 + src/storage/mod.rs | 2 + src/storage/surrogate_key.rs | 67 +++ 6 files changed, 924 insertions(+), 110 deletions(-) create mode 100644 src/storage/surrogate_key.rs diff --git a/settings-default.toml b/settings-default.toml index b81e2a3..09309d0 100644 --- a/settings-default.toml +++ b/settings-default.toml @@ -27,6 +27,7 @@ password = "" schema = "" dataset_table = "abcd_datasets" temp_dataset_table = "abcd_datasets_temp" +surrogate_key_column = "surrogate_key" dataset_id_column = "dataset_id" dataset_path_column = "dataset_path" dataset_landing_page_column = "dataset_landing_page" diff --git a/src/settings.rs b/src/settings.rs index a6a16f8..ff09a14 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -40,6 +40,7 @@ pub struct DatabaseSettings { pub dataset_table: String, pub listing_view: String, pub temp_dataset_table: String, + pub surrogate_key_column: String, pub dataset_id_column: String, pub dataset_path_column: String, pub dataset_landing_page_column: String, diff --git a/src/storage/database_sink.rs b/src/storage/database_sink.rs index 6d9a946..533324f 100644 --- a/src/storage/database_sink.rs 
+++ b/src/storage/database_sink.rs @@ -1,6 +1,3 @@ -use std::collections::hash_map::Entry; -use std::collections::HashMap; - use csv::WriterBuilder; use failure::{Error, Fail}; use log::debug; @@ -10,9 +7,10 @@ use postgres::tls::openssl::OpenSsl; use postgres::transaction::Transaction; use postgres::{Connection, TlsMode}; -use crate::abcd::{AbcdFields, AbcdResult, ValueMap}; +use crate::abcd::{AbcdFields, AbcdResult}; use crate::settings; -use crate::storage::Field; +use crate::settings::DatabaseSettings; +use crate::storage::{Field, SurrogateKey, SurrogateKeyType}; const POSTGRES_CSV_CONFIGURATION: &str = "DELIMITER '\t', NULL '', QUOTE '\"', ESCAPE '\"', FORMAT CSV"; @@ -22,8 +20,7 @@ pub struct DatabaseSink<'s> { connection: Connection, database_settings: &'s settings::DatabaseSettings, dataset_fields: Vec, - datasets_to_ids: HashMap, - next_dataset_id: u32, + surrogate_key: SurrogateKey, unit_fields: Vec, } @@ -33,21 +30,53 @@ impl<'s> DatabaseSink<'s> { database_settings: &'s settings::DatabaseSettings, abcd_fields: &AbcdFields, ) -> Result { - // create storage connection params from the settings, including optional tls - let negotiator = if database_settings.tls { - Some(OpenSsl::new()?) - } else { - None + let connection = >::create_database_connection(&database_settings)?; + + let (dataset_fields, unit_fields) = + >::create_lists_of_dataset_and_unit_fields(abcd_fields); + + let mut sink = Self { + connection, + database_settings, + dataset_fields, + surrogate_key: Default::default(), + unit_fields, }; + + sink.initialize_temporary_schema(abcd_fields)?; + + Ok(sink) + } + + fn create_database_connection( + database_settings: &DatabaseSettings, + ) -> Result { let connection_params = ConnectParams::builder() .user(&database_settings.user, Some(&database_settings.password)) .port(database_settings.port) .database(&database_settings.database) .build(Host::Tcp(database_settings.host.clone())); - // fill lists of dataset and unit fields and give them a fixed order for the storage inserts + let negotiator = if database_settings.tls { + Some(OpenSsl::new()?) + } else { + None + }; + let tls_mode = if let Some(ref negotiator) = negotiator { + TlsMode::Prefer(negotiator) + } else { + TlsMode::None + }; + + Ok(Connection::connect(connection_params, tls_mode)?) + } + + fn create_lists_of_dataset_and_unit_fields( + abcd_fields: &AbcdFields, + ) -> (Vec, Vec) { let mut dataset_fields = Vec::new(); let mut unit_fields = Vec::new(); + for field in abcd_fields { if field.global_field { dataset_fields.push(field.name.as_str().into()); @@ -56,25 +85,7 @@ impl<'s> DatabaseSink<'s> { } } - let mut sink = Self { - connection: Connection::connect( - connection_params, - if let Some(negotiator) = &negotiator { - TlsMode::Prefer(negotiator) - } else { - TlsMode::None - }, - )?, - database_settings, - dataset_fields, - datasets_to_ids: HashMap::new(), - next_dataset_id: 1, - unit_fields, - }; - - sink.initialize_temporary_schema(abcd_fields)?; - - Ok(sink) + (dataset_fields, unit_fields) } /// Initialize the temporary storage schema. 
@@ -95,10 +106,10 @@ impl<'s> DatabaseSink<'s> { // create table self.connection.execute( &format!( - "create table {schema}.{table}_translation (name text not null, hash text not null);", - schema = self.database_settings.schema, - table = self.database_settings.temp_dataset_table - ), + "create table {schema}.{table}_translation (name text not null, hash text not null);", + schema = self.database_settings.schema, + table = self.database_settings.temp_dataset_table + ), &[], )?; @@ -119,7 +130,7 @@ impl<'s> DatabaseSink<'s> { fn create_temporary_unit_table(&mut self, abcd_fields: &AbcdFields) -> Result<(), Error> { let mut fields = vec![format!( "{} int not null", - self.database_settings.dataset_id_column + self.database_settings.surrogate_key_column, )]; for field in &self.unit_fields { @@ -163,8 +174,9 @@ impl<'s> DatabaseSink<'s> { let mut fields = vec![ format!( "{} int primary key", - self.database_settings.dataset_id_column - ), // id + self.database_settings.surrogate_key_column, + ), // surrogate key + format!("{} text not null", self.database_settings.dataset_id_column), // id format!( "{} text not null", self.database_settings.dataset_path_column @@ -342,9 +354,9 @@ impl<'s> DatabaseSink<'s> { schema = &self.database_settings.schema, table = &self.database_settings.unit_table, temp_prefix = &self.database_settings.temp_unit_table, - temp_suffix = &self.database_settings.dataset_id_column, + temp_suffix = &self.database_settings.surrogate_key_column, prefix = &self.database_settings.unit_table, - suffix = &self.database_settings.dataset_id_column + suffix = &self.database_settings.surrogate_key_column ), // index format!( @@ -368,29 +380,35 @@ impl<'s> DatabaseSink<'s> { FOREIGN KEY ({dataset_id}) REFERENCES {schema}.{dataset_table}({dataset_id});", schema = &self.database_settings.schema, unit_table = &self.database_settings.temp_unit_table, - dataset_id = &self.database_settings.dataset_id_column, + dataset_id = &self.database_settings.surrogate_key_column, dataset_table = &self.database_settings.temp_dataset_table ); debug!("{}", &foreign_key_statement); self.connection.execute(&foreign_key_statement, &[])?; - let mut hasher = sha1::Sha1::new(); let indexed_unit_column_names = self .database_settings .unit_indexed_columns .iter() - .map(|field| { - hasher.reset(); - hasher.update(field.as_bytes()); - hasher.digest().to_string() - }) + .map(Field::from) + .map(|field| field.hash) .collect::>(); let unit_index_statement = format!( "CREATE INDEX {unit_table}_idx ON {schema}.{unit_table} \ - USING btree ({dataset_id}, \"{other}\");", + USING btree ({surrogate_key_column} {other_begin}{other}{other_end});", schema = &self.database_settings.schema, unit_table = &self.database_settings.temp_unit_table, - dataset_id = &self.database_settings.dataset_id_column, - other = indexed_unit_column_names.join("\", \"") + surrogate_key_column = &self.database_settings.surrogate_key_column, + other_begin = if indexed_unit_column_names.is_empty() { + "" + } else { + ", \"" + }, + other = indexed_unit_column_names.join("\", \""), + other_end = if indexed_unit_column_names.is_empty() { + "" + } else { + "\"" + }, ); debug!("{}", &unit_index_statement); self.connection.execute(&unit_index_statement, &[])?; @@ -422,50 +440,63 @@ impl<'s> DatabaseSink<'s> { /// Create view that provides a listing view pub fn create_listing_view(&self, transaction: &Transaction) -> Result<(), Error> { // TODO: replace full names with settings call - let mut hasher = sha1::Sha1::new(); - 
hasher.update(b"/DataSets/DataSet/Metadata/Description/Representation/Title"); - let dataset_name = hasher.digest().to_string(); - hasher.reset(); + let dataset_title = if let Some(field) = self.dataset_fields.iter().find(|field| { + field.name == "/DataSets/DataSet/Metadata/Description/Representation/Title" + }) { + format!("\"{}\"", field.hash) + } else { + "''".to_string() + }; - hasher.update(b"/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LatitudeDecimal"); - let latitude_column_hash = hasher.digest().to_string(); - hasher.reset(); + let latitude_column = if let Some(field) = self.unit_fields.iter().find(|field| { + field.name == "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LatitudeDecimal" + }) { + format!("\"{}\"", field.hash) + } else { + "NULL".to_string() + }; - hasher.update(b"/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LongitudeDecimal"); - let longitude_column_hash = hasher.digest().to_string(); - hasher.reset(); + let longitude_column = if let Some(field) = self.unit_fields.iter().find(|field| { + field.name == "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LongitudeDecimal" + }) { + format!("\"{}\"", field.hash) + } else { + "NULL".to_string() + }; let view_statement = format!( r#" CREATE VIEW {schema}.{view_name} AS ( - select link, dataset, file, provider, isGeoReferenced as available, isGeoReferenced + select link, dataset, id, provider, isGeoReferenced as available, isGeoReferenced from ( select {dataset_landing_page_column} as link, - "{dataset_name}" as dataset, - {dataset_path_column} as file, + {dataset_title} as dataset, + {dataset_id_column} as id, {dataset_provider_column} as provider, (SELECT EXISTS( select * from {schema}.{unit_table} - where {dataset_table}.{dataset_id_column} = {unit_table}.{dataset_id_column} - and "{latitude_column_hash}" is not null - and "{longitude_column_hash}" is not null + where {dataset_table}.{surrogate_key_column} = {unit_table}.{surrogate_key_column} + and {latitude_column} is not null + and {longitude_column} is not null )) as isGeoReferenced from {schema}.{dataset_table} ) sub);"#, schema = self.database_settings.schema, view_name = self.database_settings.listing_view, - dataset_name = dataset_name, + dataset_title = dataset_title, dataset_landing_page_column = self.database_settings.dataset_landing_page_column, - dataset_path_column = self.database_settings.dataset_path_column, + dataset_id_column = self.database_settings.dataset_id_column, dataset_provider_column = self.database_settings.dataset_provider_column, dataset_table = self.database_settings.dataset_table, unit_table = self.database_settings.unit_table, - dataset_id_column = self.database_settings.dataset_id_column, - latitude_column_hash = latitude_column_hash, - longitude_column_hash = longitude_column_hash, + surrogate_key_column = self.database_settings.surrogate_key_column, + latitude_column = latitude_column, + longitude_column = longitude_column, ); + eprintln!("{}", &view_statement); + transaction.execute(&view_statement, &[])?; Ok(()) @@ -473,32 +504,21 @@ impl<'s> DatabaseSink<'s> { /// Insert a dataset and its units into the temporary tables. 
pub fn insert_dataset(&mut self, abcd_data: &AbcdResult) -> Result<(), Error> { - // retrieve the id for the dataset - // if the dataset is not found, it is necessary to create a dataset storage entry at first - let dataset_unique_string = self.to_combined_string(&abcd_data.dataset); - let dataset_id = match self.datasets_to_ids.entry(dataset_unique_string) { - Entry::Occupied(e) => *e.get(), - Entry::Vacant(o) => { - // retrieve next dataset id - let id = self.next_dataset_id; - + match self.surrogate_key.for_id(&abcd_data.dataset_id) { + SurrogateKeyType::New(surrogate_key) => { Self::insert_dataset_metadata( &self.database_settings, &self.connection, self.dataset_fields.as_slice(), abcd_data, - id, + surrogate_key, )?; - - // store id in map and increase next id variable - o.insert(id); - self.next_dataset_id += 1; - - id + self.insert_units(&abcd_data, surrogate_key)?; } - }; - - self.insert_units(&abcd_data, dataset_id)?; + SurrogateKeyType::Existing(surrogate_key) => { + self.insert_units(&abcd_data, surrogate_key)?; + } + } Ok(()) } @@ -519,12 +539,14 @@ impl<'s> DatabaseSink<'s> { .has_headers(false) .from_writer(vec![]); let mut columns: Vec<&str> = vec![ + database_settings.surrogate_key_column.as_ref(), database_settings.dataset_id_column.as_ref(), database_settings.dataset_path_column.as_ref(), database_settings.dataset_landing_page_column.as_ref(), database_settings.dataset_provider_column.as_ref(), ]; values.write_field(id.to_string())?; + values.write_field(abcd_data.dataset_id.clone())?; values.write_field(abcd_data.dataset_path.clone())?; values.write_field(abcd_data.landing_page.clone())?; values.write_field(abcd_data.provider_name.clone())?; @@ -558,11 +580,9 @@ impl<'s> DatabaseSink<'s> { } /// Insert the dataset units into the temporary schema - fn insert_units(&mut self, abcd_data: &AbcdResult, dataset_id: u32) -> Result<(), Error> { - let mut columns: Vec = vec![self.database_settings.dataset_id_column.clone()]; - columns.extend(self.unit_fields.iter().map(|field| field.name.clone())); - - let dataset_id_string = dataset_id.to_string(); + fn insert_units(&mut self, abcd_data: &AbcdResult, id: u32) -> Result<(), Error> { + let mut columns: Vec = vec![self.database_settings.surrogate_key_column.clone()]; + columns.extend(self.unit_fields.iter().map(|field| field.hash.clone())); let mut values = WriterBuilder::new() .terminator(csv::Terminator::Any(b'\n')) @@ -574,7 +594,7 @@ impl<'s> DatabaseSink<'s> { // append units one by one to tsv for unit_data in &abcd_data.units { - values.write_field(&dataset_id_string)?; // put id first + values.write_field(&id.to_string())?; // put id first for field in &self.unit_fields { if let Some(value) = unit_data.get(&field.name) { @@ -601,19 +621,6 @@ impl<'s> DatabaseSink<'s> { Ok(()) } - - /// Combines all values of the dataset's metadata into a new string. - fn to_combined_string(&self, dataset_data: &ValueMap) -> String { - let mut hash = String::new(); - - for field in &self.dataset_fields { - if let Some(value) = dataset_data.get(&field.name) { - hash.push_str(&value.to_string()); - } - } - - hash - } } /// An error enum for different storage sink errors. 
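The `SurrogateKey` helper used by the rewritten `insert_dataset` above is added later in this patch (src/storage/surrogate_key.rs). In short, it hands out incrementing integer keys per dataset id, so dataset metadata is written only once and any further `AbcdResult` with the same id merely appends units under the existing key. A small sketch of the behaviour, mirroring the unit tests added below and assuming it runs inside the `storage` module:

    use crate::storage::{SurrogateKey, SurrogateKeyType};

    fn surrogate_key_demo() {
        let mut keys = SurrogateKey::new();
        // The first occurrence of a dataset id allocates a fresh surrogate key ...
        assert_eq!(SurrogateKeyType::New(1), keys.for_id("dataset-a"));
        // ... repeated occurrences return the same key, so only units are appended.
        assert_eq!(SurrogateKeyType::Existing(1), keys.for_id("dataset-a"));
        assert_eq!(SurrogateKeyType::New(2), keys.for_id("dataset-b"));
    }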
@@ -626,3 +633,727 @@ pub enum DatabaseSinkError { #[fail(display = "Inconsistent unit columns: {}", 0)] InconsistentUnitColumns(String), } + +#[cfg(test)] +mod tests { + use super::*; + + use crate::settings::{DatabaseSettings, Settings}; + use crate::test_utils; + use postgres::rows::Rows; + use serde_json::json; + use std::collections::HashMap; + + #[test] + fn schema_creation_leads_to_required_tables() { + let database_settings = retrieve_settings_from_file_and_override_schema(); + let abcd_fields = create_abcd_fields_from_json(&json!([])); + + let database_sink = DatabaseSink::new(&database_settings, &abcd_fields).unwrap(); + + let tables = retrieve_ordered_table_names(&database_sink); + + assert_eq!( + tables, + sorted_vec(vec![ + database_settings.temp_dataset_table.clone(), + database_settings.temp_unit_table.clone(), + format!("{}_translation", database_settings.temp_dataset_table) + ]) + ); + } + + #[test] + fn schema_creation_leads_to_required_columns_in_dataset_table() { + let database_settings = retrieve_settings_from_file_and_override_schema(); + let abcd_fields = create_abcd_fields_from_json(&json!([ + { + "name": "/DataSets/DataSet/TechnicalContacts/TechnicalContact/Name", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Metadata/Description/Representation/Title", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Metadata/Description/Representation/URI", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + ])); + + let database_sink = DatabaseSink::new(&database_settings, &abcd_fields).unwrap(); + + let dataset_table_columns = retrieve_ordered_table_column_names( + &database_sink, + &database_settings.temp_dataset_table, + ); + + let dataset_columns = extract_dataset_fields(&abcd_fields) + .iter() + .map(|field| field.hash.clone()) + .chain(vec![ + database_settings.surrogate_key_column.clone(), + "dataset_id".to_string(), + "dataset_landing_page".to_string(), + "dataset_path".to_string(), + "dataset_provider".to_string(), + ]) + .collect::>(); + + assert!(!dataset_columns.is_empty()); + assert_eq!(dataset_table_columns, sorted_vec(dataset_columns)); + } + + #[test] + fn schema_creation_leads_to_required_columns_in_unit_table() { + let database_settings = retrieve_settings_from_file_and_override_schema(); + let abcd_fields = create_abcd_fields_from_json(&json!([ + { + "name": "/DataSets/DataSet/Units/Unit/UnitID", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LongitudeDecimal", + "numeric": true, + "vatMandatory": true, + "gfbioMandatory": true, + "globalField": false, + "unit": "°" + }, + { + "name": "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LatitudeDecimal", + "numeric": true, + "vatMandatory": true, + "gfbioMandatory": true, + "globalField": false, + "unit": "°" + }, + { + "name": "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/SpatialDatum", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + } + ])); + + let database_sink = DatabaseSink::new(&database_settings, &abcd_fields).unwrap(); + + let 
dataset_table_columns = + retrieve_ordered_table_column_names(&database_sink, &database_settings.temp_unit_table); + + let unit_columns = extract_unit_fields(&abcd_fields) + .iter() + .map(|field| field.hash.clone()) + .chain(vec![database_settings.surrogate_key_column.clone()]) + .collect::>(); + + assert!(!unit_columns.is_empty()); + assert_eq!(dataset_table_columns, sorted_vec(unit_columns)); + } + + #[test] + fn translation_table_contains_entries() { + let database_settings = retrieve_settings_from_file_and_override_schema(); + let abcd_fields = create_abcd_fields_from_json(&json!([ + { + "name": "/DataSets/DataSet/TechnicalContacts/TechnicalContact/Name", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Metadata/Description/Representation/Title", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Metadata/Description/Representation/URI", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + ])); + + let database_sink = DatabaseSink::new(&database_settings, &abcd_fields).unwrap(); + + let expected_translation_table_columns = vec![ + "/DataSets/DataSet/TechnicalContacts/TechnicalContact/Name", + "/DataSets/DataSet/Metadata/Description/Representation/Title", + "/DataSets/DataSet/Metadata/Description/Representation/URI", + ]; + + let queried_translation_table_columns = + retrieve_translation_table_keys(&database_settings, &database_sink); + + assert_eq!( + sorted_vec(expected_translation_table_columns), + sorted_vec(queried_translation_table_columns) + ); + } + + #[test] + fn translation_table_entries_match_table_columns() { + let database_settings = retrieve_settings_from_file_and_override_schema(); + let abcd_fields = create_abcd_fields_from_json(&json!([ + { + "name": "/DataSets/DataSet/TechnicalContacts/TechnicalContact/Name", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Metadata/Description/Representation/Title", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Units/Unit/UnitID", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + }, + ])); + + let database_sink = DatabaseSink::new(&database_settings, &abcd_fields).unwrap(); + + let dataset_table_columns = retrieve_ordered_table_column_names( + &database_sink, + &database_settings.temp_dataset_table, + ); + let unit_table_columns = + retrieve_ordered_table_column_names(&database_sink, &database_settings.temp_unit_table); + + let translation_table_values = + retrieve_translation_table_values(&database_settings, &database_sink); + + for column_name in translation_table_values { + assert!( + dataset_table_columns.contains(&column_name) + || unit_table_columns.contains(&column_name) + ); + } + } + + #[test] + fn dataset_table_contains_entry_after_insert() { + let database_settings = retrieve_settings_from_file_and_override_schema(); + let abcd_fields = create_abcd_fields_from_json(&json!([ + { + "name": "DS_TEXT", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "DS_NUM", + "numeric": true, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + 
"unit": "" + }, + { + "name": "UNIT_TEXT", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + }, + { + "name": "UNIT_NUM", + "numeric": true, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + }, + ])); + + let mut database_sink = DatabaseSink::new(&database_settings, &abcd_fields).unwrap(); + + database_sink + .insert_dataset(&AbcdResult { + dataset_id: "TEST_ID".to_string(), + dataset_path: "TEST_PATH".to_string(), + landing_page: "TEST_LANDING_PAGE".to_string(), + provider_name: "TEST_PROVIDER".to_string(), + dataset: { + let mut values = HashMap::new(); + values.insert("DS_TEXT".into(), "FOOBAR".into()); + values.insert("DS_NUM".into(), 42.0.into()); + values + }, + units: vec![ + { + let mut values = HashMap::new(); + values.insert("UNIT_TEXT".into(), "FOO".into()); + values.insert("UNIT_NUM".into(), 13.0.into()); + values + }, + { + let mut values = HashMap::new(); + values.insert("UNIT_TEXT".into(), "BAR".into()); + values.insert("UNIT_NUM".into(), 37.0.into()); + values + }, + ], + }) + .unwrap(); + + assert_eq!( + 1, + number_of_entries(&database_sink, &database_settings.temp_dataset_table) + ); + assert_eq!( + 2, + number_of_entries(&database_sink, &database_settings.temp_unit_table) + ); + + let dataset_result = + retrieve_rows(&mut database_sink, &database_settings.temp_dataset_table); + + let dataset = dataset_result.get(0); + assert_eq!( + "TEST_ID", + dataset.get::<_, String>(database_settings.dataset_id_column.as_str()) + ); + assert_eq!( + "TEST_PATH", + dataset.get::<_, String>(database_settings.dataset_path_column.as_str()) + ); + assert_eq!( + "TEST_LANDING_PAGE", + dataset.get::<_, String>(database_settings.dataset_landing_page_column.as_str()) + ); + assert_eq!( + "TEST_PROVIDER", + dataset.get::<_, String>(database_settings.dataset_provider_column.as_str()) + ); + assert_eq!( + "FOOBAR", + dataset.get::<_, String>(Field::new("DS_TEXT").hash.as_str()) + ); + assert_eq!( + 42.0, + dataset.get::<_, f64>(Field::new("DS_NUM").hash.as_str()) + ); + + let unit_result = retrieve_rows(&mut database_sink, &database_settings.temp_unit_table); + + let unit1 = unit_result.get(0); + assert_eq!( + "FOO", + unit1.get::<_, String>(Field::new("UNIT_TEXT").hash.as_str()) + ); + assert_eq!( + 13.0, + unit1.get::<_, f64>(Field::new("UNIT_NUM").hash.as_str()) + ); + + let unit2 = unit_result.get(1); + assert_eq!( + "BAR", + unit2.get::<_, String>(Field::new("UNIT_TEXT").hash.as_str()) + ); + assert_eq!( + 37.0, + unit2.get::<_, f64>(Field::new("UNIT_NUM").hash.as_str()) + ); + } + + #[test] + fn second_insert_of_same_dataset_does_not_lead_to_second_entry_in_dataset_table() { + let database_settings = retrieve_settings_from_file_and_override_schema(); + let abcd_fields = create_abcd_fields_from_json(&json!([ + { + "name": "DS_TEXT", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "DS_NUM", + "numeric": true, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "UNIT_TEXT", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + }, + { + "name": "UNIT_NUM", + "numeric": true, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + }, + ])); + + let mut database_sink = DatabaseSink::new(&database_settings, &abcd_fields).unwrap(); + + database_sink + .insert_dataset(&AbcdResult { + 
dataset_id: "TEST_ID".to_string(), + dataset_path: "TEST_PATH".to_string(), + landing_page: "TEST_LANDING_PAGE".to_string(), + provider_name: "TEST_PROVIDER".to_string(), + dataset: { + let mut values = HashMap::new(); + values.insert("DS_TEXT".into(), "FOOBAR".into()); + values.insert("DS_NUM".into(), 42.0.into()); + values + }, + units: vec![{ + let mut values = HashMap::new(); + values.insert("UNIT_TEXT".into(), "FOO".into()); + values.insert("UNIT_NUM".into(), 13.0.into()); + values + }], + }) + .unwrap(); + + database_sink + .insert_dataset(&AbcdResult { + dataset_id: "TEST_ID".to_string(), + dataset_path: "TEST_PATH".to_string(), + landing_page: "TEST_LANDING_PAGE".to_string(), + provider_name: "TEST_PROVIDER".to_string(), + dataset: { + let mut values = HashMap::new(); + values.insert("DS_TEXT".into(), "FOOBAR".into()); + values.insert("DS_NUM".into(), 42.0.into()); + values + }, + units: vec![{ + let mut values = HashMap::new(); + values.insert("UNIT_TEXT".into(), "BAR".into()); + values.insert("UNIT_NUM".into(), 37.0.into()); + values + }], + }) + .unwrap(); + + assert_eq!( + 1, + number_of_entries(&database_sink, &database_settings.temp_dataset_table) + ); + assert_eq!( + 2, + number_of_entries(&database_sink, &database_settings.temp_unit_table) + ); + } + + #[test] + fn correct_tables_after_schema_migration() { + let mut database_settings = retrieve_settings_from_file_and_override_schema(); + database_settings.unit_indexed_columns = vec![]; + + let abcd_fields = create_abcd_fields_from_json(&json!([])); + + let mut database_sink = DatabaseSink::new(&database_settings, &abcd_fields).unwrap(); + + database_sink + .insert_dataset(&AbcdResult { + dataset_id: "TEST_ID".to_string(), + dataset_path: "TEST_PATH".to_string(), + landing_page: "TEST_LANDING_PAGE".to_string(), + provider_name: "TEST_PROVIDER".to_string(), + dataset: Default::default(), + units: vec![], + }) + .unwrap(); + + database_sink.migrate_schema().unwrap(); + + let tables = retrieve_ordered_table_names(&database_sink); + + assert_eq!( + tables, + sorted_vec(vec![ + database_settings.dataset_table.clone(), + database_settings.unit_table.clone(), + format!("{}_translation", database_settings.dataset_table), + database_settings.listing_view.clone(), + ]) + ); + } + + #[test] + fn listing_view_contains_entry_after_migration() { + let mut database_settings = retrieve_settings_from_file_and_override_schema(); + database_settings.unit_indexed_columns = vec![]; + + let abcd_fields = create_abcd_fields_from_json(&json!([ + { + "name": "/DataSets/DataSet/Metadata/Description/Representation/Title", + "numeric": false, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": true, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LatitudeDecimal", + "numeric": true, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + }, + { + "name": "/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LongitudeDecimal", + "numeric": true, + "vatMandatory": false, + "gfbioMandatory": true, + "globalField": false, + "unit": "" + }, + ])); + + let mut database_sink = DatabaseSink::new(&database_settings, &abcd_fields).unwrap(); + + database_sink + .insert_dataset(&AbcdResult { + dataset_id: "TEST_ID".to_string(), + dataset_path: "TEST_PATH".to_string(), + landing_page: "TEST_LANDING_PAGE".to_string(), + provider_name: "TEST_PROVIDER".to_string(), + dataset: { + let mut values = 
HashMap::new(); + values.insert("/DataSets/DataSet/Metadata/Description/Representation/Title".into(), "FOOBAR".into()); + values + }, + units: vec![ + { + let mut values = HashMap::new(); + values.insert("/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LatitudeDecimal".into(), 10.0.into()); + values.insert("/DataSets/DataSet/Units/Unit/Gathering/SiteCoordinateSets/SiteCoordinates/CoordinatesLatLong/LongitudeDecimal".into(), 20.0.into()); + values + }, + ], + }) + .unwrap(); + + database_sink.migrate_schema().unwrap(); + + retrieve_ordered_table_column_names(&database_sink, &database_settings.listing_view); + + let rows = database_sink + .connection + .query( + &format!( + r#"SELECT * FROM pg_temp.{LISTING_VIEW}"#, + LISTING_VIEW = database_settings.listing_view + ), + &[], + ) + .unwrap(); + + assert_eq!(rows.len(), 1); + + let row = rows.iter().next().unwrap(); + assert_eq!(row.get::<_, String>("dataset"), "FOOBAR"); + assert_eq!(row.get::<_, String>("id"), "TEST_ID"); + assert_eq!(row.get::<_, String>("link"), "TEST_LANDING_PAGE"); + assert_eq!(row.get::<_, String>("provider"), "TEST_PROVIDER"); + assert!(row.get::<_, bool>("isGeoReferenced")); + } + + fn retrieve_rows(database_sink: &mut DatabaseSink, table_name: &str) -> Rows { + database_sink + .connection + .query( + &format!(r#"SELECT * FROM pg_temp.{TABLE}"#, TABLE = table_name,), + &[], + ) + .unwrap() + } + + fn number_of_entries(database_sink: &DatabaseSink, table_name: &str) -> i32 { + database_sink + .connection + .query( + &format!( + "select count(*)::integer as total from pg_temp.{}", + table_name + ), + &[], + ) + .unwrap() + .get(0) + .get("total") + } + + fn retrieve_translation_table_keys( + database_settings: &DatabaseSettings, + database_sink: &DatabaseSink, + ) -> Vec { + sorted_vec( + database_sink + .connection + .query( + &format!( + "select name from pg_temp.{}_translation;", + database_settings.temp_dataset_table, + ), + &[], + ) + .unwrap() + .iter() + .map(|row| row.get("name")) + .collect::>(), + ) + } + + fn retrieve_translation_table_values( + database_settings: &DatabaseSettings, + database_sink: &DatabaseSink, + ) -> Vec { + sorted_vec( + database_sink + .connection + .query( + &format!( + "select hash from pg_temp.{}_translation;", + database_settings.temp_dataset_table, + ), + &[], + ) + .unwrap() + .iter() + .map(|row| row.get("hash")) + .collect::>(), + ) + } + + fn sorted_vec(mut vec: Vec) -> Vec + where + T: Ord, + { + vec.sort(); + vec + } + + fn retrieve_ordered_table_names(database_sink: &DatabaseSink) -> Vec { + let mut tables = database_sink + .connection + .query( + r#" + SELECT table_name + FROM information_schema.tables + WHERE table_schema = (SELECT nspname FROM pg_namespace WHERE oid = pg_my_temp_schema()) + ; + "#, + &[], + ) + .unwrap() + .iter() + .map(|row| row.get("table_name")) + .collect::>(); + + tables.sort(); + + tables + } + + fn retrieve_ordered_table_column_names( + database_sink: &DatabaseSink, + table_name: &str, + ) -> Vec { + let mut tables = database_sink + .connection + .query( + r#" + SELECT column_name + FROM information_schema.columns + WHERE table_schema = (SELECT nspname FROM pg_namespace WHERE oid = pg_my_temp_schema()) + AND table_name = $1 + ; + "#, + &[&table_name.to_string()], + ) + .unwrap() + .iter() + .map(|row| row.get("column_name")) + .collect::>(); + + tables.sort(); + + tables + } + + fn retrieve_settings_from_file_and_override_schema() -> DatabaseSettings { + let mut settings = 
Settings::new(None).unwrap().database; + settings.schema = "pg_temp".into(); + settings + } + + fn create_abcd_fields_from_json(json: &serde_json::Value) -> AbcdFields { + let fields_file = test_utils::create_temp_file(&json.to_string()); + + AbcdFields::from_path(&fields_file).expect("Unable to create ABCD Fields Spec") + } + + fn extract_dataset_fields(abcd_fields: &AbcdFields) -> Vec { + abcd_fields + .into_iter() + .filter(|field| field.global_field) + .map(|field| field.name.as_ref()) + .map(Field::new) + .collect() + } + + fn extract_unit_fields(abcd_fields: &AbcdFields) -> Vec { + abcd_fields + .into_iter() + .filter(|field| !field.global_field) + .map(|field| field.name.as_ref()) + .map(Field::new) + .collect() + } +} diff --git a/src/storage/field.rs b/src/storage/field.rs index ba83f49..6d70776 100644 --- a/src/storage/field.rs +++ b/src/storage/field.rs @@ -19,3 +19,15 @@ impl From<&str> for Field { Self::new(name) } } + +impl From for Field { + fn from(name: String) -> Self { + Self::new(name.as_str()) + } +} + +impl From<&String> for Field { + fn from(name: &String) -> Self { + Self::new(name.as_str()) + } +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 5f7f40e..7c4d2ab 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1,5 +1,7 @@ mod database_sink; mod field; +mod surrogate_key; pub use self::database_sink::DatabaseSink; pub(self) use self::field::Field; +pub(self) use self::surrogate_key::{SurrogateKey, SurrogateKeyType}; diff --git a/src/storage/surrogate_key.rs b/src/storage/surrogate_key.rs new file mode 100644 index 0000000..f73a0c3 --- /dev/null +++ b/src/storage/surrogate_key.rs @@ -0,0 +1,67 @@ +use std::collections::hash_map::Entry::{Occupied, Vacant}; +use std::collections::HashMap; + +#[derive(Debug, PartialEq)] +pub struct SurrogateKey { + id_to_key: HashMap, + next_key: u32, +} + +#[derive(Debug, PartialEq)] +pub enum SurrogateKeyType { + New(u32), + Existing(u32), +} + +impl SurrogateKey { + pub fn new() -> Self { + Self { + id_to_key: Default::default(), + next_key: 1, + } + } + + pub fn for_id(&mut self, id: &str) -> SurrogateKeyType { + match self.id_to_key.entry(id.into()) { + Occupied(entry) => SurrogateKeyType::Existing(*entry.get()), + Vacant(entry) => { + let key = *entry.insert(self.next_key); + self.next_key += 1; + + SurrogateKeyType::New(key) + } + } + } +} + +impl Default for SurrogateKey { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn new_keys() { + let mut surrogate_key = SurrogateKey::new(); + + for i in 1..=5 { + assert_eq!( + SurrogateKeyType::New(i), + surrogate_key.for_id(&i.to_string()) + ); + } + } + + #[test] + fn existing_key() { + let mut surrogate_key = SurrogateKey::new(); + + assert_eq!(SurrogateKeyType::New(1), surrogate_key.for_id("foo")); + assert_eq!(SurrogateKeyType::Existing(1), surrogate_key.for_id("foo")); + assert_eq!(SurrogateKeyType::New(2), surrogate_key.for_id("bar")); + } +} From cf70d43930cf39b587eba89bdc69e42391ef465a Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Tue, 25 Jun 2019 11:09:21 +0200 Subject: [PATCH 26/31] Travis CI script --- .travis.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..c5a2528 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,21 @@ +language: rust + +rust: + - stable + +services: + - postgresql + +cache: cargo + +before_script: + - psql -c 'create database travis_ci_test;' -U postgres 
+ - echo '[database] + database = "travis_ci_test" + tls = false + user = "postgres" + password = ""' > settings.toml + +script: + - cargo build --verbose --all + - cargo test --verbose --all From 69466b49a72b1b60ef64d4fcfd1ce1f34b35519c Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Tue, 25 Jun 2019 17:38:41 +0200 Subject: [PATCH 27/31] fix postgres version to 11 to ensure that pg_temp works as schema --- .travis.yml | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index c5a2528..13a152d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,15 +6,26 @@ rust: services: - postgresql +addons: + postgresql: "11.2" + cache: cargo +before_install: + - sudo apt-get update + - sudo apt-get --yes remove postgresql\* + - sudo apt-get install -y postgresql-11 postgresql-client-11 + - sudo cp /etc/postgresql/{9.6,11}/main/pg_hba.conf + - sudo service postgresql restart 11 + before_script: - psql -c 'create database travis_ci_test;' -U postgres - - echo '[database] - database = "travis_ci_test" - tls = false - user = "postgres" - password = ""' > settings.toml + - touch settings.toml + - echo '[database]' >> settings.toml + - echo 'database = "travis_ci_test"' >> settings.toml + - echo 'tls = false' >> settings.toml + - echo 'user = "postgres"' >> settings.toml + - echo 'password = ""' >> settings.toml script: - cargo build --verbose --all From d26410619bed792b72030d3e8cdbbc2905b2e8b5 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 26 Jun 2019 11:21:52 +0200 Subject: [PATCH 28/31] update readme to reflect TravisCI status of master --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c985926..1bb7c8b 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Build Status](https://travis-ci.org/gfbio/vat-abcd-crawler.svg?branch=master)](https://travis-ci.org/gfbio/vat-abcd-crawler) + # VAT ABCD Crawler This repository contains the ABCD crawler for the VAT system. 
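The CI job above only writes the database overrides into `settings.toml`; everything else is expected to come from `settings-default.toml`. A minimal sketch of how such a layered load could look with the `config` crate, as a rough illustration only — the concrete loader in `src/settings.rs` is not part of this patch, so the struct below is a trimmed, assumed subset of the real `Settings`:

use config::{Config, ConfigError, File};
use serde::Deserialize;

#[derive(Debug, Deserialize)]
pub struct DatabaseSettings {
    pub database: String,
    pub tls: bool,
    pub user: String,
    pub password: String,
}

#[derive(Debug, Deserialize)]
pub struct Settings {
    pub database: DatabaseSettings,
}

impl Settings {
    pub fn load() -> Result<Self, ConfigError> {
        let mut config = Config::new();
        // Defaults first, then the optional local/CI overrides from settings.toml.
        config.merge(File::with_name("settings-default"))?;
        config.merge(File::with_name("settings").required(false))?;
        config.try_into()
    }
}

With this kind of layering, the Travis-generated file only has to pin `database`, `tls`, `user`, and `password`, which is exactly what the `echo` lines above produce.
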
From 6d1d70ec6607a8e6e544324df67e66cc81d6482e Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Wed, 26 Jun 2019 15:24:47 +0200 Subject: [PATCH 29/31] some fixes --- settings-default.toml | 2 +- src/main.rs | 29 +++++++++++++++++++++++++---- src/pangaea/search_result.rs | 22 ++++++++++++++-------- src/storage/database_sink.rs | 2 -- 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/settings-default.toml b/settings-default.toml index 09309d0..96e6e06 100644 --- a/settings-default.toml +++ b/settings-default.toml @@ -12,7 +12,7 @@ landing_page_field = "/DataSets/DataSet/Metadata/Description/Representation/URI" [pangaea] search_url = "https://ws.pangaea.de/es/dataportal-gfbio/pansimple/_search" -scroll_url = "https://ws.pangaea.de/es/dataportal-gfbio/pansimple/_search/scroll" +scroll_url = "https://ws.pangaea.de/es/_search/scroll" [terminology_service] landingpage_url = "https://terminologies.gfbio.org/tools/landingpages/landingpage.php" diff --git a/src/main.rs b/src/main.rs index 1201f55..533b091 100644 --- a/src/main.rs +++ b/src/main.rs @@ -87,9 +87,22 @@ fn process_datasets( .unwrap_or(std::usize::MAX), ) { - let file_path = temp_dir.path().join(dataset.id()).join(".zip"); + let file_name = dataset + .id() + .chars() + .map(|c| match c { + 'a'...'z' | 'A'...'Z' | '-' => c, + _ => '_', + }) + .collect::(); + let file_path = temp_dir.path().join(file_name).with_extension("zip"); if let Err(e) = FileDownloader::from_url(dataset.download_url()).to_path(&file_path) { - warn!("Unable to download file: {}", e); + warn!( + "Unable to download file {url} to {path}: {error}", + url = dataset.download_url(), + path = file_path.display(), + error = e, + ); continue; } @@ -104,7 +117,15 @@ fn process_datasets( let landing_page_url: String = propose_landing_page(&settings.terminology_service, dataset.download_url()); - for xml_bytes_result in ArchiveReader::from_path(&file_path).unwrap().bytes_iter() { + let mut archive_reader = match ArchiveReader::from_path(&file_path) { + Ok(reader) => reader, + Err(e) => { + warn!("Unable to read dataset archive: {}", e); + continue; + } + }; + + for xml_bytes_result in archive_reader.bytes_iter() { let xml_bytes = match xml_bytes_result { Ok(bytes) => bytes, Err(e) => { @@ -156,7 +177,7 @@ fn initialize_settings() -> Result { .long("settings") .value_name("SETTINGS") .help("Specify the settings file") - .required(true) + .required(false) .takes_value(true), ) .get_matches(); diff --git a/src/pangaea/search_result.rs b/src/pangaea/search_result.rs index b4e3417..222e44f 100644 --- a/src/pangaea/search_result.rs +++ b/src/pangaea/search_result.rs @@ -1,5 +1,6 @@ use crate::settings::PangaeaSettings; use failure::Error; +use log::info; use serde::Deserialize; use serde_json::json; use std::collections::HashMap; @@ -90,13 +91,20 @@ impl PangaeaSearchResult { let mut entries = Vec::new(); let mut result = Self::from_url(&pangaea_settings.search_url)?; + let mut number_of_results = result.hits.hits.len(); - while result.hits.total > 0 { + while number_of_results > 0 { + info!( + "Retrieved {} items from pangaea (continuing - {} total).", + number_of_results, result.hits.total, + ); entries.append(&mut result.hits.hits); result = Self::from_scroll_url(&pangaea_settings.scroll_url, &result.scroll_id)?; + number_of_results = result.hits.hits.len(); } + info!("Retrieved {} items from pangaea.", number_of_results); entries.append(&mut result.hits.hits); Ok(entries) @@ -292,8 +300,8 @@ mod tests { "_scroll_id": SCROLL_ID_2, "took": 1373, "hits": { - 
"total": SEARCH_RESULT_HITS, // <-- CONTINUE - "hits": [ + "total": SEARCH_RESULT_HITS, + "hits": [ // <-- CONTINUE SEARCH_RESULT_ENTRY_JSON(), SEARCH_RESULT_ENTRY_JSON_2(), ], @@ -313,10 +321,8 @@ mod tests { "_scroll_id": SCROLL_ID_2, "took": 1373, "hits": { - "total": 0, // <-- NO CONTINUE - "hits": [ - SEARCH_RESULT_ENTRY_JSON(), - ], + "total": SEARCH_RESULT_HITS, + "hits": [], // <-- NO CONTINUE }, }) .to_string(), @@ -330,7 +336,7 @@ mod tests { }) .unwrap(); - assert_eq!(5, entries.len()); + assert_eq!(4, entries.len()); let entry = &entries[0]; assert_eq!(RESULT_ID, entry.id()); diff --git a/src/storage/database_sink.rs b/src/storage/database_sink.rs index 533324f..05337e9 100644 --- a/src/storage/database_sink.rs +++ b/src/storage/database_sink.rs @@ -495,8 +495,6 @@ impl<'s> DatabaseSink<'s> { longitude_column = longitude_column, ); - eprintln!("{}", &view_statement); - transaction.execute(&view_statement, &[])?; Ok(()) From 42c7943165363f332605cbf2e0590efefce4e8d5 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Sat, 24 Aug 2019 09:07:10 +0200 Subject: [PATCH 30/31] changed openssl package for postgres --- Cargo.toml | 3 ++- src/storage/database_sink.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 33815ce..501719b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,8 @@ config = { version = "0.9", features = ["toml"] } failure = "0.1" failure_derive = "0.1" log = "0.4" -postgres = { version = "0.15", features = ['with-openssl'] } +postgres = "0.15" +postgres-openssl = "0.1" quick-xml = "0.13" reqwest = "0.9" serde = { version = "1.0", features = ["derive"] } diff --git a/src/storage/database_sink.rs b/src/storage/database_sink.rs index 05337e9..62f6f08 100644 --- a/src/storage/database_sink.rs +++ b/src/storage/database_sink.rs @@ -3,9 +3,9 @@ use failure::{Error, Fail}; use log::debug; use postgres::params::ConnectParams; use postgres::params::Host; -use postgres::tls::openssl::OpenSsl; use postgres::transaction::Transaction; use postgres::{Connection, TlsMode}; +use postgres_openssl::OpenSsl; use crate::abcd::{AbcdFields, AbcdResult}; use crate::settings; From 146ad1890774dbf5660ea17935d6118ac9ccbcae Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Sat, 24 Aug 2019 09:09:20 +0200 Subject: [PATCH 31/31] added recovery mechanism for eventually missing archive files - store successfully imported archives in storage directory - recover archive from storage directory if download does not work - `file_downloader` emits error on non-successful status code --- settings-default.toml | 1 + src/abcd/abcd_parser.rs | 1 + src/abcd/archive_reader.rs | 5 ++++ src/file_downloader.rs | 7 +++++ src/main.rs | 60 ++++++++++++++++++++++++++++++++------ src/settings.rs | 1 + 6 files changed, 66 insertions(+), 9 deletions(-) diff --git a/settings-default.toml b/settings-default.toml index 96e6e06..b135c6a 100644 --- a/settings-default.toml +++ b/settings-default.toml @@ -9,6 +9,7 @@ dataset_limit = 3 [abcd] fields_file = "abcd-fields.json" landing_page_field = "/DataSets/DataSet/Metadata/Description/Representation/URI" +storage_dir = "raw_data" [pangaea] search_url = "https://ws.pangaea.de/es/dataportal-gfbio/pansimple/_search" diff --git a/src/abcd/abcd_parser.rs b/src/abcd/abcd_parser.rs index d7450c2..e618dcd 100644 --- a/src/abcd/abcd_parser.rs +++ b/src/abcd/abcd_parser.rs @@ -235,6 +235,7 @@ mod tests { let abcd_settings = AbcdSettings { fields_file: "".into(), landing_page_field: 
"/DataSets/DataSet/Metadata/Description/Representation/URI".into(), + storage_dir: "raw_data".into(), }; let test_file = create_file_as_bytes(); diff --git a/src/abcd/archive_reader.rs b/src/abcd/archive_reader.rs index 14c9558..8317b88 100644 --- a/src/abcd/archive_reader.rs +++ b/src/abcd/archive_reader.rs @@ -28,6 +28,11 @@ impl ArchiveReader { archive: &mut self.archive, } } + + /// Output the number of files in the archive. + pub fn len(&self) -> usize { + self.archive.len() + } } /// This iterator traverses over all files (bytes) in the ZIP archive. diff --git a/src/file_downloader.rs b/src/file_downloader.rs index 89f411b..2bf5990 100644 --- a/src/file_downloader.rs +++ b/src/file_downloader.rs @@ -15,6 +15,13 @@ impl FileDownloader { pub fn to_path(&self, path: &Path) -> Result<(), Error> { let mut response = reqwest::get(&self.url)?; + if !response.status().is_success() { + return Err(failure::err_msg(format!( + "Webserver responded with code: {}", + response.status(), + ))); + } + let output_file = File::create(&path)?; let mut writer = BufWriter::new(&output_file); diff --git a/src/main.rs b/src/main.rs index 533b091..db134d1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -67,6 +67,9 @@ fn process_datasets( datasets: &[PangaeaSearchResultEntry], ) -> Result<(), Error> { let temp_dir = tempfile::tempdir()?; + let storage_dir = Path::new(&settings.abcd.storage_dir); + + create_or_check_for_directory(&storage_dir); let mut abcd_parser = AbcdParser::new(&settings.abcd, &abcd_fields); @@ -91,22 +94,37 @@ fn process_datasets( .id() .chars() .map(|c| match c { - 'a'...'z' | 'A'...'Z' | '-' => c, + 'a'..='z' | 'A'..='Z' | '-' => c, _ => '_', }) .collect::(); - let file_path = temp_dir.path().join(file_name).with_extension("zip"); - if let Err(e) = FileDownloader::from_url(dataset.download_url()).to_path(&file_path) { + let temp_file_path = temp_dir.path().join(&file_name).with_extension("zip"); + let storage_file_path = storage_dir.join(&file_name).with_extension("zip"); + + if let Err(e) = FileDownloader::from_url(dataset.download_url()).to_path(&temp_file_path) { warn!( "Unable to download file {url} to {path}: {error}", url = dataset.download_url(), - path = file_path.display(), + path = temp_file_path.display(), error = e, ); - continue; + + let recovery_file_path = storage_file_path.as_path(); + match std::fs::copy(recovery_file_path, &temp_file_path) { + Ok(_) => info!("Recovered file {file}", file = file_name), + Err(e) => { + warn!( + "Recovery of file {file} failed: {error}", + file = file_name, + error = e, + ); + + continue; // skip processing this dataset + } + }; } - trace!("Temp file: {}", file_path.display()); + trace!("Temp file: {}", temp_file_path.display()); info!( "Processing `{}` @ `{}` ({})", dataset.id(), @@ -117,7 +135,7 @@ fn process_datasets( let landing_page_url: String = propose_landing_page(&settings.terminology_service, dataset.download_url()); - let mut archive_reader = match ArchiveReader::from_path(&file_path) { + let mut archive_reader = match ArchiveReader::from_path(&temp_file_path) { Ok(reader) => reader, Err(e) => { warn!("Unable to read dataset archive: {}", e); @@ -125,11 +143,14 @@ fn process_datasets( } }; + let mut all_inserts_successful = true; + for xml_bytes_result in archive_reader.bytes_iter() { let xml_bytes = match xml_bytes_result { Ok(bytes) => bytes, Err(e) => { warn!("Unable to read file from zip archive: {}", e); + all_inserts_successful = false; continue; } }; @@ -144,6 +165,7 @@ fn process_datasets( Ok(data) => data, Err(e) => { 
warn!("Unable to retrieve ABCD data: {}", e); + all_inserts_successful = false; continue; } }; @@ -152,9 +174,18 @@ fn process_datasets( match database_sink.insert_dataset(&abcd_data) { Ok(_) => (), - Err(e) => warn!("Unable to insert dataset into storage: {}", e), + Err(e) => { + warn!("Unable to insert dataset into storage: {}", e); + all_inserts_successful = false; + } }; } + + if all_inserts_successful && archive_reader.len() > 0 { + if let Err(e) = std::fs::copy(&temp_file_path, storage_file_path) { + warn!("Unable to store ABCD file: {}", e); + } + } } match database_sink.migrate_schema() { @@ -165,6 +196,17 @@ fn process_datasets( Ok(()) } +fn create_or_check_for_directory(storage_dir: &&Path) { + if storage_dir.exists() { + assert!( + storage_dir.is_dir(), + "ABCD storage directory path is not a directory", + ); + } else { + std::fs::create_dir(&storage_dir).expect("ABCD storage directory is not creatable"); + } +} + fn initialize_settings() -> Result { let matches = App::new("VAT ABCD Crawler") .version(crate_version!()) @@ -189,7 +231,7 @@ fn initialize_settings() -> Result { /// Initialize the logger. fn initialize_logger(file_path: &Path, settings: &Settings) -> Result<(), Error> { - let mut loggers: Vec> = Vec::new(); + let mut loggers: Vec> = Vec::new(); let log_level = if settings.general.debug { simplelog::LevelFilter::Debug diff --git a/src/settings.rs b/src/settings.rs index ff09a14..0012c85 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -15,6 +15,7 @@ pub struct GeneralSettings { pub struct AbcdSettings { pub fields_file: String, pub landing_page_field: String, + pub storage_dir: String, } #[derive(Debug, Deserialize)]