riparias · PietrH · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023 · Oct 27, 2023
diff --git a/src/dwc_mapping.Rmd b/src/dwc_mapping.Rmd
@@ -2,8 +2,9 @@
 title: "Darwin Core mapping"
 subtitle: "For: The (...) occurrences of "
 author:
-- author_1
-- author_2
+- Pieter Huybrechts
+- Damiano Oldoni
+- Lien Reyserhove
 date: "`r Sys.Date()`"
 output:
   html_document:
@@ -32,7 +33,7 @@ library(dplyr)          # To do data wrangling
 library(tidyr)          # To create tidy data
 library(readr)          # To read data
 library(stringr)        # To work with strings (chars)
-library(purrr)          # TO work with functions and vectors
+library(purrr)          # To work with functions and vectors
 library(here)           # To find files
 library(tidylog)        # To provide feedback on dplyr functions
 library(magrittr)       # To use %<>% pipe
@@ -53,7 +54,7 @@ raw_data <- readr::read_csv(
 ) 
 ```
 
-Rename `raw_data` to `input_data` so we can keep original values
+Rename `raw_data` to `input_data` so we can keep original values, `input_data` will be overwritten at several steps during the mapping.
 
 ```{r}
 input_data <- raw_data
@@ -68,7 +69,7 @@ Clean data somewhat:
 ```{r}
 input_data <- 
   input_data %>% 
-    remove_empty("rows") %>% 
+    remove_empty(which = "rows", quiet = FALSE) %>% 
     clean_names()
 ```
 
@@ -78,6 +79,7 @@ Check whether `object_id` is unique (value should be `0`)
 
 ```{r}
 anyDuplicated(input_data$objectid)
+stopifnot(!anyDuplicated(input_data$objectid))
 ```
 
 Remove all rows for `Domein` = `Werken`
@@ -100,7 +102,7 @@ input_data <-
 
 The field `waarnemingen` is used to map `organismQuantity`, `organismQuantityType` and `occurrenceStatus`. This fields need to be cleaned before mapping.
 
-```{r}
+```{r list waarneming values}
 input_data %>% 
   group_by(waarneming) %>% 
   summarise(records = n()) %>% 
@@ -109,14 +111,14 @@ input_data %>%
 
 First, we need to clean `Haard vastgesteld = 0; Waarneming onzeker = 1;`. This should be `Waarneming onzeker = 1;` (error in database):
 
-```{r}
+```{r recode some known errors}
 input_data %<>% mutate(waarneming = recode(waarneming,
   "Haard vastgesteld = 0; Waarneming onzeker = 1;" = "Waarneming onzeker = 1;"))
 ```
 
 Remove occurrences containing multiple type - value pairs information in column `waarneming` (patch until [#23](https://github.com/riparias/rato-occurrences/issues/23) is solved):
 
-```{r}
+```{r remove records with multiple type value pairs for waarneming}
 input_data %<>%
   filter(is.na(.data$waarneming) | 
            !str_detect(.data$waarneming, pattern = "; "))
@@ -143,7 +145,7 @@ input_data <-
 
 Map `organism_quantity`:
 
-```{r}
+```{r map organism_quantity}
 input_data %<>% 
   mutate(organism_quantity = case_when(
     waarneming_type == "Vastgesteld (in m²)" |
@@ -155,7 +157,7 @@ input_data %<>%
 
 Map `organism_quantity_type`:
 
-```{r}
+```{r map organism_quantity_type}
 input_data <-
   input_data %>% 
     mutate(organism_quantity_type = case_when(
@@ -168,7 +170,7 @@ input_data <-
 
 Map `occurrence_status`:
 
-```{r}
+```{r map occurrence_status}
 input_data <-
   input_data %>%
     mutate(occurrence_status = case_when(
@@ -192,7 +194,7 @@ input_data <-
 
 Screen mapping:
 
-```{r}
+```{r display mapping for screening}
 input_data %>% 
   group_by(waarneming, organism_quantity, organism_quantity_type, occurrence_status) %>% 
   summarize(records = n()) 
@@ -203,7 +205,7 @@ Some records have information regarding occurrenceStatus in the "opmerkingen" fi
 
 When `Dossier_Status` is `Opvolging`, old field values get copied. So even if "Waarneming" or "Actie" lead to a presence record, the Opmerkingen could still indicate a absence record. 
 
-```{r}
+```{r parse opmerkingen and map to occurrence_status}
 input_data <-
   input_data %>%
   mutate(
@@ -227,7 +229,7 @@ input_data <-
 
 Information from `materiaal_vast` can be used for `samplingProtocol`
 
-```{r}
+```{r display values for materiaal_vast}
 input_data %>% 
   group_by(materiaal_vast) %>% 
   summarise(records = n()) %>% 
@@ -236,7 +238,7 @@ input_data %>%
 
 First split on ";" and separate on "="
 
-```{r}
+```{r split and seperate materiaal_vast}
 sampling_protocol <-
   input_data %>% 
     select(occurrence_id, materiaal_vast) %>% 
@@ -298,7 +300,8 @@ sampling_protocol %>%
 Translate to English (generate `protocol`):
 
 ```{r}
-sampling_protocol %<>% 
+sampling_protocol <-
+  sampling_protocol %>% 
   mutate(protocol = recode(materiaal,
     "Andere" = "other",
     "Conibearklem" = "conibear trap",
@@ -316,7 +319,8 @@ sampling_protocol %<>%
 Add `effort` to the dataset. This is the sampling effort and is based on the values in `kwantiteit`:
 
 ```{r}
-sampling_protocol %<>% 
+sampling_protocol <-
+  sampling_protocol %>% 
   mutate(effort = case_when(
     kwantiteit == "0" ~ "",
     TRUE ~ paste(kwantiteit, protocol, sep = " ")))
@@ -332,7 +336,7 @@ sampling_protocol <-
               sampling_effort = paste(effort, collapse = " | "))
 ```
 
-Now, the field `sampling_effort` contains some unwanted, repeated hashes. With this code, we remove them:
+Now, the field `sampling_effort` contains some unwanted, repeated pipes `|`. With this code, we remove them:
 
 ```{r}
 sampling_protocol <- 
@@ -360,12 +364,67 @@ sampling_protocol <-
 
 Merge `sampling_protocol` with `input_data`:
 
-```{r}
+```{r merge sampling_protocol df with input_data df}
 input_data %<>% left_join(
   y = sampling_protocol,
   by = "occurrence_id")
 ```
 
+## Split off the first intervention per event, and duplicate it to a strict observation
+
+Terminology:
+
+- Interventions: have a `samplingProtocol`, have `Dossier_status == "Opvolging"`
+- Strict observations: no sampling took place, these have no `samplingProtocol`
+
+All observations should have `samplingProtocol` set to `casual observation`, this 
+includes both strict observations and those created from interventions.
+
+```{r Extract the first interventions}
+first_interventions <- 
+  input_data %>% 
+  filter(dossier_status == "Opvolging",
+         !is.na(sampling_protocol)) %>% 
+  group_by(event_id) %>% 
+  slice_min(order_by = laatst_bewerkt_datum, n = 1)
+
+```
+
+We are keeping all original fields from the intervention in the new observation,
+except occurrence_id. If we need to drop fields in the future, this is where to 
+do it.
+
+```{r create new observations from first interventions}
+observations_from_interventions <- 
+  first_interventions %>% 
+  mutate(occurrence_id = paste0(occurrence_id, "-cas"),
+         sampling_protocol = "casual observation"
+         ) %>% 
+  ungroup() # remove grouping as to avoid it creating trouble later on
+```
+
+```{r add new occurrences to `input_data`}
+input_data <- 
+  input_data %>% 
+  # we will be merging the old occurrenceID which is a double with a character,
+  # so we need to convert types
+  mutate(occurrence_id = as.character(occurrence_id)) %>% 
+  bind_rows(observations_from_interventions)
+```
+## Set samplingProtocol to `casual observation` for casual observations
+
+An observation doesn't have a `samplingProtocol.` So all empty values for `samplingProtocol` should be casual observations, except when `dossier_status == "Verwerkt en afgesloten"` which _can_ mean that it's an absence. 
+
+```{r set most NA `sampling_protocol` to "casual observation"}
+input_data <- input_data %>% 
+  mutate(sampling_protocol = case_when(
+    dossier_status != "Verwerkt en afgesloten" & is.na(sampling_protocol) ~
+      "casual observation",
+    .default = sampling_protocol
+  ))
+```
+
+
 ## Transform Lambert coordinates to WGS84
 
 Coordinates in `x` and `y` are given in the Belgian Lambert system, they should be transformed to World Geodetic System 84 coordinate system

diff --git a/tests/test_dwc_occurrence.R b/tests/test_dwc_occurrence.R
@@ -197,3 +197,11 @@ testthat::test_that("taxonRank is always filled in and one of the list", {
     all(dwc_occurrence$taxonRank %in% taxon_ranks)
   )
 })
+
+testthat::test_that("There should be at least one casual observation", {
+  testthat::expect_gte(
+    nrow(filter(dwc_occurrence, samplingProtocol == "casual observation")),
+    1
+  )
+})
+