Validation and train data

Code
# Attach packages. NOTE(review): attach order matters for masking — tidyverse
# is loaded first so that later spatial packages (sf, terra) take precedence
# where names collide; confirm this ordering is intentional before reordering.
library(tidyverse)
library(readr)
library(gt)        # display tables
library(gtExtras)  # gt theming helpers
library(leaflet)   # interactive maps
library(sf)        # vector spatial data
library(terra)     # raster spatial data

# Project-local helpers; `here_rel()` is assumed to resolve paths relative to
# the project root (defined elsewhere in the project — not visible here).
source(here_rel("R", "funs_data.R"))
source(here_rel("R", "funs_graphics.R"))
source(here_rel("R", "funs_plot_tables.R"))

# Apply the project's shared ggplot2 theme to all subsequent plots
theme_set(theme_public())

Stratified splitting into train / validation datasets

# SampleID <- (Y_raw |>
#     group_by(Site) %>% # Stratified sampling : 90% from Site strata
#     sample_frac(0.9) |> # Sample_frac(0.9) selects round(n() * 0.9) inventory units within each site. As a consequence, sites with ≤4 inventory units contribute all units to the training set (because round(n*0.9) = n), whereas sites with >4 units contribute ~90% of their units to training.
#     ungroup())$PlotID

# Load the raw inventory data (one row per inventory unit / plot)
Y_raw <- readRDS(here_rel("data", "raw_data", "Y_raw.rds"))

# Plots pre-flagged as "train" define the training membership
SampleID <- Y_raw |>
    filter(type == "train") |>
    pull(PlotID)

# Training set: all plots whose ID is in the training membership
Y_train <- Y_raw |>
    filter(PlotID %in% SampleID)

# External validation set: every remaining plot in each site
Y_val <- Y_raw |>
    filter(!PlotID %in% Y_train$PlotID)

We split our data into a training set including all sampling sites with 4 or fewer inventory units and a stratified random sample of 90% of inventory units from each sampling site with more than 4 inventory units. The remaining part of the dataset was used for external validation.

Code
# Share (%) of inventory units per dataset (train/validation), by network (DB)
Shp_plots |>
    st_drop_geometry() |>
    count(type, DB) |>                       # n units per (type, DB) cell
    group_by(DB) |>
    mutate(perc = 100 * n / sum(n)) |>       # percentage within each network
    select(-n) |>
    pivot_wider(names_from = DB, values_from = perc, values_fill = 0) |>
    gt() |>
    fmt_number(columns = c("GuyaDiv", "GuyaFor", "GuyENTRY"), decimals = 1) |>
    opts_theme()
Distribution of inventory units across inventory networks in training and validation datasets.
type GuyENTRY GuyaDiv GuyaFor
train 93.5 90.2 88.3
validation 6.5 9.8 11.7

We check for bias in the train/validation split by comparing the distributions of variables across inventory units in the training and validation datasets:

Code
# Build (once) and display the train/validation predictor-distribution figure.
# The figure is cached on disk; it is only regenerated when the PNG is absent.
if (!file.exists(here_rel("notebook", "figs", "fig-trainval-split.png"))) {
    # Density of one predictor, coloured by train/validation membership.
    # `xvar` is embraced so callers can pass bare columns or expressions.
    plot_density <- function(xvar, xlab) {
        Shp_plots |>
            ggplot(aes(x = {{ xvar }}, color = type)) +
            geom_density() +
            labs(x = xlab, y = "Density", color = "Dataset") +
            theme(legend.position = "bottom")
    }

    P1 <- plot_density(Clim1, "Clim1")
    P2 <- plot_density(Clim2, "Clim2")
    P3 <- plot_density(log(SWI + 1), expression(log(SWI+1)))

    # Two panels on top, one below; single shared legend at the bottom
    g <- patchwork::wrap_plots(P1 | P2, P3, ncol = 1) +
        patchwork::plot_annotation(title = "Distribution of environmental predictors in training and validation datasets") &
        theme(legend.position = "bottom")

    ggsave(
        plot = g,
        filename = here_rel("notebook", "figs", "fig-trainval-split.png"),
        bg = "white", width = 10, height = 5
    )
}

knitr::include_graphics(here_rel("notebook", "figs", "fig-trainval-split.png"))
Figure 1: Distribution of environmental predictors in training and validation datasets.