Validation and train data

Code
# Attach packages. NOTE(review): attach order matters for masking — tidyverse
# is loaded first so that later spatial packages (sf, terra) take precedence
# where names collide; confirm this ordering is intentional before reordering.
library(tidyverse)
library(readr)
library(gt)        # display tables
library(gtExtras)  # gt theming helpers
library(leaflet)   # interactive maps
library(sf)        # vector spatial data
library(terra)     # raster spatial data

# Project-local helpers; `here_rel()` is assumed to resolve paths relative to
# the project root (defined elsewhere in the project — not visible here).
source(here_rel("R", "funs_data.R"))
source(here_rel("R", "funs_graphics.R"))
source(here_rel("R", "funs_plot_tables.R"))

# Apply the project's shared ggplot2 theme to all subsequent plots
theme_set(theme_public())

Stratified splitting into train / validation datasets

# SampleID <- (Y_raw |>
#     group_by(Site) %>% # Stratified sampling : 90% from Site strata
#     sample_frac(0.9) |> # Sample_frac(0.9) selects round(n() * 0.9) inventory units within each site. As a consequence, sites with ≤4 inventory units contribute all units to the training set (because round(n*0.9) = n), whereas sites with >4 units contribute ~90% of their units to training.
#     ungroup())$PlotID

# Load the raw inventory data (one row per inventory unit / plot)
Y_raw <- readRDS(here_rel("data", "raw_data", "Y_raw.rds"))

# Plots pre-flagged as "train" define the training membership
SampleID <- Y_raw |>
    filter(type == "train") |>
    pull(PlotID)

# Training set: all plots whose ID is in the training membership
Y_train <- Y_raw |>
    filter(PlotID %in% SampleID)

# External validation set: every remaining plot in each site
Y_val <- Y_raw |>
    filter(!PlotID %in% Y_train$PlotID)

We split our data into a training set including all sampling sites with 4 or fewer inventory units and a stratified random sample of 90% of inventory units from each sampling site with more than 4 inventory units. The remaining part of the dataset was used for external validation.

Code
# Share (%) of inventory units per dataset (train/validation), by network (DB)
Shp_plots |>
    st_drop_geometry() |>
    count(type, DB) |>                       # n units per (type, DB) cell
    group_by(DB) |>
    mutate(perc = 100 * n / sum(n)) |>       # percentage within each network
    select(-n) |>
    pivot_wider(names_from = DB, values_from = perc, values_fill = 0) |>
    gt() |>
    fmt_number(columns = c("GuyaDiv", "GuyaFor", "GuyENTRY"), decimals = 1) |>
    opts_theme()
Distribution of inventory units across inventory networks in training and validation datasets.
type GuyENTRY GuyaDiv GuyaFor
train 93.5 90.2 88.3
validation 6.5 9.8 11.7

We check for bias in the train/validation split by comparing the distributions of variables across inventory units in the training and validation datasets:

Code
# Build (once) and display the train/validation predictor-distribution figure.
# The figure is cached on disk; it is only regenerated when the PNG is absent.
if (!file.exists(here_rel("notebook", "figs", "fig-trainval-split.png"))) {
    # Density of one predictor, coloured by train/validation membership.
    # `xvar` is embraced so callers can pass bare columns or expressions.
    plot_density <- function(xvar, xlab) {
        Shp_plots |>
            ggplot(aes(x = {{ xvar }}, color = type)) +
            geom_density() +
            labs(x = xlab, y = "Density", color = "Dataset") +
            theme(legend.position = "bottom")
    }

    P1 <- plot_density(Clim1, "Clim1")
    P2 <- plot_density(Clim2, "Clim2")
    P3 <- plot_density(log(SWI + 1), expression(log(SWI+1)))

    # Two panels on top, one below; single shared legend at the bottom
    g <- patchwork::wrap_plots(P1 | P2, P3, ncol = 1) +
        patchwork::plot_annotation(title = "Distribution of environmental predictors in training and validation datasets") &
        theme(legend.position = "bottom")

    ggsave(
        plot = g,
        filename = here_rel("notebook", "figs", "fig-trainval-split.png"),
        bg = "white", width = 10, height = 5
    )
}

knitr::include_graphics(here_rel("notebook", "figs", "fig-trainval-split.png"))
Figure 1: Distribution of environmental predictors in training and validation datasets.