Code
path <- "results"

Model definitions

Notation

Let \(\left(i \in \{1,\dots,I\}\right)\) index inventory units and \(\left(j \in \{1,\dots,S\}\right)\) index modeled taxa (the \(\left(S=569\right)\) focal species plus one additional “other species” class used as the reference category). Let \(\left(y_{i,j}\right)\) be the observed stem count of taxon \(\left(j\right)\) in inventory unit \(\left(i\right)\), and \(\left(N_i=\sum_{j=1}^{S} y_{i,j}\right)\) the total number of stems in unit \(\left(i\right)\).

Model \(H_0\)

Null model is defined as identical probability of occurrence for all species across inventory units, with no effect of environmental predictors and sampling protocol on community composition.

\[ \begin{aligned}&\ \textbf{Dirichlet-Multinomial probability of specie}_j \textbf{ in inventory unit}_i \\(y_{i,1} \dots y_{i,S})^{T} \sim&\ \operatorname{Dirichlet-Multinomial}(N_{i},\overrightarrow{\pi_{i}} \times \kappa) \\[10pt]&\ \textbf{Model for probability of each specie} \\ (\pi_{i,1} \dots \pi_{i,S})^{T} = &\ softmax(\overrightarrow{\theta_{i}}) \\[5pt] &\ \text{For } j \in [1, J]: \\ \theta_{i,j} =&\ 1 \qquad\qquad\qquad\qquad\qquad\qquad \text{(Mean regional specie share)}\end{aligned} \]

Stan code

Code
functions {
  /* dirichlet-multinomial-logit log-PMF
   * Args:
   *   y: array of integer response values
   *   mu: vector of category logit probabilities
   *   phi: precision parameter (sum of Dirichlet alphas)
   * Returns:
   *   a scalar to be added to the log posterior
  */
  real dirichlet_multinomial_logit2_lpmf(array[] int y, vector mu, real phi) {
    // get Dirichlet alphas
    int N = num_elements(mu);
    vector[N] alpha = phi * softmax(mu);

    // get trials from y
    real T = sum(y);

    return lgamma(phi) + lgamma(T + 1.0) - lgamma(T + phi) +
      sum(lgamma(to_vector(y) + alpha)) - sum(lgamma(alpha)) - sum(lgamma(to_vector(y) + 1));
  }
  /* integer sequence of values
   * Args:
   *   start: starting integer
   *   end: ending integer
   * Returns:
   *   an integer sequence from start to end
   */
  array[] int sequence(int start, int end) {
    array[end - start + 1] int seq;
    for (n in 1:num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq;
  }
  // compute partial sums of the log-likelihood
  real partial_log_lik_lpmf(array[] int seq, int start, int end, 
  data int ncat, data array[,] int Y, data array[] int trials, real phi
  ) {
    real ptarget = 0;
    int N = end - start + 1;
    array[N] vector[ncat] wi;

    for (n in 1:N) {
      int nn = n + start - 1;
      wi[n] = rep_vector(0.0, ncat);
      ptarget += dirichlet_multinomial_logit2_lpmf(Y[nn]| wi[n], phi);
    }
    return ptarget;
  }

}
data {
  int<lower=1> N;  // total number of observations
  int<lower=2> ncat;  // number of categories
  array[N, ncat] int Y;  // response array
  array[N] int trials;  // number of trials
  int grainsize;  // grainsize for threading
  int prior_only;  // should the likelihood be ignored?
}
transformed data {
  array[N] int seq = sequence(1, N);
}
parameters {
  real<lower=0> phi; // precision parameter
}
transformed parameters {
  real lprior = 0;  // prior contributions to the log posterior
  lprior += lognormal_lpdf(phi | 0, 1)      
  - 4 * lognormal_lcdf(2000 | 0,1);
}
model {
  // likelihood including constants
  if (!prior_only) {
    target += reduce_sum(partial_log_lik_lpmf, seq, 
    grainsize, ncat, Y, trials, phi
    );
  }
  // priors including constants
  target += lprior;
  
}

R code

fit_empty_H0 <- brm(
    formula = bf(
        y | trials(size) ~ 0
    ),
    data = Y_train, cores = 1, chain = 4, threads = 25,
    family = dirichlet_multinomial(
        refcat = "Zzzinf30ind",
        link_phi = "identity"
    ),
    backend = "cmdstanr",
    seed = 1234,
    empty = TRUE
)

fitH0 <- estimate_fit0_brms(fit_empty_H0)
Code
if (!file.exists(
    here_rel("results", "models", "H0", "fitH0.rds")
)) {
    if (!file.exists(here_rel("data", "raw_data", "Y_raw.rds"))) {
        stop(paste0("missing ", here_rel("data", "raw_data", "Y_raw.rds")))
    } else {
        path_fitH0 <- here_rel("results", "models", "H0")
        Y_raw <- readRDS(here_rel("data", "raw_data", "Y_raw.rds"))
        fit_empty_H0 <- brms::brm(
            formula = brms::bf(
                y | trials(size) ~ 0
            ),
            data = Y_raw |> filter(type == "train"),
            cores = 1, chain = 4, threads = 25,
            family = brms::dirichlet_multinomial(
                refcat = "Zzzinf30ind",
                link_phi = "identity"
            ),
            backend = "cmdstanr",
            seed = 1234,
            empty = TRUE
        )
        fit_empty_H0$fit <- brms::read_csv_as_stanfit(paste0(
            path_fitH0, "/",
            list.files(path_fitH0, "*.csv")
        ))
        fitH0 <- fit_empty_H0 |> brms::rename_pars()
        fitH0$data$y <- NULL # Remove raw data
        saveRDS(fitH0, file = here_rel("results", "models", "H0", "fitH0.rds"))
    }
}

Model \(H_1\)

Model \(H_1\) includes the effects of environmental predictors and sampling protocol on community composition.

Environmental predictors for unit (\(i\)) are: - \(\left(Clim1_i, Clim2_i\right)\): continuous indices summarising regional climatic water–energy balance. - \(\left(geom_i \in \{1,\dots,13\}\right)\): geomorphological subtype (categorical; 13 levels). - \(\left(SWI_i\right)\): SAGA Wetness Index; we use \(x_i=\log(SWI_i+1)\) .

Sampling protocol is a categorical variable \(\delta_{1,\dots,4}\) capturing differences in sampling design and inventory geometry.

\[ \begin{aligned} &\ \textbf{Dirichlet-Multinomial probability of specie}_j \textbf{ in inventory unit}_i \\ (y_{i,1} \dots y_{i,S})^{T} \sim &\ \operatorname{Dirichlet-Multinomial}(N_{i},\overrightarrow{\pi_{i}} \times \kappa_{i}) \\[10pt] &\ \textbf{Inventory unit effect} \\ \kappa_{i} = &\ \sum^{4}_{k = 1} \delta_k \times \mathbb{1}(Protocole_i = k) > 0 \end{aligned} \]

\[ \begin{aligned} &\ \textbf{Model for probability of each specie} \\ (\pi_{i,1} \dots \pi_{i,S})^{T} = &\ softmax(\overrightarrow{\theta_{i}}) \\[5pt] &\ \text{For } j \in [1, J-1]: \\ \theta_{i,j} = &\ \mu_j + \qquad\qquad\qquad\qquad\qquad\qquad \text{(Mean regional specie share)} \\ &\ \beta_{1,j} \times Clim1_{i} + \beta_{2,j} \times Clim2_{i} + \quad\quad \text{(Regional climate effect)} \\ &\ \sum^{13}_{p=1}\eta_{p,j} \times \mathbb{1}(geom_i=p) + \text{(Geomorphological landscape effect)} \\ &\ \beta_{3,j} \times log(SWI_{i}+1) \qquad\qquad\qquad\quad \text{(local topography effect)}\\[5pt] &\ \text{For } j = J: \\ \theta_{i,J} = &\ 1 \qquad\qquad\qquad\qquad\qquad\qquad\qquad\quad \text{(Not studied specie class)}\\[10pt] &\ \textbf{Priors} \\ \delta_{1-4} \sim &\ \text{log}\mathcal{N}(0,1) \quad \text{Prior for sampling protocol effects} \\ \beta_{1-3,j} \sim &\ \mathcal{N} (0, 1) \quad \text{Prior for climate and local topography coefficients} \\ \eta_{1-13,j} \sim &\ \mathcal{N} (0, 1) \quad \text{Prior for geomorphological landscape coefficients} \end{aligned} \]

Stan code

Code
functions {
 /* dirichlet-multinomial-logit log-PMF
   * Args:
   *   y: array of integer response values
   *   mu: vector of category logit probabilities
   *   phi: precision parameter (sum of Dirichlet alphas)
   * Returns:
   *   a scalar to be added to the log posterior
  */
real dirichlet_multinomial_logit2_lpmf(array[] int y, vector mu, real phi) {
    // get Dirichlet alphas
    int N = num_elements(mu);
    vector[N] alpha = phi * softmax(mu);

    // get trials from y
    real T = sum(y);

    return lgamma(phi) + lgamma(T + 1.0) - lgamma(T + phi) +
      sum(lgamma(to_vector(y) + alpha)) - sum(lgamma(alpha)) - sum(lgamma(to_vector(y) + 1));
  }

  /* integer sequence of values
   * Args:
   *   start: starting integer
   *   end: ending integer
   * Returns:
   *   an integer sequence from start to end
   */
  array[] int sequence(int start, int end) {
    array[end - start + 1] int seq;
    for (n in 1:num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq;
  }


// compute partial sums of the log-likelihood
real partial_log_lik_lpmf(array[] int seq, int start, int end, 
  data int ncat, data array[,] int Y, data array[] int trials, 
  array[] real Intercept_mu,
  data matrix Xc_mu,  array[] vector mu, int Kc_mu,
  data matrix X_phi, vector b_phi
  ) {
    real ptarget = 0;
    int N = end - start + 1;
    vector[N] phi = rep_vector(0.0, N);
    array[N] vector[ncat] wi;
    // linear predictor phi
    phi += X_phi[start:end] * b_phi;

    for (n in 1:N) {
      int nn = n + start - 1; 
      wi[n] = rep_vector(0.0, ncat);
      for (j in 1:(ncat-1))  {
        // linear predictor matrix mu
        wi[n,j] += Intercept_mu[j];
        wi[n,j] += Xc_mu[nn] * mu[j];
      }
      ptarget += dirichlet_multinomial_logit2_lpmf(Y[nn]| wi[n], phi[n]);
    }
    return ptarget;
  }

}

data {
  int<lower=1> N;  // total number of observations
  int<lower=2> ncat;  // number of categories
  array[N, ncat] int Y;  // response array
  array[N] int trials;  // number of trials
  int<lower=1> K_phi;  // number of population-level effects
  matrix[N, K_phi] X_phi;  // population-level design matrix
  int<lower=1> K_mu;  // number of population-level effects
  matrix[N, K_mu] X_mu;  // population-level design matrix
  int<lower=1> Kc_mu;  // number of population-level effects after centering
  int grainsize;  // grainsize for threading
  int prior_only;  // should the likelihood be ignored?
}
transformed data {
  array[N] int seq = sequence(1, N);
  matrix[N, Kc_mu] Xc_mu;  // centered version of X_mu
  vector[Kc_mu] means_X_mu;  // column means of X_mu before centering
  for (i in 2:K_mu) {
    means_X_mu[i - 1] = mean(X_mu[, i]);
    Xc_mu[, i - 1] = X_mu[, i] - means_X_mu[i - 1];
  }
}
parameters {
  // array [ncat-1] vector [Kc_mu] zb_mu;  // unscaled coefficients
  array [ncat-1] vector [Kc_mu] mu;  // unscaled coefficients 
  // array [ncat-1] vector[Kc_mu] bQ_mu;  // regression coefficients on QR scale
  array [ncat-1] real Intercept_mu;  // temporary intercept for centered predictors
  vector<lower = 0,upper = 2000> [K_phi] b_phi;  // regression coefficients
}
transformed parameters {
  real lprior = 0;  // prior contributions to the log posterior

  for(j in 1:(ncat-1)) {

    lprior += student_t_lpdf(Intercept_mu[j] | 3, 0, 2.5);
    lprior += normal_lpdf(mu[j] | 0, 1);
  }
  lprior += lognormal_lpdf(b_phi | 0,1)
      - 4 * lognormal_lcdf(2000 | 0,1);
}
model {
  // likelihood including constants
  if (!prior_only) {
    target += reduce_sum(partial_log_lik_lpmf, seq, 
    grainsize, ncat, Y, trials, 
    Intercept_mu,
    Xc_mu, mu, Kc_mu,
    X_phi, b_phi
    );
  }
  // priors including constants
  target += lprior;  
}
generated quantities {

  array [ncat-1] real Intercept_recentered = rep_array(0.0,ncat-1);
  for(j in 1:(ncat-1)){
    Intercept_recentered[j] = Intercept_mu[j] - dot_product(means_X_mu, mu[j]);
  }
}

R code

fit_empty_H1 <- brm(
    formula = bf(
        y | trials(size) ~ 1 +
            log(SWI + 1) +
            Clim1 +
            Clim2 +
            GeomSub,
        phi ~ Protocole - 1
    ),
    data = Y_train,
    cores = 1, chain = 4, threads = 25,
    prior = c(
        set_prior("lognormal(0,1)",
            class = "b",
            dpar = "phi",
            lb = 0,
            ub = 2000
        )
    ),
    family = dirichlet_multinomial(
        refcat = "Zzzinf30ind",
        link_phi = "identity"
    ),
    backend = "cmdstanr",
    seed = 1234,
    empty = TRUE
)
fitH1 <- estimate_fit_brms(fit_empty_H1, nb_sd_stratum = 0, 
         path = here_rel("results", "models", "H1"))
Code
if (!file.exists(here_rel("results", "models", "H1", "fitH1.rds"))) {

    if(!file.exists(here_rel("data", "raw_data", "Y_raw.rds"))){
        stop(paste0("missing ", here_rel("data", "raw_data", "Y_raw.rds")))
    }else{
    path_fitH1 <- here_rel("results", "models", "H1")
    Y_raw <- readRDS(here_rel("data", "raw_data", "Y_raw.rds"))
    fit_empty_H1 <- brms::brm(
        formula = brms::bf(
            y | trials(size) ~ 1 +
                log(SWI + 1) +
                Clim1 +
                Clim2 +
                GeomSub,
            phi ~ Protocole - 1
        ),
        data = Y_raw |> filter(type == "train"),
        cores = 1, chain = 4, threads = 25,
        prior = c(
            brms::set_prior("lognormal(0,1)",
                class = "b",
                dpar = "phi",
                lb = 0,
                ub = 2000
            )
        ),
        family = brms::dirichlet_multinomial(
            refcat = "Zzzinf30ind",
            link_phi = "identity"
        ),
        backend = "cmdstanr",
        seed = 1234,
        empty = TRUE
    )
    fit_empty_H1$fit <- brms::read_csv_as_stanfit(paste0(
        path_fitH1, "/",
        list.files(path_fitH1, "*.csv")
    ))
    fitH1 <- fit_empty_H1 |> brms::rename_pars()
    fitH1$data$y <- NULL # Remove raw data
    saveRDS(fitH1, file = here_rel("results", "models", "H1", "fitH1.rds"))
  }
}