Computational Typology
  • Home
  • About

Case study II: number of segments and population size

load libraries

library(repr)
options(repr.plot.width=15, repr.plot.height=9)
library(tidyverse)
library(svglite)
library(ggthemes)
library(brms)
library(viridis)
library(loo)
library(bayesplot)
library(RColorBrewer)
library(coda)
library(ape)
library(phytools)
library(Matrix)
library(posterior)
library(phangorn)
library(geiger)
library(rstan)
library(posterior)
options(brms.backend = "rstan")

Attaching package: ‘coda’


The following object is masked from ‘package:rstan’:

    traceplot


The following object is masked from ‘package:rstan’:

    traceplot



Attaching package: ‘geiger’


The following object is masked from ‘package:bridgesampling’:

    bf


The following object is masked from ‘package:brms’:

    bf



Attaching package: ‘geiger’


The following object is masked from ‘package:bridgesampling’:

    bf


The following object is masked from ‘package:brms’:

    bf

Data Preparation

d <- read_csv("../data/soundpop.csv")

head(d)
Rows: 1508 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Glottocode, Macroarea, Family
dbl (4): nSegments, population, Longitude, Latitude

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Glottocode, Macroarea, Family
dbl (4): nSegments, population, Longitude, Latitude

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
A tibble: 6 × 7
Glottocode nSegments population Macroarea Longitude Latitude Family
<chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
bafi1243 51 60000 Africa 11.17 5.00 Atlantic-Congo
nugu1242 35 35000 Africa 11.25 4.58 Atlantic-Congo
mmaa1238 34 8000 Africa 11.08 4.50 Atlantic-Congo
tuki1240 39 26000 Africa 11.50 4.58 Atlantic-Congo
ewon1239 34 578000 Africa 12.00 4.00 Atlantic-Congo
abar1238 36 1850 Africa 10.25 6.58 Atlantic-Congo

load tree and scale its height

tree <- read.tree("../data/soundpop.tre")
tree$edge.length <- tree$edge.length / median(tree$edge.length)

arrange data in order of tree tip.labels

d <- d[match(tree$tip.label, d$Glottocode), ]

create normalized numerical versions of relevant features

d$logpop_standardized <- scale(log(d$population))[,1]
d$lognSegments_standardized <- scale(log(d$nSegments))[,1]
d$family_index = as.numeric(as.factor(d$Family))

store relevant data in list for use by Stan

Nnodes <- tree$edge %>% as.vector %>% unique %>% length
stan_data <- list(
    Ntip = length(tree$tip.label),
    Nnodes = Nnodes,
    Nedges = nrow(tree$edge),
    edge_matrix = tree$edge,
    edge_lengths = tree$edge.length,
    tip_indices = match(d$Glottocode, tree$tip.label),
    x = d$logpop_standardized,
    y = d$lognSegments_standardized,
    root_index = setdiff(tree$edge[,1], tree$edge[, 2])[1],
    n_families = length(unique(d$Family)),
    family_index = d$family_index
)

fit and assess the vanilla model

Stan code for bivariate.stan:

data {
  int<lower=1> Ntip;
  vector[Ntip] x;
  vector[Ntip] y;

}

parameters {
  
  // Residual correlation structure
  vector<lower=0>[2] sigma;
  cholesky_factor_corr[2] L_Rho;

  // Trait means
  real mu_x;
  real mu_y;
}

transformed parameters {
  matrix[2, 2] L_cov = diag_pre_multiply(sigma, L_Rho);
}

model {
  // Priors
  sigma ~ lognormal(0, 1);
  
  L_Rho ~ lkj_corr_cholesky(1);
  
  mu_x ~ normal(0, 2);
  mu_y ~ normal(0, 2);

  // Likelihood
  for (i in 1:Ntip) {
    vector[2] obs = [x[i], y[i]]';
    vector[2] mu = [mu_x, mu_y]';
    obs ~ multi_normal_cholesky(mu, L_cov);
  }
}

generated quantities {
  matrix[2, 2] Rho = multiply_lower_tri_self_transpose(L_Rho);

  real rho = Rho[1, 2];

  vector[Ntip] log_lik;

  for (i in 1:Ntip) {
    vector[2] obs = [x[i], y[i]]';
    vector[2] mu = [mu_x, mu_y]';
    log_lik[i] = multi_normal_cholesky_lpdf(obs | mu, L_cov);
  }
}

if (file.exists("../models_soundpop/fit_bivariate.rds")) {
    fit_bivariate <- readRDS("../models_soundpop/fit_bivariate.rds")
} else {
    bivariate_model <- stan_model("bivariate.stan")
    fit_bivariate <- sampling(
        bivariate_model,
        data = stan_data,
        iter = 20000,
        chains = 4,
        cores = 4,
        control = list(adapt_delta = 0.95),
        save_warmup = FALSE
    )
    saveRDS(fit_bivariate, file = "../models_soundpop/fit_bivariate.rds")
}
summary(fit_bivariate, pars=c("mu_x", "mu_y", "sigma", "rho"))$summary
A matrix: 5 × 10 of type dbl
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
mu_x 2.445543e-05 1.208361e-04 0.02567578 -0.05064014 -0.01729466 0.0001628157 0.01733499 0.05031226 45149.62 0.9999652
mu_y -1.019540e-04 1.191305e-04 0.02568214 -0.05061620 -0.01736278 -0.0001039726 0.01725413 0.05025882 46474.67 0.9999275
sigma[1] 1.000659e+00 8.094021e-05 0.01835099 0.96544821 0.98813756 1.0003184640 1.01294436 1.03767208 51403.23 0.9999543
sigma[2] 1.000702e+00 8.350915e-05 0.01824059 0.96565603 0.98832229 1.0004303908 1.01280743 1.03738934 47710.04 0.9999792
rho 3.489089e-01 1.046222e-04 0.02250510 0.30455355 0.33372110 0.3489500276 0.36440550 0.39237676 46271.51 1.0000091
if (file.exists("../models_soundpop/loo_bivariate.rds")) {
    loo_bivariate <- readRDS("../models_soundpop/loo_bivariate.rds")
} else {
    loo_bivariate <- loo(fit_bivariate)
    saveRDS(loo_bivariate, file = "../models_soundpop/loo_bivariate.rds")
}
if (file.exists("../models_soundpop/marginal_bivariate.rds")) {
    marginal_bivariate <- readRDS("../models_soundpop/marginal_bivariate.rds")
} else {
    marginal_bivariate <- bridge_sampler(
        fit_bivariate,
        method = "warp3",
        repetitions = 10,
        silent = FALSE,
        verbose = TRUE
    )
    saveRDS(marginal_bivariate, file = "../models_soundpop/marginal_bivariate.rds")
}

fit and assess the bivariate model with family-level random effects

Stan code bivariate_family.stan:

data {
  int<lower=1> Ntip;
  vector[Ntip] x;
  vector[Ntip] y;

  int<lower=1> n_families;
  array[Ntip] int<lower=1, upper=n_families> family_index;
}

parameters {
  // Family-level correlated effects
  vector<lower=0>[2] sigma_family;
  cholesky_factor_corr[2] L_Rho_family;
  matrix[2, n_families] family_effect;

  // Residual correlation structure
  vector<lower=0>[2] sigma;
  cholesky_factor_corr[2] L_Rho;

  // Trait means
  real mu_x;
  real mu_y;
}

transformed parameters {
  matrix[2, 2] L_cov = diag_pre_multiply(sigma, L_Rho);
}

model {
  // Priors
  sigma ~ lognormal(0, 1);
  sigma_family ~ lognormal(0, 1);

  L_Rho ~ lkj_corr_cholesky(1);
  L_Rho_family ~ lkj_corr_cholesky(1);

  for (f in 1:n_families) {
    family_effect[, f] ~ multi_normal_cholesky([0, 0]', diag_pre_multiply(sigma_family, L_Rho_family));
  }

  mu_x ~ normal(0, 2);
  mu_y ~ normal(0, 2);

  // Likelihood
  for (i in 1:Ntip) {
    vector[2] obs = [x[i], y[i]]';
    vector[2] mu = [mu_x, mu_y]' + family_effect[, family_index[i]];
    obs ~ multi_normal_cholesky(mu, L_cov);
  }
}

generated quantities {
  matrix[2, 2] Rho = multiply_lower_tri_self_transpose(L_Rho);
  matrix[2, 2] Rho_family = multiply_lower_tri_self_transpose(L_Rho_family);

  real rho = Rho[1, 2];
  real rho_family = Rho_family[1, 2];

  vector[Ntip] log_lik;

  for (i in 1:Ntip) {
    vector[2] obs = [x[i], y[i]]';
    vector[2] mu = [mu_x, mu_y]' + family_effect[, family_index[i]];
    log_lik[i] = multi_normal_cholesky_lpdf(obs | mu, L_cov);
  }
}

#fit a correlation model without phylogenetic effect using brms
if (file.exists("../models_soundpop/fit_bivariate_family.rds")) {
    fit_bivariate_family <- readRDS("../models_soundpop/fit_bivariate_family.rds")
} else {
    bivariate_family_model <- stan_model("bivariate_family.stan")
    fit_bivariate_family <- sampling(
        bivariate_family_model,
        data = stan_data,
        iter = 2000,
        chains = 4,
        cores = 4,
        control = list(adapt_delta = 0.95)
    )
    saveRDS(fit_bivariate_family, file = "../models_soundpop/fit_bivariate_family.rds")
}

# Print a summary of the fit
summary(fit_bivariate_family, pars = c("mu_x", "mu_y", "sigma", "rho", "rho_family"))$summary
A matrix: 6 × 10 of type dbl
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
mu_x -0.57990818 0.0026850135 0.06423819 -0.70290307 -0.62286232 -0.58118549 -0.536549849 -0.4516725 572.3921 1.0109672
mu_y -0.35127149 0.0032750111 0.07186611 -0.49096564 -0.40117997 -0.35101639 -0.302118195 -0.2123885 481.5292 1.0067009
sigma[1] 0.69606355 0.0001680326 0.01344548 0.66988695 0.68692428 0.69605657 0.704936772 0.7230570 6402.7356 1.0001854
sigma[2] 0.68195948 0.0001787510 0.01345680 0.65673680 0.67247094 0.68168730 0.690867579 0.7091599 5667.4373 0.9994716
rho -0.00963805 0.0003313743 0.02663146 -0.06205059 -0.02754372 -0.00953678 0.008746902 0.0405565 6458.8070 0.9997995
rho_family 0.54005605 0.0018392111 0.07626657 0.38235444 0.49081028 0.54308441 0.592114440 0.6791468 1719.5121 1.0019424
# conduct model comparison

if (file.exists("../models_soundpop/loo_bivariate_family.rds")) {
    loo_bivariate_family <- readRDS("../models_soundpop/loo_bivariate_family.rds")
} else {
    loo_bivariate_family <- loo(fit_bivariate_family)
    saveRDS(loo_bivariate_family, file = "../models_soundpop/loo_bivariate_family.rds")
}
if (file.exists("../models_soundpop/marginal_bivariate_family.rds")) {
    marginal_bivariate_family <- readRDS("../models_soundpop/marginal_bivariate_family.rds")
} else {
    marginal_bivariate_family <- bridge_sampler(
        fit_bivariate_family,
        method = "warp3",
        repetitions = 10
    )
    saveRDS(marginal_bivariate_family, file = "../models_soundpop/marginal_bivariate_family.rds")
}

fit and assess the bivariate model with phylogenetic control

Stan code for OU_bivariate.stan:

data {
  int<lower=1> Ntip;
  int<lower=1> Nnodes;
  int<lower=1> Nedges;
  array[Nedges, 2] int<lower=1, upper=Nnodes> edge_matrix; // [parent, child]
  vector[Nedges] edge_lengths;

  array[Ntip] int<lower=1, upper=Nnodes> tip_indices; // observation to node map
  vector[Ntip] x;
  vector[Ntip] y;
  int<lower=Ntip+1, upper=Nnodes> root_index;
  int<lower=1> n_families;
  array[Ntip] int<lower=1, upper=n_families> family_index;
}

parameters {
  // Family-level correlated effects
  vector<lower=0>[2] sigma_family;
  cholesky_factor_corr[2] L_Rho_family;
  matrix[2, n_families] family_effect;

  // OU parameters
  real<lower=0> sigma_x;
  real<lower=0> sigma_y;
  real<lower=0> lambda_x;
  real<lower=0> lambda_y;
  real<lower=-1, upper=1> rho;

  real mu_x;
  real mu_y;

  // Latent internal node values (Ninternal = Nnodes - Ntip)
  vector[Nnodes - Ntip] z_x;
  vector[Nnodes - Ntip] z_y;
}

transformed parameters {
  matrix[2, 2] L_cov;
  {
    matrix[2, 2] R;
    R[1, 1] = 1;
    R[1, 2] = rho;
    R[2, 1] = rho;
    R[2, 2] = 1;
    L_cov = cholesky_decompose(R);
  }
}

model {
  // Priors
  sigma_family ~ lognormal(0, 1);
  L_Rho_family ~ lkj_corr_cholesky(2);
  for (f in 1:n_families) {
    family_effect[, f] ~ multi_normal_cholesky([0, 0]', diag_pre_multiply(sigma_family, L_Rho_family));
  }

  sigma_x ~ lognormal(0, 1);
  sigma_y ~ lognormal(0, 1);
  lambda_x ~ lognormal(0, 1);
  lambda_y ~ lognormal(0, 1);
  rho ~ uniform(-1, 1);
  mu_x ~ normal(0, 2);
  mu_y ~ normal(0, 2);

  // Root prior
  {
    matrix[2, 2] root_cov;
    root_cov[1, 1] = square(sigma_x) / (2 * lambda_x);
    root_cov[2, 2] = square(sigma_y) / (2 * lambda_y);
    root_cov[1, 2] = rho * sigma_x * sigma_y / sqrt(4 * lambda_x * lambda_y);
    root_cov[2, 1] = root_cov[1, 2];
    [z_x[root_index - Ntip], z_y[root_index - Ntip]] ~ multi_normal([mu_x, mu_y], root_cov);
  }

  // Evolution along tree
  for (e in 1:Nedges) {
    int parent = edge_matrix[e, 1];
    int child = edge_matrix[e, 2];
    real length = edge_lengths[e];

    real decay_x = exp(-lambda_x * length);
    real decay_y = exp(-lambda_y * length);
    real v_x = sigma_x * sqrt(-expm1(-2 * lambda_x * length) / (2 * lambda_x));
    real v_y = sigma_y * sqrt(-expm1(-2 * lambda_y * length) / (2 * lambda_y));

    real mn_x = mu_x + (z_x[parent - Ntip] - mu_x) * decay_x;
    real mn_y = mu_y + (z_y[parent - Ntip] - mu_y) * decay_y;

    matrix[2, 2] L_edge = diag_pre_multiply([v_x, v_y]', L_cov);

    if (child <= Ntip) {
      vector[2] obs_vec = [x[child], y[child]]';
      vector[2] mean_vec = [mn_x, mn_y]';
      vector[2] family_eff = family_effect[, family_index[child]];
      target += multi_normal_cholesky_lpdf(obs_vec | mean_vec + family_eff, L_edge);
    } else {
      [z_x[child - Ntip], z_y[child - Ntip]] ~ multi_normal_cholesky([mn_x, mn_y], L_edge);
    }
  }
}

generated quantities {
  vector[Ntip] log_lik;
  matrix[2, 2] Rho_family = multiply_lower_tri_self_transpose(L_Rho_family);
  real rho_family = Rho_family[1, 2];

  for (e in 1:Nedges) {
    int parent = edge_matrix[e, 1];
    int child = edge_matrix[e, 2];
    real length = edge_lengths[e];

    real decay_x = exp(-lambda_x * length);
    real decay_y = exp(-lambda_y * length);
    real v_x = sigma_x * sqrt(-expm1(-2 * lambda_x * length) / (2 * lambda_x));
    real v_y = sigma_y * sqrt(-expm1(-2 * lambda_y * length) / (2 * lambda_y));

    real mn_x = mu_x + (z_x[parent - Ntip] - mu_x) * decay_x;
    real mn_y = mu_y + (z_y[parent - Ntip] - mu_y) * decay_y;

    if (child <= Ntip) {
      vector[2] obs_vec = [x[child], y[child]]';
      vector[2] mean_vec = [mn_x, mn_y]';
      vector[2] family_eff = family_effect[, family_index[child]];
      matrix[2, 2] L_edge = diag_pre_multiply([v_x, v_y]', L_cov);
      log_lik[child] = multi_normal_cholesky_lpdf(obs_vec | mean_vec + family_eff, L_edge);
    }
  }


}
# Load or fit the bivariate OU model
if (file.exists("../models_soundpop/fit_bivariate_ou.rds")) {
  fit_bivariate_ou <- readRDS("../models_soundpop/fit_bivariate_ou.rds")
} else {
  bivariate_ou_model <- stan_model("OU_bivariate.stan")
  fit_bivariate_ou <- sampling(
    bivariate_ou_model,
    data = stan_data,
    iter = 20000,
    chains = 4,
    cores = 4,
    control = list(adapt_delta = 0.95),
    save_warmup = FALSE

  )
  saveRDS(fit_bivariate_ou, file = "../models_soundpop/fit_bivariate_ou.rds")
}
# Print a summary of the fit
summary(fit_bivariate_ou, pars = c("sigma_x", "sigma_y", "lambda_x", "lambda_y", "mu_x", "mu_y", "rho", "rho_family"))$summary
A matrix: 8 × 10 of type dbl
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
sigma_x 0.80653933 0.0006815855 0.03740842 0.73864921 0.78052412 0.80454272 0.830611558 0.88513907 3012.298 1.001036
sigma_y 0.74036977 0.0005773613 0.03397798 0.67844843 0.71668248 0.73872234 0.761805481 0.81189833 3463.377 1.001219
lambda_x 0.64817606 0.0012465188 0.06797915 0.52661202 0.60072757 0.64384875 0.691080690 0.79414549 2974.088 1.001207
lambda_y 0.54889715 0.0009651324 0.05889606 0.44481907 0.50759247 0.54542172 0.585912229 0.67628866 3723.906 1.000973
mu_x -0.58308574 0.0012629078 0.06504179 -0.71158658 -0.62679582 -0.58242744 -0.538968364 -0.45822509 2652.416 1.001979
mu_y -0.33357811 0.0015447217 0.07593576 -0.48270394 -0.38478076 -0.33357869 -0.282422594 -0.18459376 2416.530 1.000614
rho -0.01312946 0.0002037643 0.02754853 -0.06689844 -0.03168409 -0.01307791 0.005510488 0.04077955 18278.496 1.000248
rho_family 0.56612106 0.0008440469 0.08373925 0.38937578 0.51134013 0.57019730 0.625400949 0.71761905 9842.944 1.000151
# compute loo

# Compute LOO for the bivariate OU model
if (file.exists("../models_soundpop/loo_bivariate_ou.rds")) {
    loo_bivariate_ou <- readRDS("../models_soundpop/loo_bivariate_ou.rds")
} else {
    loo_bivariate_ou <- loo(fit_bivariate_ou)
    # Save the LOO result for the bivariate OU model
    saveRDS(loo_bivariate_ou, file = "../models_soundpop/loo_bivariate_ou.rds")
}
# compute marginal likelihood
if (file.exists("../models_soundpop/marginal_bivariate_ou.rds")) {
    marginal_bivariate_ou <- readRDS("../models_soundpop/marginal_bivariate_ou.rds")
} else {
    marginal_bivariate_ou <- bridge_sampler(
        fit_bivariate_ou,
        method = "warp3",
        repetitions = 10,
        silent = FALSE,
        verbose = TRUE
    )
    saveRDS(marginal_bivariate_ou, file = "../models_soundpop/marginal_bivariate_ou.rds")
}

model comparison

loo_compare(
    loo_bivariate,
    loo_bivariate_family,
    loo_bivariate_ou
)
A compare.loo: 3 × 8 of type dbl
elpd_diff se_diff elpd_loo se_elpd_loo p_loo se_p_loo looic se_looic
model3 0.0000 0.00000 -3162.869 52.61225 589.628465 24.8846862 6325.738 105.22450
model2 -140.3466 26.08968 -3303.215 47.98197 231.365495 13.9473034 6606.431 95.96395
model1 -1022.2489 44.03682 -4185.118 40.76089 5.176883 0.3329933 8370.235 81.52178
bayes_factor(
    marginal_bivariate,
    marginal_bivariate_ou,
    log=TRUE
)
Estimated log Bayes factor (based on medians of log marginal likelihood estimates)
 in favor of x1 over x2: -1194.40055
Range of estimates: -1198.27927 to -1190.88134
Interquartile range: 3.94644
bayes_factor(
    marginal_bivariate_family,
    marginal_bivariate_ou,
    log=TRUE
)
Estimated log Bayes factor (based on medians of log marginal likelihood estimates)
 in favor of x1 over x2: -116.87466
Range of estimates: -122.04057 to -113.23186
Interquartile range: 5.10539