Word order

library(repr)
options(repr.plot.width=12, repr.plot.height=10)
library(tidyverse)
library(gridExtra)
library(patchwork)
library(stringr)
library(reshape2)
library(rstan)
library(bridgesampling)
library(loo)
library(ape)
library(bayesplot)
library(shinystan)
library(coda)
library(ggtern)

options(mc.cores = parallel::detectCores())
rstan_options("auto_write" = TRUE)
karminrot <- rgb(206/255, 143/255, 137/255)
gold <- rgb(217/255, 205/255, 177/255)
anthrazit <- rgb(143/255, 143/255, 149/255)

Next, the data are loaded into a tibble called d.

d <- read_csv("../data/data_with_counts.csv")
d$Glottocode[d$Glottocode == "kose1239"] <- "awiy1238"
taxa <- d$Glottocode
head(d)
Rows: 827 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): ISO, Glottocode, Language, Family, Area
dbl (8): N_Speakers, Case, AP_Entropy, VerbFinal, VerbMiddle, v1, vm, vf

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
A tibble: 6 × 13
ISO Glottocode Language Family Area N_Speakers Case AP_Entropy VerbFinal VerbMiddle v1 vm vf
<chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
aai arif1239 Arifama-Miniafia Austronesian Oceania 3470 0 0.7510324 0.8172043 0.15053763 3 14 76
aak anka1246 Ankave Angan Oceania 1500 NA 0.9549905 0.6121212 0.30303030 14 50 101
aau abau1245 Abau Sepik Oceania 7500 1 0.6152539 0.9565217 0.02173913 1 1 44
abt ambu1247 Ambulas Ndu Oceania 33000 1 0.8478617 0.6666667 0.29411765 2 15 34
aby anem1248 Aneme Wake Yareban Oceania 650 1 0.8426579 0.6562500 0.22916667 11 22 63
acd giky1238 Gikyode Atlantic-Congo Africa 10400 0 0.5916728 0.1904762 0.73809524 3 31 8

In the next step, the maximum clade credibility trees are loaded as phylo objects.

mcc_trees <- list.files("../data/mcc_trees", pattern = ".tre", full.names = TRUE)

trees <- purrr::map(mcc_trees, read.tree)

# get a vector of tip labels of all trees in trees

tips <- purrr::map(trees, function(x) x$tip.label) %>% unlist %>% unique

A brief sanity check to ensure that all tips of the trees correspond to a Glottocode in d.

if (length(setdiff(tips, taxa)) == 0 ) {
    print("All tips in trees are in taxa")
} else {
    print("Not all tips in trees are in taxa")
}
[1] "All tips in trees are in taxa"

All Glottocodes in d that do not occur as a tip are isolates (for the purpose of this study).

For each isolate I create a list object with the fields "tip.label" and "Nnode", to make it compatible with the phylo objects for the language families. These list objects are added to the list trees.

isolates <- setdiff(taxa, tips)

# convert isolates to one-node trees and add to trees
for (i in isolates) {
    tr <- list()
    tr[["tip.label"]] <- i
    tr[["Nnode"]] <- 0
    trees[[length(trees) + 1]] <- tr
}

Just to make sure, I test whether the tips of this forest are exactly the Glottocodes in the data.

tips <- c()
for (tr in trees) {
    tips <- c(tips, tr$tip.label)
}

if (all(sort(tips) == sort(taxa))) {
    print("taxa and tips are identical")
} else {
    print("taxa and tips are not identical")
}
[1] "taxa and tips are identical"

Now I re-order the rows in d to make sure they are in the same order as tips.

d <- d[match(tips, d$Glottocode), ]
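A one-line assertion can double-check that the reordering succeeded (a minimal extra check):

# sanity check: rows of d are now aligned with the tip labels
stopifnot(all(d$Glottocode == tips))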

The trees in trees form a forest, i.e., a disconnected directed acyclic graph. I want to construct a representation of this graph that I can pass to Stan, consisting of a node table, a matrix of edges with their edge lengths, the indices of the root and tip nodes, and a post-order arrangement of the edges.

First I create an N × 4 matrix where each row represents one node. The columns are the global node number, the tree number, the node number within that tree, and the tip label (the Glottocode). For internal nodes, the label is the empty string.

node_matrix <- c()
node_number <- 1
for (tn in 1:length(trees)) {
    tr <- trees[[tn]]
    tr_nodes <- length(tr$tip.label) + tr$Nnode
    for (i in 1:tr_nodes) {
        tl <- ""
        if (i <= length(tr$tip.label)) {
            tl <- tr$tip.label[i]
        }
        node_matrix <- rbind(node_matrix, c(node_number, tn, i, tl))
        node_number <- node_number+1
    }
}

I convert the matrix to a tibble.

nodes <- tibble(
    node_number = as.numeric(node_matrix[,1]),
    tree_number = as.numeric(node_matrix[,2]),
    internal_number = as.numeric(node_matrix[,3]),
    label = node_matrix[,4]
)
head(nodes)
tail(nodes)
A tibble: 6 × 4
node_number tree_number internal_number label
<dbl> <dbl> <dbl> <chr>
1 1 1 aman1265
2 1 2 wari1266
3 1 3
4 2 1 bora1263
5 2 2 muin1242
6 2 3
A tibble: 6 × 4
node_number tree_number internal_number label
<dbl> <dbl> <dbl> <chr>
1549 95 1 uduk1239
1550 96 1 wiru1244
1551 97 1 kamu1260
1552 98 1 yagu1244
1553 99 1 kark1258
1554 100 1 nucl1454

Next I define a function that maps a tree number and an internal node number to the global node number.

get_global_id <- function(tree_number, local_id) which((nodes$tree_number == tree_number) & (nodes$internal_number == local_id))

Now I create a matrix of edges and, concomitantly, a vector of edge lengths.

edges <- c()
edge_lengths <- c()
for (tn in 1:length(trees)) {
    tr <- trees[[tn]]
    if (length(tr$edge) > 0 ) {
        for (en in 1:nrow(tr$edge)) {
            mother <- tr$edge[en, 1]
            daughter <- tr$edge[en, 2]
            mother_id <- get_global_id(tn, mother)
            daughter_id <- get_global_id(tn, daughter)
            el <- tr$edge.length[en]
            edges <- rbind(edges, c(mother_id, daughter_id))
            edge_lengths <- c(edge_lengths, el)
        }
    }
}

Finally I identify the root nodes and tip nodes.

roots <- c(sort(setdiff(edges[,1], edges[,2])), match(isolates, nodes$label))
tips <- which(nodes$label != "")

Next I define a function that computes a post-order traversal of the forest, i.e., an ordering in which daughters precede their mothers.

get_postorder <- function(roots, edges) {
    input <- roots
    output <- c()
    while (length(input) > 0) {
        pivot <- input[1]
        daughters <- setdiff(edges[edges[,1] == pivot, 2], output)
        if (length(daughters) == 0) {
            input <- input[-1]
            output <- c(output, pivot)
        } else {
            input <- c(daughters, input)
        }
    }
    return(output)
}
# restrict the post-order to nodes that have an incoming edge
# (roots and isolated tips are dropped), then map nodes to edge indices
po <- c()
for (i in get_postorder(roots, edges)) {
    if (i %in% edges[,2]) {
        po <- c(po, i)
    }
}
edge_po <- match(po, edges[,2])
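To illustrate what get_postorder returns, consider a toy forest with a single tree: root node 3 with tip nodes 1 and 2. Daughters are emitted before their mother, so the root comes last:

# toy illustration only; edges are (mother, daughter) pairs
toy_edges <- rbind(c(3, 1), c(3, 2))
get_postorder(3, toy_edges)  # returns 1 2 3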
predictive_plot <- function(prior_pc) {
    p1 <- prior_pc %>%
      mutate(
          x_jittered = 1 + rnorm(n(), mean = 0, sd = 0.01),
          y_jittered = tv + rnorm(n(), mean = 0, sd = sd(prior_pc$tv)/10)
      ) %>%
      ggplot(aes(x = x_jittered, y = tv)) +
      geom_boxplot(aes(x = factor(1), y = tv), width = 0.5, fill=karminrot) +
      geom_point(alpha = 0.5, size = 0.1, position = position_identity()) +
      theme_grey(base_size = 16) +
      theme(
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 18),
        plot.title = element_text(size = 20, hjust = 0.5)
      ) +
      labs(title = "total variance", y = "") +
      geom_point(aes(x=1, y=empirical$tv, color="empirical"), size=5, fill='red') +
      guides(color = "none")

    p2 <- prior_pc %>%
      mutate(
          x_jittered = 1 + rnorm(n(), mean = 0, sd = 0.01),
          y_jittered = lv + rnorm(n(), mean = 0, sd = sd(prior_pc$lv)/10)
      ) %>%
      ggplot(aes(x = x_jittered, y = lv)) +
      geom_boxplot(aes(x = factor(1), y = lv), width = 0.5, fill=gold) +
      geom_point(alpha = 0.5, size = 0.1, position = position_identity()) +
      theme_grey(base_size = 16) +
      theme(
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 18),
        plot.title = element_text(size = 20, hjust = 0.5)
      ) +
      labs(title = "lineage-wise variance", y = "") +
      geom_point(aes(x=1, y=empirical$lv, color="empirical"), size=5, fill='red') +
      guides(color = "none")

    p3 <- prior_pc %>%
      mutate(
          x_jittered = 1 + rnorm(n(), mean = 0, sd = 0.01),
          y_jittered = cv + rnorm(n(), mean = 0, sd = sd(prior_pc$cv)/10)
      ) %>%
      ggplot(aes(x = x_jittered, y = cv)) +
      geom_boxplot(aes(x = factor(1), y = cv), width = 0.5, fill=anthrazit) +
      geom_point(alpha = 0.5, size = 0.1, position = position_identity()) +
      theme_grey(base_size = 16) +
      theme(
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 18),
        plot.title = element_text(size = 20, hjust = 0.5)
      ) +
      labs(title = "cross-lineage variance", y = "") +
      geom_point(aes(x=1, y=empirical$cv, color="empirical"), size=5, fill='red') +
      labs(color = "") +
      guides(color = "none")

    return(gridExtra::grid.arrange(p1, p2, p3, ncol=3))
}

The data contain three word order variables, v1 (verb initial), vm (verb medial) and vf (verb final). These are all count data.

The obvious candidate model for this is a multinomial distribution.

\[ \begin{aligned} (n_1, n_2, n_3) &\sim \text{Multinomial}(p_1, p_2, p_3)\\ \end{aligned} \]
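Written out, the probability of observing counts n_1, n_2, n_3 for a single language, with total N = n_1 + n_2 + n_3, is

\[ \begin{aligned} P(n_1, n_2, n_3 \mid \vec p) &= \frac{N!}{n_1!\, n_2!\, n_3!}\, p_1^{n_1} p_2^{n_2} p_3^{n_3} \end{aligned} \]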

In the baseline model (complete pooling), all languages share the same probability vector, which is drawn from a uniform Dirichlet prior:

\[ \begin{aligned} \vec p &\sim \text{Dirichlet}(1, 1, 1) \end{aligned} \]

Y <- d %>% 
    select(v1, vm, vf) %>%
    as.matrix
word_order_data <- list()
word_order_data$Y <- Y
word_order_data$Ndata <- nrow(Y)
stan_code <- "
data {
    int<lower=1> Ndata;
    int<lower=0> Y[Ndata, 3];
}

parameters {
    simplex[3] p;
}

model {
    p ~ dirichlet(rep_vector(1, 3));
    for (i in 1:Ndata) {
        Y[i] ~ multinomial(p);
    }
}

generated quantities {
    real log_lik[Ndata];
    int Y_posterior[Ndata, 3];
    int Y_prior[Ndata, 3];
    simplex[3] p_prior = dirichlet_rng(rep_vector(1, 3));
    for (i in 1:Ndata) {
        Y_posterior[i] = multinomial_rng(p, sum(Y[i]));
        Y_prior[i] = multinomial_rng(p_prior, sum(Y[i]));
        log_lik[i] = multinomial_lpmf(Y[i] | p);
    }
}
"
model_wo_pooling <- stan_model(model_code=stan_code)
fit_wo_pooling <- rstan::sampling(
    model_wo_pooling,
    data = word_order_data,
    chains = 4,
    iter = 2000,
    thin = 1
)
print(fit_wo_pooling, pars=c("p", "lp__"))
Inference for Stan model: anon_model.
4 chains, each with iter=2000; warmup=1000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=4000.

          mean se_mean   sd      2.5%       25%       50%       75%     97.5%
p[1]      0.13    0.00 0.00      0.13      0.13      0.13      0.14      0.14
p[2]      0.52    0.00 0.00      0.52      0.52      0.52      0.52      0.52
p[3]      0.35    0.00 0.00      0.34      0.34      0.35      0.35      0.35
lp__ -90276.46    0.02 0.98 -90278.95 -90276.84 -90276.17 -90275.78 -90275.52
     n_eff Rhat
p[1]  3156    1
p[2]  3306    1
p[3]  3193    1
lp__  1786    1

Samples were drawn using NUTS(diag_e) at Mon Mar 18 14:35:36 2024.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
(loo_wo_pooling <- loo(fit_wo_pooling))
Warning message:
“Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
”

Computed from 4000 by 827 log-likelihood matrix

         Estimate     SE
elpd_loo -23941.4 2104.6
p_loo       193.4   93.0
looic     47882.8 4209.2
------
Monte Carlo SE of elpd_loo is NA.

Pareto k diagnostic values:
                         Count Pct.    Min. n_eff
(-Inf, 0.5]   (good)     819   99.0%   521       
 (0.5, 0.7]   (ok)         2    0.2%   85        
   (0.7, 1]   (bad)        2    0.2%   38        
   (1, Inf)   (very bad)   4    0.5%   1         
See help('pareto-k-diagnostic') for details.
(marginal_wo_pooling <- bridge_sampler(fit_wo_pooling))
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Bridge sampling estimate of the log marginal likelihood: -90283.23
Estimate obtained in 4 iteration(s) via method "normal".

predictive checks

I will use the same three statistics, but computed on relative instead of absolute frequencies.
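In symbols, with p_ik denoting the relative frequency of word order category k in language i and F ranging over families, the statistics implemented below are

\[ \begin{aligned} \text{tv} &= \sum_k \operatorname{Var}_i(p_{ik}), \qquad \text{lv} = \operatorname{mean}_F \sum_k \operatorname{Var}_{i \in F}(p_{ik}), \qquad \text{cv} = \sum_k \operatorname{Var}_F\big(\operatorname{mean}_{i \in F} p_{ik}\big) \end{aligned} \]

(Families with a single language contribute no within-family variance and drop out of lv.)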

total_variance <- function(Y) sum(apply(Y / apply(Y, 1, sum), 2, var))
lineage_wise_variance <- function(Y) {
    y_norm <- Y / apply(Y, 1, sum)
    r <- tibble(
        v1 = y_norm[, 1],
        vm = y_norm[, 2],
        vf = y_norm[, 3]
    ) %>%
        mutate(Family = d$Family) %>%
        group_by(Family) %>%
        summarise(
            sum_var = sum(var(v1), var(vm), var(vf))
        ) %>%
        pull() %>% mean(na.rm = T)
    return(r)
}
crossfamily_variance <- function(Y) {
    y_norm <- Y / apply(Y, 1, sum)
    r <- tibble(
        v1 = y_norm[, 1],
        vm = y_norm[, 2],
        vf = y_norm[, 3],
        Family = d$Family
    ) %>%
    group_by(Family) %>%
    summarise(
        v1_mean = mean(v1, na.rm = TRUE),
        vm_mean = mean(vm, na.rm = TRUE),
        vf_mean = mean(vf, na.rm = TRUE)
    ) %>%
    summarise(
        v1_var = var(v1_mean, na.rm = TRUE),
        vm_var = var(vm_mean, na.rm = TRUE),
        vf_var = var(vf_mean, na.rm = TRUE)
    )

    # Sum the variances of the means
    total_variance = sum(r$v1_var, r$vm_var, r$vf_var)
    return(total_variance)
}
empirical <- tibble(
    tv = total_variance(Y),
    lv = lineage_wise_variance(Y),
    cv = crossfamily_variance(Y)
)
generated_quantities <- extract(fit_wo_pooling)
prior_pc <- tibble(
    tv = apply(generated_quantities$Y_prior, 1, total_variance),
    lv = apply(generated_quantities$Y_prior, 1, lineage_wise_variance),
    cv = apply(generated_quantities$Y_prior, 1, crossfamily_variance)
)
predictive_plot(prior_pc)
Warning message in geom_point(aes(x = 1, y = empirical$tv, color = "empirical"), :
“All aesthetics have length 1, but the data has 4000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$lv, color = "empirical"), :
“All aesthetics have length 1, but the data has 4000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$cv, color = "empirical"), :
“All aesthetics have length 1, but the data has 4000 rows.
ℹ Did you mean to use `annotate()`?”

posterior_pc <- tibble(
    tv = apply(generated_quantities$Y_posterior, 1, total_variance),
    lv = apply(generated_quantities$Y_posterior, 1, lineage_wise_variance),
    cv = apply(generated_quantities$Y_posterior, 1, crossfamily_variance)
)
predictive_plot(posterior_pc)
Warning message in geom_point(aes(x = 1, y = empirical$tv, color = "empirical"), :
“All aesthetics have length 1, but the data has 4000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$lv, color = "empirical"), :
“All aesthetics have length 1, but the data has 4000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$cv, color = "empirical"), :
“All aesthetics have length 1, but the data has 4000 rows.
ℹ Did you mean to use `annotate()`?”

In the next model, I assume that each language has its own probability distribution, but these distributions are drawn from a common hyperprior.
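In formulas, for languages i and categories k (matching the Stan code below):

\[ \begin{aligned} \mu_k &\sim \text{Normal}(0, 1), \qquad \sigma_k \sim \text{Exponential}(1)\\ z_{ik} &\sim \text{Normal}(\mu_k, \sigma_k)\\ \vec Y_i &\sim \text{Multinomial}(\operatorname{softmax}(z_{i\cdot})) \end{aligned} \]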

separation_code <- "
data {
    int<lower=1> Ndata;
    int<lower=0> Y[Ndata, 3];
}

parameters {
    vector[3] mu;
    vector<lower=0>[3] sigma;
    matrix[Ndata, 3] z;
    simplex[3] p[Ndata]; // note: declared but never used in the model block
}

model {
    mu ~ normal(0, 1);
    sigma ~ exponential(1);
    for (i in 1:Ndata) {
        for (k in 1:3) {
            z[i, k] ~ normal(mu[k], sigma[k]);
        }
        Y[i] ~ multinomial(softmax(to_vector(z[i])));
    }
}

generated quantities {
    vector[Ndata] log_lik;
    int Y_posterior[Ndata, 3];
    int Y_prior[Ndata, 3];
    vector[3] mu_prior;
    vector<lower=0>[3] sigma_prior;
    matrix[Ndata, 3] z_prior;

    for (k in 1:3) {
        mu_prior[k] = normal_rng(0, 1);
        sigma_prior[k] = exponential_rng(1);
    }
    for (i in 1:Ndata) {
        for (k in 1:3) {
            z_prior[i, k] = normal_rng(mu_prior[k], sigma_prior[k]);
        }
    Y_posterior[i] = multinomial_rng(softmax(to_vector(z[i])), sum(Y[i]));
    Y_prior[i] = multinomial_rng(softmax(to_vector(z_prior[i])), sum(Y[i]));
    log_lik[i] = multinomial_lpmf(Y[i] | softmax(to_vector(z[i])));
    }
}

"
model_wo_separation <- stan_model(model_code = separation_code)
# fit_wo_separation <- rstan::sampling(
#     model_wo_separation,
#     data = word_order_data,
#     chains = 4,
#     iter = 100000,
#     thin = 1,
#     control = list(max_treedepth = 15)
# )

# saveRDS(fit_wo_separation, "models/fit_wo_separation.rds")
fit_wo_separation <- readRDS("models/fit_wo_separation.rds")
print(fit_wo_separation, pars=c("mu", "sigma", "lp__"))
Inference for Stan model: anon_model.
4 chains, each with iter=1e+05; warmup=50000; thin=1; 
post-warmup draws per chain=50000, total post-warmup draws=2e+05.

              mean se_mean     sd      2.5%       25%       50%       75%
mu[1]        -0.87    0.05   0.57     -1.97     -1.28     -0.83     -0.46
mu[2]         0.56    0.05   0.57     -0.53      0.15      0.60      0.98
mu[3]         0.14    0.05   0.57     -0.96     -0.28      0.18      0.55
sigma[1]      0.98    0.00   0.04      0.91      0.96      0.98      1.01
sigma[2]      0.24    0.00   0.08      0.10      0.18      0.24      0.30
sigma[3]      1.36    0.00   0.04      1.29      1.34      1.36      1.39
lp__     -74410.36   15.62 294.87 -74847.33 -74631.68 -74461.60 -74226.97
             97.5% n_eff Rhat
mu[1]         0.16   130 1.01
mu[2]         1.59   130 1.01
mu[3]         1.17   131 1.01
sigma[1]      1.05  2122 1.00
sigma[2]      0.41   635 1.01
sigma[3]      1.44  9404 1.00
lp__     -73762.00   356 1.02

Samples were drawn using NUTS(diag_e) at Mon Mar  4 01:38:36 2024.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
(loo_wo_separation <- loo(fit_wo_separation))
Warning message:
“Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
”

Computed from 200000 by 827 log-likelihood matrix

         Estimate   SE
elpd_loo  -5116.8 26.2
p_loo      1353.3  8.9
looic     10233.6 52.5
------
Monte Carlo SE of elpd_loo is NA.

Pareto k diagnostic values:
                         Count Pct.    Min. n_eff
(-Inf, 0.5]   (good)       0    0.0%   <NA>      
 (0.5, 0.7]   (ok)         0    0.0%   <NA>      
   (0.7, 1]   (bad)      683   82.6%   1         
   (1, Inf)   (very bad) 144   17.4%   1         
See help('pareto-k-diagnostic') for details.
# compile the model without sampling (chains = 0) so that bridge_sampler has a compiled stanfit for the fit restored from disk
new_mod <- stan(model_code = separation_code, data = word_order_data, chains = 0)
the number of chains is less than 1; sampling not done
(marginal_wo_separation <- bridge_sampler(
    fit_wo_separation,
    new_mod,
    cores = 10
))
Iteration: 1
...
Iteration: 1000
Iteration: 1
Warning message:
“logml could not be estimated within maxiter, rerunning with adjusted starting value. 
Estimate might be more variable than usual.”
Bridge sampling estimate of the log marginal likelihood: -70827.69
Estimate obtained in 1001 iteration(s) via method "normal".
generated_quantities <- extract(fit_wo_separation)
spl <- sample(1:dim(generated_quantities$Y_prior)[1], 1000)
prior_pc <- tibble(
    tv = apply(generated_quantities$Y_prior[spl, , ], 1, total_variance),
    lv = apply(generated_quantities$Y_prior[spl, , ], 1, lineage_wise_variance),
    cv = apply(generated_quantities$Y_prior[spl, , ], 1, crossfamily_variance)
)

predictive_plot(prior_pc)
Warning message in geom_point(aes(x = 1, y = empirical$tv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$lv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$cv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”

posterior_pc <- tibble(
    tv = apply(generated_quantities$Y_posterior[spl, , ], 1, total_variance),
    lv = apply(generated_quantities$Y_posterior[spl, , ], 1, lineage_wise_variance),
    cv = apply(generated_quantities$Y_posterior[spl, , ], 1, crossfamily_variance)
)
predictive_plot(posterior_pc)
Warning message in geom_point(aes(x = 1, y = empirical$tv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$lv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$cv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”

word_order_data <- list()
word_order_data$Y <- Y
word_order_data$Ndata <- nrow(Y)
word_order_data$Ntrees <- length(trees)
word_order_data$Nnodes <- nrow(nodes)
word_order_data$Nedges <- length(edge_po)
word_order_data$observations <- match(d$Glottocode, nodes$label)
word_order_data$roots <- roots
word_order_data$edges <- edges[edge_po,]
word_order_data$edge_lengths <- edge_lengths[edge_po]
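The next model adds phylogenetic structure: the latent scores evolve along the trees as a Brownian motion. For each tree, the root value is drawn around mu_root with standard deviation sigma_root; along a branch of length t, the daughter value is normally distributed around the mother value with standard deviation sigma * sqrt(t); at the tips, the counts are multinomial with probabilities softmax(z). Per category k (matching the Stan code below):

\[ \begin{aligned} z_{\text{root},k} &\sim \text{Normal}(\mu_{\text{root},k}, \sigma_{\text{root},k})\\ z_{\text{daughter},k} &\sim \text{Normal}\big(z_{\text{mother},k},\, \sigma_k \sqrt{t}\big)\\ \vec Y_i &\sim \text{Multinomial}(\operatorname{softmax}(z_{i\cdot})) \end{aligned} \]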
brown_code <- "
data {
    int<lower=1> Ntrees;
    int<lower=1> Nnodes;
    int<lower=1> Ndata;
    int<lower=1> Nedges;
    int<lower=1,upper=Nnodes> observations[Ndata];
    int<lower=0> Y[Ndata, 3];
    int<lower=1, upper=Nnodes> roots[Ntrees];
    int<lower=1, upper=Nnodes> edges[Nedges, 2];
    vector[Nedges] edge_lengths;
}

parameters {
    matrix[Nnodes, 3] z;
    row_vector[3] mu_root;
    row_vector<lower=0>[3] sigma;
    row_vector<lower=0>[3] sigma_root;
}

model {
    
    mu_root ~ normal(0, 1);
    sigma ~ exponential(1);
    sigma_root ~ exponential(1);
    
    
    for (i in 1:Ntrees) {
        int idx = roots[i];
        for (k in 1:3) {
            z[idx, k] ~ normal(mu_root[k], sigma_root[k]);
        }
    }
    
    for (i in 1:Nedges) {
        int j = Nedges - i + 1;
        int mother_node = edges[j, 1];
        int daughter_node = edges[j, 2];
        row_vector[3] local_mu = z[mother_node];
        row_vector[3] local_sigma = sigma * sqrt(edge_lengths[j]);
        for (k in 1:3) {
            z[daughter_node,k] ~ normal(local_mu[k], local_sigma[k]);
        }
    }

    for (i in 1:Ndata) {
        Y[i] ~ multinomial(softmax(to_vector(z[observations[i]])));
    }
}

generated quantities {
    int Y_prior[Ndata, 3]; // Prior predictive samples
    int Y_posterior[Ndata, 3]; // Posterior predictive samples
    matrix[Nnodes, 3] z_prior = rep_matrix(0, Nnodes, 3);
    row_vector[3] mu_root_prior;
    row_vector<lower=0>[3] sigma_prior;
    row_vector<lower=0>[3] sigma_root_prior;
    real log_lik[Ndata];

    for (k in 1:3) {
        mu_root_prior[k] = normal_rng(0, 1);
        sigma_prior[k] = exponential_rng(1);
        sigma_root_prior[k] = exponential_rng(1);
    }

    for (i in 1:Ntrees) {
        for (k in 1:3) {
            z_prior[roots[i], k] = normal_rng(
                mu_root_prior[k],
                sigma_root_prior[k]
            );
        }
    }

    
    for (i in 1:Nedges) {
        int j = Nedges - i + 1;
        int mother_node = edges[j, 1];
        int daughter_node = edges[j, 2];
        row_vector[3] local_mu = z[mother_node];
        row_vector[3] local_sigma = sigma * sqrt(edge_lengths[j]);
        for (k in 1:3) {
            z_prior[daughter_node, k] = normal_rng(
                z_prior[mother_node, k],
                sigma_prior[k] * sqrt(edge_lengths[j])
            );
        }
    }
    
    for (i in 1:Ndata) {
        vector[3] p_prior = softmax(to_vector(z_prior[observations[i]]));
        vector[3] p_posterior = softmax(to_vector(z[observations[i]]));
        Y_prior[i] = multinomial_rng(p_prior, sum(Y[i]));
        Y_posterior[i] = multinomial_rng(p_posterior, sum(Y[i]));
        log_lik[i] = multinomial_lpmf(Y[i] | p_posterior);
    }
}

"
model_wo_brown <- stan_model(model_code = brown_code)
# fit_wo_brown <- rstan::sampling(
#     model_wo_brown,
#     data = word_order_data,
#     chains = 4,
#     iter = 100000,
#     thin = 1,
#     control = list(max_treedepth = 15)
# )

# saveRDS(fit_wo_brown, "models/fit_wo_brown.rds")
fit_wo_brown <- readRDS("models/fit_wo_brown.rds")
print(fit_wo_brown, pars=c("mu_root", "sigma", "sigma_root", "lp__"))
Inference for Stan model: anon_model.
4 chains, each with iter=1e+05; warmup=50000; thin=1; 
post-warmup draws per chain=50000, total post-warmup draws=2e+05.

                   mean se_mean     sd      2.5%       25%       50%       75%
mu_root[1]        -0.99    0.02   0.58     -2.14     -1.38     -0.99     -0.60
mu_root[2]         0.41    0.02   0.58     -0.74      0.02      0.43      0.80
mu_root[3]         0.65    0.02   0.59     -0.50      0.26      0.64      1.04
sigma[1]           1.71    0.00   0.09      1.55      1.66      1.71      1.77
sigma[2]           0.97    0.00   0.11      0.75      0.89      0.97      1.04
sigma[3]           1.63    0.01   0.08      1.47      1.57      1.62      1.69
sigma_root[1]      0.57    0.00   0.09      0.39      0.52      0.58      0.63
sigma_root[2]      0.20    0.01   0.12      0.02      0.10      0.19      0.29
sigma_root[3]      1.21    0.00   0.10      1.02      1.13      1.20      1.28
lp__          -67238.07   10.09 148.38 -67504.73 -67341.23 -67247.99 -67147.36
                  97.5% n_eff Rhat
mu_root[1]         0.22  1284 1.00
mu_root[2]         1.62  1264 1.00
mu_root[3]         1.87  1207 1.00
sigma[1]           1.89  3697 1.00
sigma[2]           1.17  1244 1.01
sigma[3]           1.78   275 1.02
sigma_root[1]      0.74  7635 1.00
sigma_root[2]      0.47   302 1.02
sigma_root[3]      1.42   446 1.01
lp__          -66935.05   216 1.02

Samples were drawn using NUTS(diag_e) at Thu Mar  7 19:37:11 2024.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
(loo_wo_brown <- loo(fit_wo_brown))
Warning message:
“Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
”

Computed from 200000 by 827 log-likelihood matrix

         Estimate   SE
elpd_loo  -5092.7 49.7
p_loo      1257.7 29.3
looic     10185.5 99.5
------
Monte Carlo SE of elpd_loo is NA.

Pareto k diagnostic values:
                         Count Pct.    Min. n_eff
(-Inf, 0.5]   (good)      18    2.2%   59        
 (0.5, 0.7]   (ok)        81    9.8%   3         
   (0.7, 1]   (bad)      617   74.6%   0         
   (1, Inf)   (very bad) 111   13.4%   0         
See help('pareto-k-diagnostic') for details.
new_mod <- stan(model_code = brown_code, data = word_order_data, chains = 0)


marginal_wo_brown <- bridge_sampler(
  fit_wo_brown,
  new_mod,
  cores = 10
)
the number of chains is less than 1; sampling not done

Warning message:
“Infinite value in iterative scheme, returning NA.
 Try rerunning with more samples.”
Iteration: 1
Iteration: 2
generated_quantities <- extract(fit_wo_brown)
spl <- sample(1:dim(generated_quantities$Y_prior)[1], 1000)


prior_pc <- tibble(
  tv = apply(
    generated_quantities$Y_prior[spl, , ],
    1,
    total_variance
  ),
  lv = apply(
    generated_quantities$Y_prior[spl, , ],
    1,
    lineage_wise_variance
  ),
  cv = apply(
    generated_quantities$Y_prior[spl, , ],
    1,
    crossfamily_variance
  )
)

predictive_plot(prior_pc)
Warning message in geom_point(aes(x = 1, y = empirical$tv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$lv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$cv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”

posterior_pc <- tibble(
  tv = apply(
    generated_quantities$Y_posterior[spl, , ],
    1,
    total_variance
  ),
  lv = apply(
    generated_quantities$Y_posterior[spl, , ],
    1,
    lineage_wise_variance
  ),
  cv = apply(
    generated_quantities$Y_posterior[spl, , ],
    1,
    crossfamily_variance
  )
)
predictive_plot(posterior_pc)
Warning message in geom_point(aes(x = 1, y = empirical$tv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$lv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$cv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
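The last model replaces the Brownian motion with an Ornstein-Uhlenbeck process, which pulls the latent scores back towards a global mean mu at rate lambda. Along a branch of length t, the transition distribution for each category k is (matching the Stan code below):

\[ \begin{aligned} z_{\text{daughter},k} &\sim \text{Normal}\!\left(\mu_k + (z_{\text{mother},k} - \mu_k)\, e^{-\lambda_k t},\ \sigma_k \sqrt{\tfrac{1 - e^{-2\lambda_k t}}{2\lambda_k}}\right) \end{aligned} \]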

ou_code <- "
data {
    int<lower=1> Ntrees;
    int<lower=1> Nnodes;
    int<lower=1> Ndata;
    int<lower=1> Nedges;
    int<lower=1,upper=Nnodes> observations[Ndata];
    int<lower=0> Y[Ndata, 3];
    int<lower=1, upper=Nnodes> roots[Ntrees];
    int<lower=1, upper=Nnodes> edges[Nedges, 2];
    vector[Nedges] edge_lengths;
}

parameters {
    matrix[Nnodes, 3] z;
    row_vector[3] mu;
    row_vector<lower=0>[3] sigma;
    row_vector<lower=0>[3] lambda;
}

model {
    
    mu ~ normal(0, 10);
    sigma ~ exponential(1);
    lambda ~ cauchy(0, 2.5);
    
    
    for (i in 1:Ntrees) {
        int idx = roots[i];
        for (k in 1:3) {
            z[idx, k] ~ normal(mu[k], sigma[k] * sqrt(2 * lambda[k]));
        }
    }
    
    for (i in 1:Nedges) {
        int j = Nedges - i + 1;
        int mother_node = edges[j, 1];
        int daughter_node = edges[j, 2];
        row_vector[3] local_mu = mu + (z[mother_node] - mu) .* exp(-edge_lengths[j] .* lambda);
        row_vector[3] local_sigma = sigma .* sqrt((rep_row_vector(1, 3) - exp(-2 .* lambda .* edge_lengths[j])) ./ (2 .* lambda));
        for (k in 1:3) {
            z[daughter_node,k] ~ normal(local_mu[k], local_sigma[k]);
        }
    }

    for (i in 1:Ndata) {
        Y[i] ~ multinomial(softmax(to_vector(z[observations[i]])));
    }
}

generated quantities {
    int Y_prior[Ndata, 3]; // Prior predictive samples
    int Y_posterior[Ndata, 3]; // Posterior predictive samples
    matrix[Nnodes, 3] z_prior = rep_matrix(0, Nnodes, 3);
    row_vector[3] mu_prior;
    row_vector<lower=0>[3] sigma_prior;
    row_vector<lower=0>[3] lambda_prior;
    real log_lik[Ndata];

    for (k in 1:3) {
        mu_prior[k] = normal_rng(0, 1);
        sigma_prior[k] = exponential_rng(1);
        lambda_prior[k] = fabs(cauchy_rng(0, 2.5));
    }

    for (i in 1:Ntrees) {
        for (k in 1:3) {
            z_prior[roots[i], k] = normal_rng(
                mu_prior[k],
                sigma_prior[k] * sqrt(2 * lambda_prior[k])
            );
        }
    }

    
    for (i in 1:Nedges) {
        int j = Nedges - i + 1;
        int mother_node = edges[j, 1];
        int daughter_node = edges[j, 2];
        row_vector[3] local_mu_prior = mu_prior + (z_prior[mother_node] - mu_prior) .* exp(-lambda_prior .* edge_lengths[j]);
        row_vector[3] local_sigma_prior = sigma_prior .* sqrt((rep_row_vector(1, 3) - exp(-2 .* lambda_prior .* edge_lengths[j])) ./ (2 .* lambda_prior));
        for (k in 1:3) {
            z_prior[daughter_node, k] = normal_rng(
                local_mu_prior[k],
                local_sigma_prior[k]
            );
        }
    }
    
    for (i in 1:Ndata) {
        vector[3] p_prior = softmax(to_vector(z_prior[observations[i]]));
        vector[3] p_posterior = softmax(to_vector(z[observations[i]]));
        Y_prior[i] = multinomial_rng(p_prior, sum(Y[i]));
        Y_posterior[i] = multinomial_rng(p_posterior, sum(Y[i]));
        log_lik[i] = multinomial_lpmf(Y[i] | p_posterior);
    }
}

"
model_wo_ou <- stan_model(model_code = ou_code)
# fit_wo_ou <- rstan::sampling(
#     model_wo_ou,
#     data = word_order_data,
#     chains = 4,
#     iter = 50000,
#     thin = 1,
#     #control = list(max_treedepth = 15)
# )
# saveRDS(fit_wo_ou, "models/fit_wo_ou.rds")
fit_wo_ou <- readRDS("models/fit_wo_ou.rds")
print(fit_wo_ou, pars=c("mu", "sigma", "lambda", "lp__"))
Inference for Stan model: anon_model.
4 chains, each with iter=50000; warmup=25000; thin=1; 
post-warmup draws per chain=25000, total post-warmup draws=1e+05.

               mean se_mean     sd      2.5%       25%       50%       75%
mu[1]         -1.01    0.03   0.59     -2.19     -1.41     -1.02     -0.61
mu[2]          0.39    0.03   0.59     -0.79     -0.01      0.39      0.79
mu[3]          0.62    0.03   0.60     -0.57      0.22      0.62      1.02
sigma[1]       1.71    0.00   0.09      1.54      1.65      1.71      1.77
sigma[2]       0.96    0.00   0.12      0.71      0.89      0.96      1.04
sigma[3]       1.66    0.00   0.08      1.50      1.60      1.66      1.71
lambda[1]      0.06    0.00   0.02      0.02      0.05      0.06      0.08
lambda[2]      0.06    0.00   0.06      0.00      0.02      0.05      0.09
lambda[3]      0.35    0.00   0.08      0.22      0.29      0.34      0.39
lp__      -67281.58    3.67 147.49 -67534.61 -67378.66 -67292.55 -67199.24
              97.5% n_eff Rhat
mu[1]          0.15   511    1
mu[2]          1.55   508    1
mu[3]          1.79   530    1
sigma[1]       1.90  3902    1
sigma[2]       1.16  1598    1
sigma[3]       1.82  5327    1
lambda[1]      0.11  9527    1
lambda[2]      0.21  3351    1
lambda[3]      0.53 12321    1
lp__      -66965.66  1615    1

Samples were drawn using NUTS(diag_e) at Thu Mar 14 05:52:32 2024.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
(loo_wo_ou <- loo(fit_wo_ou))
Warning message:
“Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
”

Computed from 100000 by 827 log-likelihood matrix

         Estimate   SE
elpd_loo  -5056.8 47.8
p_loo      1223.5 27.0
looic     10113.6 95.6
------
Monte Carlo SE of elpd_loo is NA.

Pareto k diagnostic values:
                         Count Pct.    Min. n_eff
(-Inf, 0.5]   (good)      13    1.6%   9666      
 (0.5, 0.7]   (ok)        83   10.0%   186       
   (0.7, 1]   (bad)      624   75.5%   9         
   (1, Inf)   (very bad) 107   12.9%   3         
See help('pareto-k-diagnostic') for details.
new_mod <- stan(model_code = ou_code, data = word_order_data, chains = 0)

marginal_wo_ou <- bridge_sampler(
  fit_wo_ou,
  new_mod,
  cores = 10
)
the number of chains is less than 1; sampling not done

Warning message:
“Infinite value in iterative scheme, returning NA.
 Try rerunning with more samples.”
Iteration: 1
Iteration: 2
generated_quantities <- extract(fit_wo_ou)
spl <- sample(1:dim(generated_quantities$Y_prior)[1], 1000)


prior_pc <- tibble(
  tv = apply(
    generated_quantities$Y_prior[spl, , ],
    1,
    total_variance
  ),
  lv = apply(
    generated_quantities$Y_prior[spl, , ],
    1,
    lineage_wise_variance
  ),
  cv = apply(
    generated_quantities$Y_prior[spl, , ],
    1,
    crossfamily_variance
  )
)

predictive_plot(prior_pc)
Warning message in geom_point(aes(x = 1, y = empirical$tv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$lv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$cv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”

posterior_pc <- tibble(
  tv = apply(
    generated_quantities$Y_posterior[spl, , ],
    1,
    total_variance
  ),
  lv = apply(
    generated_quantities$Y_posterior[spl, , ],
    1,
    lineage_wise_variance
  ),
  cv = apply(
    generated_quantities$Y_posterior[spl, , ],
    1,
    crossfamily_variance
  )
)
predictive_plot(posterior_pc)
Warning message in geom_point(aes(x = 1, y = empirical$tv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$lv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”
Warning message in geom_point(aes(x = 1, y = empirical$cv, color = "empirical"), :
“All aesthetics have length 1, but the data has 1000 rows.
ℹ Did you mean to use `annotate()`?”

model comparison

tibble(model = c("pooling", "separation", "brownian", "ou"),
       looic = c(
           loo_wo_pooling$estimates[3,1], 
           loo_wo_separation$estimates[3,1],
           loo_wo_brown$estimates[3,1],
           loo_wo_ou$estimates[3,1]
       )
) %>%
mutate(delta_looic = round(looic-min(looic), 3)) %>%
arrange(delta_looic)
A tibble: 4 × 3
model looic delta_looic
<chr> <dbl> <dbl>
ou 10113.64 0.000
brownian 10185.46 71.824
separation 10233.57 119.928
pooling 47882.76 37769.124

The OU model comes out best, but given the large standard errors on the LOO estimates and the many high Pareto-k values, this ranking should be taken with caution. In any event, the OU model seems to fit the data well.
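One way to quantify that uncertainty is loo::loo_compare, which reports the pairwise differences in elpd together with their standard errors; a minimal sketch using the loo objects computed above:

# compare the four models on elpd_loo; differences are reported relative to the best model
loo_compare(loo_wo_pooling, loo_wo_separation, loo_wo_brown, loo_wo_ou)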

Let us check what equilibrium distribution over the three word order categories the fitted model predicts.

mu_posterior <- generated_quantities$mu
sigma_eq_posterior <- generated_quantities$sigma * sqrt(2*generated_quantities$lambda)
softmax <- function(x) exp(x) / sum(exp(x))
# For each posterior draw, sample a latent vector from the implied equilibrium distribution and map it to probabilities with softmax
n <- nrow(mu_posterior)
p <- ncol(mu_posterior)

# Pre-allocate the output matrix
result <- matrix(NA, nrow = n, ncol = p)

# Generate the random numbers outside of the softmax application
# and apply softmax row-wise after generation
for (j in 1:n) {
  norm_sample <- sapply(1:p, function(i) rnorm(1, mu_posterior[j, i], sigma_eq_posterior[j, i]))
  result[j, ] <- softmax(norm_sample)
}

# result now holds one equilibrium probability vector per posterior draw
colnames(result) <- c('v1', 'vm', 'vf')
result %>%
    as_tibble %>%
    write_csv("../data/word_order_posterior_eq.csv")
generated_quantities$z %>% dim
[1] 100000   1554      3
spl <- sample(1:dim(generated_quantities$Y_prior)[1], 1000)
z <- generated_quantities$z[spl, , ]
mu <- generated_quantities$mu[spl, ]
sigma <- generated_quantities$sigma[spl, ]
lambda <- generated_quantities$lambda[spl, ]

z_normal <- array(0, dim(z))
dim(z_normal)
[1] 1000 1554    3
dim(sigma)
[1] 1000    3
dim(mu)
[1] 1000    3
for (i in 1:word_order_data$Ntrees) {
    idx <- word_order_data$roots[i]
    z_normal[, idx, ] <- (z[ ,idx, ] - mu) / (sigma * sqrt(2 * lambda))
}
for (i in 1:word_order_data$Nedges) {
    mother <- word_order_data$edges[i, 1]
    daughter <- word_order_data$edges[i, 2]
    local_mu <- mu + (z[, mother, ] - mu) * exp(-word_order_data$edge_lengths[i] * lambda)
    local_sigma <- sigma * sqrt((1 - exp(-2 * lambda * word_order_data$edge_lengths[i])) / (2 * lambda))
    z_normal[, daughter, ] <- (z[, daughter, ] - local_mu) / local_sigma
}
z_normal %>% dim
[1] 1000 1554    3
v1 <- z_normal[ , , 1] %>% t %>% as_tibble
write_csv(v1, "../data/v1_ou.csv")
vm <- z_normal[ , , 2] %>% t %>% as_tibble
write_csv(vm, "../data/vm_ou.csv")
vf <- z_normal[ , , 3] %>% t %>% as_tibble
write_csv(vf, "../data/vf_ou.csv")