library(tidyverse)
library(fs)
library(ape)
library(phangorn)
library(glue)
set.seed(123)MrBayes postprocessing
After MrBayes is finished, a little postprocessing is needed. I will do this in R, because it has useful libraries for this kind of work.
The goal is
- to throw away the first half of each mcmc,
- remove dummy taxa,
- sample 100 trees at random from the remaining posterior distributions for each family
- compute a maximum clade credibility tree for each family.
preamble
loading libraries and setting a random seed for reproducability
workflow
Full disclosure: most of the code was written by ChatGPT, with some natural language input from me.
Here are some helper function, whose purpose should be clear from their names.
# Helper function to extract latter half of trees
get_latter_half <- function(tree_list) {
purrr::map(tree_list, ~ .x[round(length(.x) / 2):length(.x)])
}
# Helper function to remove dummy taxa if it exists
remove_dummy <- function(tree) {
if ("dummy" %in% tree$tip.label) {
drop.tip(tree, "dummy")
} else {
tree
}
}
# Check file completion
is_file_complete <- function(file) {
if (file_exists(file)) {
last_line <- read_lines(file) %>% tail(1)
return(str_detect(last_line, "end;"))
}
return(FALSE)
}Now the posterior tree changes are extracted, cleaned and sampled.
# Get the list of family names
families <- dir_ls("mrbayes/output") %>%
purrr::map_chr(fs::path_file) %>%
str_remove("\\..*") %>%
unique() %>%
purrr::keep(str_detect, "^0")
# Check if all files in the family have completed
families_completed <- purrr::map_lgl(families, function(fm) {
files <- glue::glue("mrbayes/output/{fm}.run{1:2}.t")
all(purrr::map_lgl(files, is_file_complete))
})
names(families_completed) <- families
# Extract and process trees
fm_trees <- purrr::map(families, function(fm) {
print(fm)
files <- glue::glue("mrbayes/output/{fm}.run{1:2}.t")
if (families_completed[fm]) {
trees <- purrr::map(files, read.nexus)
} else {
trees_as_string <- purrr::map(files, function(file) {
tree_string <- readLines(file, warn = FALSE)
c(tree_string, "end;")
})
trees <- purrr::map(trees_as_string, ~ read.nexus(textConnection(.x)))
}
latter_half_trees <- do.call(c, get_latter_half(trees))
if (length(latter_half_trees) < 100) {
stop(glue("Not enough trees in {fm}. Expected at least 100 but found {length(latter_half_trees)}."))
}
purrr::map(sample(latter_half_trees, 100), remove_dummy)
})
names(fm_trees) <- familieswriting the posterior samples to disk…
# Create directories and write trees
posterior_dir = "../data/posterior_trees"
dir_create(posterior_dir)
purrr::walk(names(fm_trees), function(fm) {
file_name <- glue("{posterior_dir}/{fm}_posterior.tree")
write.tree(fm_trees[[fm]], file = file_name)
})computing the maximum clade credibility trees and writing them to disk…
mcc_dir = "../data/mcc_trees"
dir_create(mcc_dir)
purrr::walk(families, function(fm) {
file_name <- glue("{mcc_dir}/{fm}_mcc.tre")
tr <- maxCladeCred(fm_trees[[fm]])
write.tree(tr, file = file_name)
})