MrBayes postprocessing

After MrBayes is finished, a little postprocessing is needed. I will do this in R, because it has useful libraries for this kind of work.

The goal is

preamble

loading libraries and setting a random seed for reproducability

library(tidyverse)
library(fs)
library(ape)
library(phangorn)
library(glue)

set.seed(123)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘ape’


The following object is masked from ‘package:dplyr’:

    where

workflow

Full disclosure: most of the code was written by ChatGPT, with some natural language input from me.

Here are some helper function, whose purpose should be clear from their names.

# Helper function to extract latter half of trees
get_latter_half <- function(tree_list) {
  purrr::map(tree_list, ~ .x[round(length(.x) / 2):length(.x)])
}

# Helper function to remove dummy taxa if it exists
remove_dummy <- function(tree) {
  if ("dummy" %in% tree$tip.label) {
    drop.tip(tree, "dummy")
  } else {
    tree
  }
}

# Check file completion
is_file_complete <- function(file) {
  if (file_exists(file)) {
    last_line <- read_lines(file) %>% tail(1)
    return(str_detect(last_line, "end;"))
  } 
  return(FALSE)
}

Now the posterior tree changes are extracted, cleaned and sampled.

# Get the list of family names
families <- dir_ls("mrbayes/output") %>%
  purrr::map_chr(fs::path_file) %>%
  str_remove("\\..*") %>%
  unique() %>%
  purrr::keep(str_detect, "^0")

# Check if all files in the family have completed
families_completed <- purrr::map_lgl(families, function(fm) {
  files <- glue::glue("mrbayes/output/{fm}.run{1:2}.t")
  all(purrr::map_lgl(files, is_file_complete))
})

names(families_completed) <- families

# Extract and process trees
fm_trees <- purrr::map(families, function(fm) {
  print(fm)
  files <- glue::glue("mrbayes/output/{fm}.run{1:2}.t")

  if (families_completed[fm]) {
    trees <- purrr::map(files, read.nexus)
  } else {
    trees_as_string <- purrr::map(files, function(file) {
      tree_string <- readLines(file, warn = FALSE)
      c(tree_string, "end;")
    })
    trees <- purrr::map(trees_as_string, ~ read.nexus(textConnection(.x)))
  }

  latter_half_trees <- do.call(c, get_latter_half(trees))
  if (length(latter_half_trees) < 100) {
    stop(glue("Not enough trees in {fm}. Expected at least 100 but found {length(latter_half_trees)}."))
  }
  purrr::map(sample(latter_half_trees, 100), remove_dummy)
})

names(fm_trees) <- families

writing the posterior samples to disk…

# Create directories and write trees
posterior_dir = "../data/posterior_trees"


dir_create(posterior_dir)
purrr::walk(names(fm_trees), function(fm) {
  file_name <- glue("{posterior_dir}/{fm}_posterior.tree")
  write.tree(fm_trees[[fm]], file = file_name)
})

computing the maximum clade credibility trees and writing them to disk…

mcc_dir = "../data/mcc_trees"

dir_create(mcc_dir)
purrr::walk(families, function(fm) {
  file_name <- glue("{mcc_dir}/{fm}_mcc.tre")
  tr <- maxCladeCred(fm_trees[[fm]])
  write.tree(tr, file = file_name)
})