Constructing MrBayes scripts for phylogenetic inference

In my paper Global-scale phylogenetic linguistic inference from lexical resources, I describe a method to extract character matrices from ASJP data. On https://osf.io/a97sz/, I stored code and data from applying this workflow to version 19 of ASJP.

In this script, the character vectors for a predefined set of glottocodes are extracted from the data on the OSF repository, and mrbayes scripts are created, one for each language family present among the collection of glottocodes.

Workflow

Preamble: activating the local environment and loading packages

Note that I load PyCall and import the python package ete3, which is very convenient to manipulate phylogenies.

# Activate and instantiate the project-local environment, then load packages.
using Pkg
Pkg.activate(".")
Pkg.instantiate()
using CSV
using DataFrames
using JSON
using HTTP
using Pipe
using ProgressMeter
using PyCall
# Python's ete3 is used for all phylogeny (newick tree) manipulation below.
ete3 = pyimport("ete3")

Loading and preparing data

d = CSV.File("../data/EA_vars_glotto.csv") |> DataFrame

Next I fetch metadata from the Glottolog website to assign a family to each doculect.

# Glottolog CLDF dump (v4.8): download once from Zenodo and unzip into ../data/.
glottolog_cldf_zip = "../data/glottolog_cldf.zip"

isfile(glottolog_cldf_zip) || begin
    download(
        "https://zenodo.org/records/8131091/files/glottolog/glottolog-cldf-v4.8.zip?download=1",
        glottolog_cldf_zip
    )
    # Relies on an external `unzip` binary being on PATH.
    run(`unzip $glottolog_cldf_zip -d ../data/`)
end
# NOTE(review): this path hard-codes the commit hash of the unzipped folder —
# it must match the archive version downloaded above; verify after upgrades.
pth = "../data/glottolog-glottolog-cldf-59a612c/cldf/"

# Per-language metadata (Glottocode, Family_ID, Name, ...) from the CLDF dump.
glottolog_languages = CSV.File(joinpath(pth, "languages.csv")) |> DataFrame


"""
    glottocode_2_family(g, glottolog_languages)

Resolve the top-level family name for glottocode `g` using the Glottolog
languages table (requires `Glottocode`, `Family_ID`, and `Name` columns).

Returns the family's `Name`. Falls back to the entry's own `Name` when it has
no `Family_ID` (top-level entries) or when the family's glottocode is absent
from the table, and returns the string `"Glottocode not found"` when `g`
itself is not in the table.
"""
function glottocode_2_family(g, glottolog_languages)
    codes = glottolog_languages.Glottocode
    idx = findfirst(==(g), codes)
    # Unknown glottocode: keep the original sentinel string.
    isnothing(idx) && return "Glottocode not found"

    fam = glottolog_languages.Family_ID[idx]
    # Top-level entries carry no Family_ID; report their own name.
    ismissing(fam) && return glottolog_languages.Name[idx]

    fam_idx = findfirst(==(fam), codes)
    # Family glottocode not present in the table: fall back to the entry's name.
    isnothing(fam_idx) && return glottolog_languages.Name[idx]

    return glottolog_languages.Name[fam_idx]
end


insertcols!(d, :Family => [glottocode_2_family(g, glottolog_languages) for g in d.glottocode])

Getting the ASJP data

There are two types of characters in the OSF repo, cognate class characters and soundclass-concept characters. The two character matrices are downloaded and loaded in turn.

# Cognate-class (cc) characters: resolve the OSF file id to its download URL
# via the OSF v2 API.
file_id = "h4a6z"
url = "https://api.osf.io/v2/files/$(file_id)/"
response = HTTP.get(url)
data = JSON.parse(String(response.body))

download_url = data["data"]["links"]["download"]


# Each non-header line is "<longname> <character string>"; split into two
# columns (one row per doculect).
world_cc_ = DataFrame(
    hcat(
        split.(
            split(read(download(download_url), String), "\n")[2:end]
        )...
    ) |> permutedims, :auto)

rename!(world_cc_, :x1 => :longname, :x2 => :characters)


# Explode the character string into one single-character column per position,
# keeping the longname as first column.
world_cc = @pipe world_cc_.characters |>
                 mapslices(x -> split.(x, ""), _, dims=1) |>
                 hcat(_...) |>
                 permutedims |>
                 DataFrame(_, :auto) |>
                 insertcols!(_, 1, :longname => world_cc_.longname)
# Soundclass-concept (sc) characters: same procedure as for the cc characters.
file_id = "3em9h"
url = "https://api.osf.io/v2/files/$(file_id)/"
response = HTTP.get(url)
data = JSON.parse(String(response.body))

download_url = data["data"]["links"]["download"]


# One row per doculect: longname plus the raw character string.
world_sc_ = DataFrame(
    hcat(
        split.(
            split(read(download(download_url), String), "\n")[2:end]
        )...
    ) |> permutedims, :auto)


rename!(world_sc_, :x1 => :longname, :x2 => :characters)


# Explode the character string into one single-character column per position.
world_sc = @pipe world_sc_.characters |>
                 mapslices(x -> split.(x, ""), _, dims=1) |>
                 hcat(_...) |>
                 permutedims |>
                 DataFrame(_, :auto) |>
                 insertcols!(_, 1, :longname => world_sc_.longname)

Next I fetch and prepare the metadata for the ASJP doculects.

# ASJP doculect metadata: resolve the OSF file id to its download URL.
file_id = "w4jnf"
url = "https://api.osf.io/v2/files/$(file_id)/"
response = HTTP.get(url)
data = JSON.parse(String(response.body))

download_url = data["data"]["links"]["download"]


# Keep living doculects that have both a WALS classification and a glottocode,
# then build the "<classification_wals>.<Name>" longname used by the character
# matrices, with "-" replaced by "_" to match their naming convention.
asjp_languages = @pipe CSV.read(
                           download(download_url),
                           missingstring="",
                           DataFrame) |>
                       dropmissing(_, :classification_wals) |>
                       dropmissing(_, :Glottocode) |>
                       filter(row -> row.recently_extinct == 0, _) |>
                       filter(row -> row.long_extinct == 0, _) |>
                       select(_, [:Name, :Glottocode, :Family, :classification_wals]) |>
                       DataFrames.transform(_, [:classification_wals, :Name] => ByRow((x, y) -> string(x, ".", y)) => :longname) |>
                       select(_, Not(:classification_wals)) |>
                       DataFrames.transform(_, :longname => ByRow(x -> replace(x, "-" => "_")) => :longname) |>
                       dropmissing

I developed my own naming convention for ASJP doculects – [WALS family name].[WALS genus_name].[doculect name]. These must be matched with glottocodes.

# Lookup tables between ASJP longnames, glottocodes, and families.
longname2glottocode = Dict{String, String}(
    zip(asjp_languages.longname, asjp_languages.Glottocode)
)

glottocode2longname = Dict{String, String}(
    zip(asjp_languages.Glottocode, asjp_languages.longname)
)

glottocode2family = Dict{String, String}(
    zip(asjp_languages.Glottocode, asjp_languages.Family)
)


# Target glottocodes without an ASJP doculect map to themselves, so the
# lookups below never throw a KeyError.
for l in d.glottocode
    if !haskey(glottocode2longname, l)
        longname2glottocode[l] = l
        glottocode2longname[l] = l
    end
end

Restricting the character vectors to the doculects for which I have a glottocode.

# Drop doculects that cannot be linked to a glottocode via the ASJP metadata.
# A Set gives O(1) membership tests instead of a linear scan per row.
let known_longnames = Set(asjp_languages.longname)
    filter!(row -> row.longname ∈ known_longnames, world_cc)
    filter!(row -> row.longname ∈ known_longnames, world_sc)
end

ASJP sometimes contains several doculects for the same glottocode. Therefore I now compute the number of missing entries for each doculect. For each glottocode, I select the ASJP doculect with fewest missing entries as representative.

# Add the glottocode as first column of both character matrices.
insertcols!(
    world_cc,
    1,
    :Glottocode => [longname2glottocode[x] for x in world_cc.longname]
)

insertcols!(
    world_sc,
    1,
    :Glottocode => [longname2glottocode[x] for x in world_sc.longname]
)



# Rank doculects per glottocode by the number of missing entries ("-") in the
# sound-class matrix; after sorting by gap count, `unique` keeps the first
# (i.e. fewest-gaps) longname for each glottocode.
best_languages = @pipe world_sc |>
    DataFrame(
        longname = _.longname,
        Glottocode = _.Glottocode,
        nGaps = map(x -> sum(Array(x) .== "-"), eachrow(_))
    ) |>
    sort(_, :nGaps) |>
    unique(_, :Glottocode).longname

The character matrices are now restricted to the doculects representing a glottocode.

# Keep only the representative doculect per glottocode; the longname column
# is no longer needed afterwards. Set lookup avoids a scan per row.
let keep = Set(best_languages)
    filter!(row -> row.longname ∈ keep, world_cc)
    filter!(row -> row.longname ∈ keep, world_sc)
end

select!(world_cc, Not(:longname))
select!(world_sc, Not(:longname))

For the glottocodes in the target set for which there are no ASJP data, a character vector consisting of missing entries is constructed. Then, the character matrices are restricted to the glottocodes from the target set.

# Target glottocodes without ASJP data get an all-missing ("-") character
# vector in both matrices; the first cell holds the glottocode itself.
for l in setdiff(d.glottocode, world_sc.Glottocode)
    nl_cc = repeat(["-"], size(world_cc, 2))
    nl_cc[1] = l
    push!(world_cc, nl_cc)
    nl_sc = repeat(["-"], size(world_sc, 2))
    nl_sc[1] = l
    push!(world_sc, nl_sc)
end

# Restrict both character matrices to the glottocodes of the target set.
let targets = Set(d.glottocode)
    filter!(row -> row.Glottocode ∈ targets, world_cc)
    filter!(row -> row.Glottocode ∈ targets, world_sc)
end

In the cognate class character matrix, all columns containing no 1 are removed.

# Drop cognate-class columns that contain no "1" at all (uninformative after
# the restriction above); column 1 holds the Glottocode and is always kept.
select!(world_cc, Not([
    i for i in axes(world_cc, 2)[2:end] if sum(world_cc[:, i] .== "1") == 0
]))

Now I add the Glottolog family names to d.

d[:,:Family] = replace.(d[:,:Family], " " => "_", "-" => "_", "'" => "")

Here is a helper function that takes a DataFrame object representing a character matrix and constructs the content of a Nexus file representing that matrix.

If a family only contains two taxa, a dummy taxon is added which has all characters missing. This is required because MrBayes only works with datasets containing at least 3 taxa.

# create character matrices

"""
    df2nexus(cm)

Render a character matrix `cm` (first column `Glottocode`, remaining columns
single-character states) as the content of a Nexus DATA block for MrBayes.

A two-taxon matrix gets an all-`?` "dummy" taxon appended (and `ntax=3`),
because MrBayes requires datasets with at least three taxa.
"""
function df2nexus(cm)
    width = maximum(length.(cm.Glottocode)) + 5
    nchar = size(cm, 2) - 1
    two_taxa = size(cm, 1) == 2
    header = """#Nexus
    BEGIN DATA;
    DIMENSIONS ntax=$(two_taxa ? 3 : size(cm, 1)) nchar = $nchar;
    FORMAT DATATYPE=Restriction GAP=? MISSING=- interleave=no;
    MATRIX

    """
    buf = IOBuffer()
    print(buf, header)
    # One row per taxon: padded glottocode followed by its character string.
    for i in axes(cm, 1)
        println(buf, rpad(cm.Glottocode[i], width), join(Array(cm[i, 2:end])))
    end
    # All-missing dummy taxon for the two-taxon case (see docstring).
    two_taxa && println(buf, rpad("dummy", width), repeat("?", nchar))
    print(buf, ";\nEND")
    return String(take!(buf))
end

concatenating the two character matrices…

# Concatenate the sound-class and cognate-class characters per glottocode.
char_mtx = innerjoin(
        world_sc,
        world_cc,
        on=:Glottocode => :Glottocode,
        makeunique=true,
    )
# Count distinct target glottocodes per family, smallest families first.
lineages = @pipe d |>
    unique(_, :glottocode) |>
    groupby(_, :Family) |>
    combine(_, nrow) |>
    sort(_,:nrow)

# Only families with more than one glottocode get a phylogenetic analysis.
families = lineages.Family[lineages.nrow .> 1]

Now I fetch the Glottolog classification as a vector of newick strings from the Glottolog website.

# Glottolog classification as newick strings, one line per family tree.
glottologF = "../data/tree_glottolog_newick.txt"

isfile(glottologF) || download(
    "https://cdstar.eva.mpg.de//bitstreams/EAEA0-B701-6328-C3E3-0/tree_glottolog_newick.txt",
    glottologF
)
raw = readlines(glottologF);

First some clean-up to make the newick strings digestible by ete3. Then, the newick tree for each family is read in as ete3 tree object.

# Parsed ete3 tree objects, one per Glottolog family.
trees = []

for ln in raw
    ln = strip(ln)
    # Strip the quoted display names, keeping only the bracketed part that
    # follows them (appears to hold the glottocode) — then drop the brackets,
    # leftover quote pairs, and the ":1" branch lengths so ete3 can parse it.
    ln = replace(ln, r"\'[A-ZÄÖÜ][^[]*\[" => "[")
    ln = replace(ln, r"\][^']*\'" => "]")
    ln = replace(ln, r"\[|\]" => "")
    ln = replace(ln, "''" => "")
    ln = replace(ln, ":1" => "")
    push!(
        trees,
        ete3.Tree(ln, format=1)
    )
end

Next, the Glottolog trees for the individual families are combined into a Glottolog world tree.

This Glottolog tree contains internal nodes representing glottocodes. They may have daughter nodes representing dialects. To make sure that each glottocode is a leaf, I create another leaf daughter for each named internal node and shift the name of the internal node to that new leaf.

# Combine all family trees under a single unnamed root node.
glot = ete3.Tree()

for t in trees
    glot.add_child(t)
end

# Named internal nodes are glottocodes that have dialect children.
# (`&` here is a plain Bool AND; both operands are Bools.)
nonLeaves = [nd.name for nd in glot.traverse()
             if (nd.name != "") & !nd.is_leaf()
]

# Move each internal node's name onto a fresh leaf child, so that every
# glottocode is represented by a leaf. (ete3's `tree & name` looks up a node
# by name.)
@showprogress for nm in nonLeaves
    nd = (glot & nm)
    nd.name = ""
    nd.add_child(name=nm)
end

Next I create a dictionary with the families from the target set as keys. For each target family, I prune the Glottolog world tree to the glottocodes from that family and store it as a newick string in the dictionary.

"""
    family_to_tree(fm; d=d, glot=glot)

Prune a copy of the Glottolog world tree `glot` down to the glottocodes that
belong to family `fm` in the target table `d`, and return the result as a
newick string (ete3 format 9: leaf names only).
"""
function family_to_tree(fm; d=d, glot=glot)
    fm_taxa = d.glottocode[d.Family.==fm]
    glot_fm = glot.copy()  # prune mutates in place, so work on a copy
    glot_fm.prune(fm_taxa)
    glot_fm.write(format=9)
end


# Pruned Glottolog tree (as a newick string) for each target family.
glot_tree_dict = Dict()

for fm in families
    @info fm
    glot_tree_dict[fm] = family_to_tree(fm)
end

Finally the MrBayes files are created.

For each family, there are three nexus files:

  1. the file containing the character matrix, and
  2. two MrBayes scripts.

Using two MrBayes scripts is a hack because

  • I want at least 100,000 iterations per chain, and
  • I want to apply the early stop rule which terminates a chain when the topologies have sufficiently converged.

If I use the early stop rule from the outset, sampling for very small families will stop right away because there is no (or little) phylogenetic uncertainty. It is still advisable to do some sampling though, to get a good estimate for the branch lengths.

Not using the stop rule is not a good option either, because then I have to fix a sufficiently large number of iterations. For large families, this is in the tens of millions, which would be a waste of resources for smaller families.

As a compromise, I use the first script to run 10,000,000 iterations without stop rule, and then continue up to 1,000,000,000 iterations with stop rule.

For the latter, I prepare the following analysis:

  • characters are partitioned into the cc and sc characters (see above)
  • relaxed clock
  • gamma-distributed rate variation
  • equilibrium probabilities and the clock rates are estimated for the two partitions separately
  • for the cc characters, ascertainment bias correction is conducted because all-0 columns are not included
  • the Glottolog tree is used as constraint tree
  • if the dataset contains a dummy taxon (because there are only two real taxa), the dummy taxon is treated as outgroup,
  • if the family contains more than 100 taxa, the mcmc is run with four runs and four chains, otherwise with two runs and two chains,
  • the maximum is 1,000,000,000 iterations, with an early stop if the average standard deviation of split frequencies is \(\leq 0.01\)
"""
    create_mb_script(fn, char_mtx, clades, fm_cc, fm_glottocodes,
                     n_iterations, n_runs, n_chains, append, stoprule)

Assemble the text of a MrBayes batch script for one family.

- `fn`: base name of the family's Nexus data file (without extension).
- `char_mtx`: joined character matrix; only its column count is used, to
  locate the end of the cognate-class (cc) charset.
- `clades`: leaf-name sets from the Glottolog tree's internal nodes, emitted
  as topology constraints when there is more than one.
- `fm_cc`, `fm_glottocodes`: drive the two-taxon special case (constrain the
  two real taxa together so the dummy taxon acts as outgroup) and the switch
  to double-precision BEAGLE for families with more than 100 taxa.
- `n_iterations`, `n_runs`, `n_chains`, `append`, `stoprule`: passed through
  to the `mcmc`/`mcmcp` commands.

NOTE(review): `stopval=0.05` here, while the accompanying text mentions an
ASDSF threshold of 0.01 — confirm which value is intended.
"""
function create_mb_script(
    fn,
    char_mtx,
    clades,
    fm_cc,
    fm_glottocodes,
    n_iterations,
    n_runs,
    n_chains,
    append,
    stoprule
)
    buf = IOBuffer()
    # Fixed preamble: data import, sc/cc partition (sc is always 1640
    # characters wide), relaxed clock, gamma rate variation, parameters
    # unlinked across partitions, ascertainment-bias correction for cc only.
    print(buf, """#Nexus
        Begin MrBayes;
            execute $fn.nex;
            charset sc = 1-1640;
            charset cc = 1641-$(size(char_mtx, 2)-1);
            partition dtype = 2:sc, cc;
            set partition = dtype;
            prset applyto=(all) brlenspr = clock:uniform;
            prset applyto=(all) clockvarpr = igr;
            lset applyto=(all) rates=gamma;
            unlink Statefreq=(all) shape=(all) igrvar=(all) rate=(all);
            prset applyto=(all) ratepr=Dirichlet(1, 1);
            prset applyto=(2) clockratepr=exp(1.0); [for partition 2]
            lset applyto=(1) coding=all;
            lset applyto=(2) coding=noabsencesites;
        """)
    # Glottolog topology constraints, one per internal node of the family tree.
    if length(clades) > 1
        for (k, clade) in enumerate(clades)
            print(buf, "    constraint c$k = ", join(clade, " "), ";\n")
        end
        print(buf, "    prset topologypr = constraints(",
              join(["c$k" for k in 1:length(clades)], ","), ");\n")
    end
    # Exactly two real taxa: bind them together so the all-missing dummy
    # taxon (added in df2nexus) is forced into the outgroup position.
    if length(fm_glottocodes) == 2
        print(buf, "constraint c1 = ", join(fm_cc.Glottocode, " "), ";\n")
        print(buf, "prset topologypr = constraints(c1);\n")
    end
    # Large families benefit from double-precision BEAGLE likelihoods.
    if length(fm_glottocodes) > 100
        print(buf, "    set beagleprecision=double;\n")
    end
    # MCMC settings, sampling, and summaries.
    print(buf, """    prset brlenspr = clock:uniform;
        prset clockvarpr = igr;
        mcmcp stoprule=$stoprule stopval=0.05 filename=output/$fn samplefreq=1000;
        mcmc ngen=$n_iterations nchains=$n_chains nruns=$n_runs append=$append;
        sumt;
        sump;
        q;
    end;
    """)
    return String(take!(buf))
end


# For every target family: assemble its character matrix, derive topology
# constraints from the Glottolog tree, and write the Nexus data file plus the
# two MrBayes scripts (head: fixed iteration count without stop rule; tail:
# continuation with stop rule).
mkpath("mrbayes/output")
@showprogress for (i, fm) in enumerate(families)
    fm_glottocodes = d.glottocode[d.Family.==fm]
    # File-name prefix: zero-padded family index plus family name.
    fn = lpad(i, 3, "0")*"_"*fm
    fm_cc = @pipe world_cc |>
        filter(row -> row.Glottocode ∈ fm_glottocodes, _)

    fm_characters = names(fm_cc)[2:end]

    # Keep only cognate-class characters attested ("1") within this family.
    informative = map(x -> sum(string.(fm_cc[:,x]) .== "1") > 0, fm_characters)

    fm_cc = select(
        fm_cc,
        vcat(["Glottocode"], fm_characters[informative])
    )

    fm_sc = @pipe world_sc |>
        filter(row -> row.Glottocode ∈ fm_glottocodes, _)
    fm_characters = [x for x in names(fm_sc) if x != "Glottocode"]

    fm_sc = select(
        fm_sc,
        vcat(["Glottocode"], fm_characters)
    )

    # Sound-class characters first, family-specific cc characters second —
    # this matches the charset boundaries written by create_mb_script.
    char_mtx = innerjoin(
        fm_sc,
        fm_cc,
        on=:Glottocode => :Glottocode,
        makeunique=true,
    )

    # Constraint clades: the leaf sets of all internal, non-root nodes of the
    # pruned Glottolog family tree.
    fm_glot = ete3.Tree(glot_tree_dict[fm], format=1)

    internal_nodes = [
        nd for nd in fm_glot.traverse()
        if nd.is_leaf() == false && nd.is_root() == false
    ]
    clades = [x.get_leaf_names() for x in internal_nodes]

    n_iterations_head = 10_000_000
    n_iterations_tail = 1_000_000_000
    # Large families (>100 taxa) get four runs/chains, smaller ones two.
    n_chains = length(fm_glottocodes) > 100 ? 4 : 2
    n_runs = length(fm_glottocodes) > 100 ? 4 : 2

    # Head script: fixed iterations, no stop rule, fresh run (append=no).
    mb_head = create_mb_script(
        fn,
        char_mtx,
        clades,
        fm_cc,
        fm_glottocodes,
        n_iterations_head,
        n_runs,
        n_chains,
        "no",
        "no"
    )

    # Tail script: continues the head run (append=yes) with the stop rule on.
    mb_tail = create_mb_script(
        fn,
        char_mtx,
        clades,
        fm_cc,
        fm_glottocodes,
        n_iterations_tail,
        n_runs,
        n_chains,
        "yes",
        "yes"
    )
    write("mrbayes/$(fn)_head.mb.nex", mb_head)
    write("mrbayes/$(fn)_tail.mb.nex", mb_tail)
    write("mrbayes/$fn.nex", df2nexus(char_mtx))
end

This completes data preparation for MrBayes.

In the next step, all MrBayes scripts must be run, ideally with as much parallelization as possible. I used the following shell script on a powerful server for this:

#!/bin/bash

# Run all MrBayes scripts with a bounded number of concurrent mpirun jobs.
# The two largest families get 16 MPI ranks each; all others get 4.

# Abort if the scripts directory is missing — otherwise the globs below would
# silently match nothing (or the wrong files) in the current directory.
cd mrbayes || exit 1
max_jobs=25

# Block until fewer than $max_jobs background jobs are running, then launch
# mb-mpi on script $2 with $1 MPI ranks in the background.
run_with_limit() {
  # `jobs -r` lists only running jobs; plain `jobs` would also count
  # already-finished ("Done") entries and over-throttle.
  while [ "$(jobs -r | wc -l)" -ge "$max_jobs" ]; do
    sleep 1
  done
  mpirun -np "$1" mb-mpi "$2" &
}

# Stage 1: head scripts (fixed iteration count, no stop rule).
for file in *_head.mb.nex; do
  if [[ "$file" == "052_Atlantic_Congo_head.mb.nex" || "$file" == "053_Austronesian_head.mb.nex" ]]; then
    run_with_limit 16 "$file"
  else
    run_with_limit 4 "$file"
  fi
done

# All head runs must finish before the tail scripts append to them.
wait

# Stage 2: tail scripts (append to head runs, stop rule enabled).
for file in *_tail.mb.nex; do
  if [[ "$file" == "052_Atlantic_Congo_tail.mb.nex" || "$file" == "053_Austronesian_tail.mb.nex" ]]; then
    run_with_limit 16 "$file"
  else
    run_with_limit 4 "$file"
  fi
done

echo "All jobs submitted."