using Pkg
Pkg.activate(".")
Pkg.instantiate()

Constructing MrBayes scripts for phylogenetic inference
In my paper Global-scale phylogenetic linguistic inference from lexical resources, I describe a method to extract character matrices from ASJP data. On https://osf.io/a97sz/, I stored code and data from applying this workflow to version 19 of ASJP.
In this script, the character vectors for a predefined set of glottocodes are extracted from the data on the OSF repository, and mrbayes scripts are created, one for each language family present among the collection of glottocodes.
Workflow
Preamble: activating the local environment and loading packages
Note that I load PyCall and import the python package ete3, which is very convenient to manipulate phylogenies.
using CSV
using DataFrames
using JSON
using HTTP
using Pipe
using ProgressMeter
using PyCall
ete3 = pyimport("ete3")

Loading and preparing data
d = CSV.File("../data/EA_vars_glotto.csv") |> DataFrame

Next I fetch metadata from the Glottolog website to assign a family to each doculect.
glottolog_cldf_zip = "../data/glottolog_cldf.zip"
isfile(glottolog_cldf_zip) || begin
download(
"https://zenodo.org/records/8131091/files/glottolog/glottolog-cldf-v4.8.zip?download=1",
glottolog_cldf_zip
)
run(`unzip $glottolog_cldf_zip -d ../data/`)
end

pth = "../data/glottolog-glottolog-cldf-59a612c/cldf/"
glottolog_languages = CSV.File(joinpath(pth, "languages.csv")) |> DataFrame
"""
    glottocode_2_family(g, glottolog_languages)

Resolve glottocode `g` to the name of its top-level language family using
the Glottolog languages table. Falls back to the language's own name when
no `Family_ID` is recorded (isolates) or when the family's glottocode is
absent from the table. Returns the sentinel string `"Glottocode not found"`
when `g` does not occur in the table (kept for backward compatibility with
the caller that inserts this value into the `:Family` column).
"""
function glottocode_2_family(g, glottolog_languages)
    codes = glottolog_languages.Glottocode
    idx = findfirst(==(g), codes)
    # Unknown glottocode: keep the original sentinel string.
    idx === nothing && return "Glottocode not found"
    family_id = glottolog_languages.Family_ID[idx]
    # Isolates carry no Family_ID; use the language's own name instead.
    ismissing(family_id) && return glottolog_languages.Name[idx]
    fam_idx = findfirst(==(family_id), codes)
    # If the family's glottocode is itself missing from the table,
    # fall back to the language's own name.
    return glottolog_languages.Name[fam_idx === nothing ? idx : fam_idx]
end
insertcols!(d, :Family => [glottocode_2_family(g, glottolog_languages) for g in d.glottocode])

Getting the ASJP data
There are two types of characters in the OSF repo, cognate class characters and soundclass-concept characters. The two character matrices are downloaded and loaded in turn.
file_id = "h4a6z"
url = "https://api.osf.io/v2/files/$(file_id)/"
response = HTTP.get(url)
data = JSON.parse(String(response.body))
download_url = data["data"]["links"]["download"]
world_cc_ = DataFrame(
hcat(
split.(
split(read(download(download_url), String), "\n")[2:end]
)...
) |> permutedims, :auto)
rename!(world_cc_, :x1 => :longname, :x2 => :characters)
world_cc = @pipe world_cc_.characters |>
mapslices(x -> split.(x, ""), _, dims=1) |>
hcat(_...) |>
permutedims |>
DataFrame(_, :auto) |>
insertcols!(_, 1, :longname => world_cc_.longname)

file_id = "3em9h"
url = "https://api.osf.io/v2/files/$(file_id)/"
response = HTTP.get(url)
data = JSON.parse(String(response.body))
download_url = data["data"]["links"]["download"]
world_sc_ = DataFrame(
hcat(
split.(
split(read(download(download_url), String), "\n")[2:end]
)...
) |> permutedims, :auto)
rename!(world_sc_, :x1 => :longname, :x2 => :characters)
world_sc = @pipe world_sc_.characters |>
mapslices(x -> split.(x, ""), _, dims=1) |>
hcat(_...) |>
permutedims |>
DataFrame(_, :auto) |>
insertcols!(_, 1, :longname => world_sc_.longname)

Next I fetch and prepare the metadata for the ASJP doculects.
file_id = "w4jnf"
url = "https://api.osf.io/v2/files/$(file_id)/"
response = HTTP.get(url)
data = JSON.parse(String(response.body))
download_url = data["data"]["links"]["download"]
asjp_languages = @pipe CSV.read(
download(download_url),
missingstring="",
DataFrame) |>
dropmissing(_, :classification_wals) |>
dropmissing(_, :Glottocode) |>
filter(row -> row.recently_extinct == 0, _) |>
filter(row -> row.long_extinct == 0, _) |>
select(_, [:Name, :Glottocode, :Family, :classification_wals]) |>
DataFrames.transform(_, [:classification_wals, :Name] => ByRow((x, y) -> string(x, ".", y)) => :longname) |>
select(_, Not(:classification_wals)) |>
DataFrames.transform(_, :longname => ByRow(x -> replace(x, "-" => "_")) => :longname) |>
dropmissing

I developed my own naming convention for ASJP doculects – [WALS family name].[WALS genus_name].[doculect name]. These must be matched with glottocodes.
longname2glottocode = Dict{String, String}(
zip(asjp_languages.longname, asjp_languages.Glottocode)
)
glottocode2longname = Dict{String, String}(
zip(asjp_languages.Glottocode, asjp_languages.longname)
)
glottocode2family = Dict{String, String}(
zip(asjp_languages.Glottocode, asjp_languages.Family)
)
for l in d.glottocode
if l ∉ keys(glottocode2longname)
longname2glottocode[l] = l
glottocode2longname[l] = l
end
end

Restricting the character vectors to the doculects for which I have a glottocode.
filter!(row -> row.longname ∈ asjp_languages.longname, world_cc)
filter!(row -> row.longname ∈ asjp_languages.longname, world_sc)

ASJP sometimes contains several doculects for the same glottocode. Therefore I now compute the number of missing entries for each doculect. For each glottocode, I select the ASJP doculect with the fewest missing entries as its representative.
insertcols!(
world_cc,
1,
:Glottocode => [longname2glottocode[x] for x in world_cc.longname]
)
insertcols!(
world_sc,
1,
:Glottocode => [longname2glottocode[x] for x in world_sc.longname]
)
best_languages = @pipe world_sc |>
DataFrame(
longname = _.longname,
Glottocode = _.Glottocode,
nGaps = map(x -> sum(Array(x) .== "-"), eachrow(_))
) |>
sort(_, :nGaps) |>
unique(_, :Glottocode).longname

The character matrices are now restricted to the doculects representing a glottocode.
filter!(row -> row.longname ∈ best_languages, world_cc)
filter!(row -> row.longname ∈ best_languages, world_sc)
select!(world_cc, Not(:longname))
select!(world_sc, Not(:longname))

For the glottocodes in the target set for which there are no ASJP data, a character vector consisting of missing entries is constructed. Then, the character matrices are restricted to the glottocodes from the target set.
for l in setdiff(d.glottocode, world_sc.Glottocode)
nl_cc = repeat(["-"], size(world_cc, 2))
nl_cc[1] = l
push!(world_cc, nl_cc)
nl_sc = repeat(["-"], size(world_sc, 2))
nl_sc[1] = l
push!(world_sc, nl_sc)
end
filter!(row -> row.Glottocode ∈ d.glottocode, world_cc)
filter!(row -> row.Glottocode ∈ d.glottocode, world_sc)

In the cognate class character matrix, all columns containing no 1 are removed.
select!(world_cc, Not([
i for i in axes(world_cc, 2)[2:end] if sum(world_cc[:, i] .== "1") == 0
]))

Now I add the Glottolog family names to d.
d[:,:Family] = replace.(d[:,:Family], " " => "_", "-" => "_", "'" => "")

Here is a helper function that takes a DataFrame object representing a character matrix and constructs the content of a Nexus file representing that matrix.
If a family only contains two taxa, a dummy taxon with all characters missing is added. This is required because MrBayes only works with datasets containing at least 3 taxa.
# create character matrices
"""
    df2nexus(cm)

Render the character matrix `cm` (first column `:Glottocode`, remaining
columns single-symbol character states) as the text of a Nexus DATA block
for MrBayes. For a two-taxon matrix an all-missing "dummy" taxon is
appended, because MrBayes requires at least three taxa.
"""
function df2nexus(cm)
    pad = maximum(length.(cm.Glottocode)) + 5
    nchar = size(cm, 2) - 1
    # Declare three taxa when only two are present; the dummy row below
    # makes up the difference.
    ntaxa = size(cm, 1) == 2 ? 3 : size(cm, 1)
    buf = IOBuffer()
    print(buf, "#Nexus\nBEGIN DATA;\n")
    print(buf, "DIMENSIONS ntax=", ntaxa, " nchar = ", nchar, ";\n")
    print(buf, "FORMAT DATATYPE=Restriction GAP=? MISSING=- interleave=no;\n")
    print(buf, "MATRIX\n")
    # One row per taxon: padded glottocode followed by the character states.
    for i in axes(cm, 1)
        print(buf, rpad(cm.Glottocode[i], pad), join(Array(cm[i, 2:end])), "\n")
    end
    if nrow(cm) == 2
        print(buf, rpad("dummy", pad), repeat("?", nchar), "\n")
    end
    print(buf, ";\nEND")
    return String(take!(buf))
end
# concatenating the two character matrices…
char_mtx = innerjoin(
world_sc,
world_cc,
on=:Glottocode => :Glottocode,
makeunique=true,
)

lineages = @pipe d |>
unique(_, :glottocode) |>
groupby(_, :Family) |>
combine(_, nrow) |>
sort(_,:nrow)
families = lineages.Family[lineages.nrow .> 1]Now I fetch the Glottolog classification as a vector of newick strings from the Glottolog website.
glottologF = "../data/tree_glottolog_newick.txt"
isfile(glottologF) || download(
"https://cdstar.eva.mpg.de//bitstreams/EAEA0-B701-6328-C3E3-0/tree_glottolog_newick.txt",
glottologF
)

raw = readlines(glottologF);

First some clean-up to make the newick strings digestible by ete3. Then, the newick tree for each family is read in as an ete3 tree object.
trees = []
for ln in raw
ln = strip(ln)
ln = replace(ln, r"\'[A-ZÄÖÜ][^[]*\[" => "[")
ln = replace(ln, r"\][^']*\'" => "]")
ln = replace(ln, r"\[|\]" => "")
ln = replace(ln, "''" => "")
ln = replace(ln, ":1" => "")
push!(
trees,
ete3.Tree(ln, format=1)
)
end

Next, the Glottolog trees for the individual families are combined into a Glottolog world tree.
This Glottolog tree contains internal nodes representing glottocodes. They may have daughter nodes representing dialects. To make sure that each glottocode is a leaf, I create an additional leaf daughter for each named internal node and shift the name of the internal node to that new leaf.
glot = ete3.Tree()
for t in trees
glot.add_child(t)
end
nonLeaves = [nd.name for nd in glot.traverse()
if (nd.name != "") & !nd.is_leaf()
]
@showprogress for nm in nonLeaves
nd = (glot & nm)
nd.name = ""
nd.add_child(name=nm)
end

Next I create a dictionary with the families from the target set as keys. For each target family, I prune the Glottolog world tree to the glottocodes from that family and store it as a newick string in the dictionary.
"""
    family_to_tree(fm; d=d, glot=glot)

Prune (a copy of) the Glottolog world tree `glot` down to the glottocodes
that `d` assigns to family `fm`, and return the pruned tree as a newick
string (ete3 format 9: leaf names only, no branch lengths).
"""
function family_to_tree(fm; d=d, glot=glot)
    taxa = d.glottocode[d.Family .== fm]
    # Work on a copy so the shared world tree is left untouched.
    pruned = glot.copy()
    pruned.prune(taxa)
    return pruned.write(format=9)
end
glot_tree_dict = Dict()
for fm in families
@info fm
glot_tree_dict[fm] = family_to_tree(fm)
endFinally the MrBayes files are created.
For each family, there are three nexus files:
- the file containing the character matrix, and
- two
MrBayesscripts.
Using two MrBayes scripts is a hack because
- I want at least 100,000 iterations per chain, and
- I want to apply the early stop rule which terminates a chain when the topologies have sufficiently converged.
If I use the early stop rule from the outset, sampling for very small families will stop right away because there is no (or little) phylogenetic uncertainty. It is still advisable to do some sampling though, to get a good estimate for the branch lengths.
Not using the stop rule is not a good option either, because then I have to fix a sufficiently large number of iterations. For large families, this is in the tens of millions, which would be a waste of resources for smaller families.
As a compromise, I use the first script to run 10,000,000 iterations without the stop rule, and then continue up to 1,000,000,000 iterations with the stop rule.
For the latter, I prepare the following analysis:
- characters are partitioned into the cc and sc characters (see above)
- relaxed clock
- gamma-distributed rate variation
- equilibrium probabilities and the clock rates are estimated for the two partitions separately
- for the cc characters, ascertainment bias correction is conducted because all-0 columns are not included
- the Glottolog tree is used as constraint tree
- if the dataset contains a dummy taxon (because there are only two real taxa), the dummy taxon is treated as outgroup,
- if the family contains more than 100 taxa, the mcmc is run with four runs and four chains, otherwise with two runs and two chains,
- the maximum is 1,000,000,000 iterations, with an early stop once the average standard deviation of split frequencies drops to 0.05 or below
"""
    create_mb_script(fn, char_mtx, clades, fm_cc, fm_glottocodes,
                     n_iterations, n_runs, n_chains, append, stoprule)

Assemble the text of a MrBayes batch script for one language family.

- `fn`: base file name; the script executes `fn.nex` and writes to `output/fn`.
- `char_mtx`: concatenated sc+cc character matrix (used only for its width).
- `clades`: vectors of glottocodes, one per internal Glottolog node, used
  as topology constraints.
- `fm_cc`: the family's cognate-class matrix (Glottocode column used for
  the two-taxon constraint).
- `fm_glottocodes`: glottocodes of the family's taxa.
- `n_iterations`, `n_runs`, `n_chains`: MCMC settings.
- `append`, `stoprule`: "yes"/"no" strings passed through to MrBayes
  (head script: no/no; tail script: yes/yes).

Returns the script as a `String`.
"""
function create_mb_script(
fn,
char_mtx,
clades,
fm_cc,
fm_glottocodes,
n_iterations,
n_runs,
n_chains,
append,
stoprule
)
# Fixed model setup: two partitions (sound-class chars 1-1640, cognate-class
# chars from 1641 on), uniform clock with IGR relaxation, gamma rates,
# parameters unlinked between partitions; cc partition gets ascertainment
# bias correction (noabsencesites) because all-0 columns were dropped.
# NOTE(review): the sc partition width 1640 is hard-coded — assumes the OSF
# soundclass matrix always has 1640 characters; TODO confirm.
mb = """#Nexus
Begin MrBayes;
execute $fn.nex;
charset sc = 1-1640;
charset cc = 1641-$(size(char_mtx, 2)-1);
partition dtype = 2:sc, cc;
set partition = dtype;
prset applyto=(all) brlenspr = clock:uniform;
prset applyto=(all) clockvarpr = igr;
lset applyto=(all) rates=gamma;
unlink Statefreq=(all) shape=(all) igrvar=(all) rate=(all);
prset applyto=(all) ratepr=Dirichlet(1, 1);
prset applyto=(2) clockratepr=exp(1.0); [for partition 2]
lset applyto=(1) coding=all;
lset applyto=(2) coding=noabsencesites;
"""
# One topology constraint per internal Glottolog node (only meaningful
# when there is more than one clade).
if length(clades) > 1
for (i, cl) in enumerate(clades)
cn = join(cl, " ")
mb *= " constraint c$i = "
mb *= "$cn;\n"
end
mb *= " prset topologypr = constraints("
mb *= join(["c$i" for i in 1:length(clades)], ",") * ");\n"
end
# Two-taxon family: constrain the two real taxa together so the dummy
# taxon (added by df2nexus) behaves as an outgroup.
if length(fm_glottocodes) == 2
mb *= "constraint c1 = $(join(fm_cc.Glottocode, " "));\n"
mb *= "prset topologypr = constraints(c1);\n"
end
# Large families use BEAGLE in double precision for numerical stability.
if length(fm_glottocodes) > 100
mb *= " set beagleprecision=double;\n"
end
# NOTE(review): stopval=0.05 here, while the prose above promises an early
# stop at average split-frequency SD <= 0.01 — confirm which is intended.
mb *= """ prset brlenspr = clock:uniform;
prset clockvarpr = igr;
mcmcp stoprule=$stoprule stopval=0.05 filename=output/$fn samplefreq=1000;
mcmc ngen=$n_iterations nchains=$n_chains nruns=$n_runs append=$append;
sumt;
sump;
q;
end;
"""
return mb
end
mkpath("mrbayes/output")
# For each family: restrict both character matrices to the family's taxa,
# drop uninformative cc columns, join sc+cc, extract Glottolog clades as
# constraints, and write the Nexus data file plus head/tail MrBayes scripts.
@showprogress for (i, fm) in enumerate(families)
fm_glottocodes = d.glottocode[d.Family.==fm]
# File stem: zero-padded index + family name, e.g. "052_Atlantic_Congo".
fn = lpad(i, 3, "0")*"_"*fm
fm_cc = @pipe world_cc |>
filter(row -> row.Glottocode ∈ fm_glottocodes, _)
fm_characters = names(fm_cc)[2:end]
# Keep only cc columns with at least one "1" within this family
# (matches the ascertainment-bias coding in the MrBayes script).
informative = map(x -> sum(string.(fm_cc[:,x]) .== "1") .> 0, fm_characters)
fm_cc = select(
fm_cc,
vcat(["Glottocode"], fm_characters[informative])
)
fm_sc = @pipe world_sc |>
filter(row -> row.Glottocode ∈ fm_glottocodes, _)
fm_characters = [x for x in names(fm_sc) if x != "Glottocode"]
fm_sc = select(
fm_sc,
vcat(["Glottocode"], fm_characters)
)
# sc columns first, then cc — the script's charset boundaries rely on this order.
char_mtx = innerjoin(
fm_sc,
fm_cc,
on=:Glottocode => :Glottocode,
makeunique=true,
)
# Re-read the pruned Glottolog newick; its internal nodes define the
# topology constraints.
fm_glot = ete3.Tree(glot_tree_dict[fm], format=1)
internal_nodes = [
nd for nd in fm_glot.traverse()
if nd.is_leaf() == false && nd.is_root() == false
]
clades = [x.get_leaf_names() for x in internal_nodes]
n_iterations_head = 10_000_000
n_iterations_tail = 1_000_000_000
# NOTE(review): families with MORE than 100 taxa get 4 runs/chains here,
# but the prose above says families with AT MOST 100 taxa should get 4 —
# one of the two is wrong; confirm the intended policy.
n_chains = length(fm_glottocodes) > 100 ? 4 : 2
n_runs = length(fm_glottocodes) > 100 ? 4 : 2
# Head script: fixed-length run, no stop rule, fresh start.
mb_head = create_mb_script(
fn,
char_mtx,
clades,
fm_cc,
fm_glottocodes,
n_iterations_head,
n_runs,
n_chains,
"no",
"no"
)
# Tail script: appends to the head run, with the early-stop rule active.
mb_tail = create_mb_script(
fn,
char_mtx,
clades,
fm_cc,
fm_glottocodes,
n_iterations_tail,
n_runs,
n_chains,
"yes",
"yes"
)
write("mrbayes/$(fn)_head.mb.nex", mb_head)
write("mrbayes/$(fn)_tail.mb.nex", mb_tail)
write("mrbayes/$fn.nex", df2nexus(char_mtx))
end

This completes data preparation for MrBayes.
In the next step, all MrBayes scripts must be run, ideally with as much parallelization as possible. I used the following shell script on a powerful server for this:
#!/bin/bash
# Run all MrBayes (mb-mpi) scripts in parallel, capping the number of
# concurrently running jobs at $max_jobs. The two largest families
# (Atlantic-Congo, Austronesian) get 16 MPI ranks; all others get 4.
# Head scripts run to completion before any tail script starts.
cd mrbayes || exit 1
max_jobs=25

# Block until fewer than $max_jobs jobs are RUNNING, then launch
# `mpirun -np $1 mb-mpi $2` in the background.
# Fix: `jobs -r` counts only running jobs; plain `jobs` also lists
# finished-but-unreaped ("Done") entries, which could wrongly saturate
# the limit and stall submission near the end of a batch.
run_with_limit() {
    while [ "$(jobs -r | wc -l)" -ge "$max_jobs" ]; do
        sleep 1
    done
    mpirun -np "$1" mb-mpi "$2" &
}

# First pass: head scripts (fixed 10M iterations, no stop rule).
for file in *_head.mb.nex; do
    if [[ "$file" == "052_Atlantic_Congo_head.mb.nex" || "$file" == "053_Austronesian_head.mb.nex" ]]; then
        run_with_limit 16 "$file"
    else
        run_with_limit 4 "$file"
    fi
done
wait

# Second pass: tail scripts (append to head runs, stop rule active).
for file in *_tail.mb.nex; do
    if [[ "$file" == "052_Atlantic_Congo_tail.mb.nex" || "$file" == "053_Austronesian_tail.mb.nex" ]]; then
        run_with_limit 16 "$file"
    else
        run_with_limit 4 "$file"
    fi
done
echo "All jobs submitted."