Set the working directory to the location of this file and activate the local environment.
cd(@__DIR__)
using Pkg
Pkg.activate(".")
Pkg.instantiate()
Activating project at `/localscratch/nwsja01/projects/papers/computational_typology_routledge/code`
Load packages.
using CSV
using DataFrames
using Pipe
using ProgressMeter
using JSON
using HTTP
using Downloads
using Statistics
using RCall
using PyCall
ete3 = pyimport("ete3")
PyObject <module 'ete3' from '/localscratch/nwsja01/miniconda3/envs/worldtree_msa/lib/python3.11/site-packages/ete3/__init__.py'>
Download the WALS dataset and unzip it.
# Define the URL and the destination file path
url = "https://zenodo.org/records/13950591/files/cldf-datasets/wals-v2020.4.zip?download=1"
destfile = "wals-v2020.4.zip"
# Download the file
Downloads.download(url, destfile)
# Unzip the file (requires the `unzip` utility on the PATH)
run(`sh -c "unzip -o $destfile -d ../data/wals-v2020.4 > /dev/null 2>&1"`)
# Clean up
rm(destfile)
Download the Phoible dataset.
url_ = "https://github.com/phoible/dev/blob/master/data/phoible.csv?raw=true"
destfile = "phoible.csv"
# Download the file
Downloads.download(url_, destfile)
# Define column types
col_types = Dict(
    "InventoryID" => Int,
    "Marginal" => Bool
)
# Read the CSV file with specified column types
phoible = CSV.File(destfile; types=col_types, missingstring="NA") |> DataFrame
first(phoible, 10)
10-row preview (Korean, InventoryID 1; distinctive-feature and all-missing columns omitted):
 Row │ InventoryID  Glottocode  ISO6393  LanguageName  GlyphID    Phoneme  Allophones  SegmentClass  Source
   1 │ 1            kore1280    kor      Korean        0068       h        ç h ɦ       consonant     spa
   2 │ 1            kore1280    kor      Korean        006A       j        j           consonant     spa
   3 │ 1            kore1280    kor      Korean        006B       k        k̚ ɡ k       consonant     spa
   4 │ 1            kore1280    kor      Korean        006B+02B0  kʰ       kʰ          consonant     spa
   5 │ 1            kore1280    kor      Korean        006B+02C0  kˀ       kˀ          consonant     spa
   6 │ 1            kore1280    kor      Korean        006C       l        ɾ l lʲ      consonant     spa
   7 │ 1            kore1280    kor      Korean        006D       m        mb m        consonant     spa
   8 │ 1            kore1280    kor      Korean        006E       n        nd nʲ n̚ n   consonant     spa
   9 │ 1            kore1280    kor      Korean        0070       p        p b p̚       consonant     spa
  10 │ 1            kore1280    kor      Korean        0070+02B0  pʰ       pʰ          consonant     spa
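As a quick, illustrative sanity check (not part of the original pipeline), the distribution of inventory sizes can be summarized with the DataFrames and Statistics tools loaded above:
# Illustrative: number of segments per phoneme inventory.
inventory_sizes = combine(groupby(phoible, :InventoryID), nrow => :n_segments)
extrema(inventory_sizes.n_segments), median(inventory_sizes.n_segments)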
Download the ASJP dataset and unzip it.
url = "https://zenodo.org/records/7079637/files/lexibank/asjp-v20.zip?download=1"
destfile = "../data/asjp-v20.zip"
# Download the file
Downloads .download (url, destfile)
# Unzip the file
run (`sh -c "unzip -o $destfile -d ../data/asjp-v20 > /dev/null 2>&1"` )
# Clean up
rm (destfile)
Load WALS data.
wals_values = CSV.File("../data/wals-v2020.4/cldf-datasets-wals-0f5cd82/cldf/values.csv") |> DataFrame
wals_languages = CSV.File("../data/wals-v2020.4/cldf-datasets-wals-0f5cd82/cldf/languages.csv") |> DataFrame
wals_parameters = CSV.File("../data/wals-v2020.4/cldf-datasets-wals-0f5cd82/cldf/parameters.csv") |> DataFrame
wals_codes = @pipe CSV.File("../data/wals-v2020.4/cldf-datasets-wals-0f5cd82/cldf/codes.csv") |> DataFrame |> select(_, :Parameter_ID, :Name, :Number);
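For orientation, the full names of the two features used below can be looked up in the parameters table (an illustrative query, assuming the standard CLDF ID and Name columns):
# Illustrative: look up the two WALS features used in this analysis.
filter(row -> row.ID ∈ ["26A", "85A"], wals_parameters)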
Extract the values for the WALS features “Affix” (26A) and “Adposition” (85A).
d1 = @pipe wals_values |>
    filter(row -> row.Parameter_ID ∈ ["26A", "85A"]) |>
    select(_, :Language_ID, :Parameter_ID, :Code_ID) |>
    leftjoin(_, wals_values, on = [:Language_ID, :Code_ID, :Parameter_ID], makeunique=true) |>
    leftjoin(_, wals_codes, on = [:Parameter_ID => :Parameter_ID, :Code_ID => :Number], makeunique=true) |>
    select(_, Not(:ID, :Comment, :Code_ID, :Source, :Example_ID, :Name)) |>
    unstack(_, :Parameter_ID, :Value) |>
    dropmissing(_) |>
    rename(_, [:Language_ID, :Affix, :Adposition]) |>
    leftjoin(_, wals_languages, on = :Language_ID => :ID) |>
    select(_, :Glottocode, :Name, :Macroarea, :Longitude, :Latitude, :Family, :Affix, :Adposition) |>
    dropmissing(_, :Glottocode) |>
    unique(_, :Glottocode);
Filter the WALS values: keep only languages with postpositions or prepositions (codes 1 and 2 of feature 85A), and drop languages with little inflectional affixation (code 1 of feature 26A).
filter!(row -> row.Adposition ∈ [1, 2], d1)
filter!(row -> row.Affix != 1, d1);
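An illustrative cross-tabulation (not part of the original pipeline) shows how the remaining languages distribute over the two features:
# Illustrative: counts per (Affix, Adposition) combination.
combine(groupby(d1, [:Affix, :Adposition]), nrow => :n)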
Get the world tree from OSF.
file_id = "sbh4q"
url = "https://api.osf.io/v2/files/ $ (file_id)/"
response = HTTP.get (url)
data = JSON.parse (String (response.body))
download_url = data["data" ]["links" ]["download" ]
ml_tree = ete3.Tree (read (Downloads .download (download_url, timeout= 1000 ), String ))
PyObject Tree node '' (0x7f7d7c2d519)
Use mad.py to root the tree via minimal ancestor deviation (MAD).
ml_tree.write(format=1, outfile="mltree.tre")
run(`python mad.py mltree.tre`)
# mad.py reports tied root positions; keep only the first rooted tree.
run(`bash -c "head -n 1 mltree.tre.rooted > mltree.tre.rooted.head"`)
ml_tree = ete3.Tree("mltree.tre.rooted.head")
MAD phylogenetic rooting
Analyzing file 'mltree.tre'...
>> Warning: Trees with repeating branch lengths are suspicious (290 repeating values).
>> Warning: Root is polytomous.
>> [MAD=0.138_AI=1.000_CCV=18.5%_N=1/3]
>> Warning: Root is polytomous.
>> [MAD=0.138_AI=1.000_CCV=18.5%_N=2/3]
>> [MAD=0.138_AI=1.000_CCV=18.5%_N=3/3]
Minimal ancestor deviation, MAD = 0.138
Ambiguity index, AI = 1.000
Clock CV, CCV = 18.5%, 18.5%, 18.5%
Tied root positions, 3 rooted trees written to mltree.tre.rooted
- Please cite DOI:10.1038/s41559-017-0193
PyObject Tree node '' (0x7f7d6ab6fe1)
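The mapping below relies on an asjp_languages table whose construction is not shown in this section. What follows is a minimal sketch of how it could be built from the unzipped ASJP CLDF data; the directory lookup, the classification_wals column, and the longname construction are assumptions, not the original code:
# Hypothetical reconstruction (not the original code): load the ASJP CLDF
# languages table and build WALS-style doculect identifiers such as
# "NC.BANTOID.VILI_3" (classification plus doculect name).
asjp_dir = only(filter(isdir, readdir("../data/asjp-v20"; join=true)))
asjp_languages = @pipe CSV.File(joinpath(asjp_dir, "cldf", "languages.csv")) |>
    DataFrame |>
    dropmissing(_, [:Glottocode, :classification_wals])
asjp_languages.longname = asjp_languages.classification_wals .* "." .* asjp_languages.Name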
Map ASJP longnames to Glottocodes.
longname2glottocode = Dict{String, String}(
    zip(asjp_languages.longname, asjp_languages.Glottocode)
)
glottocode2longname = Dict{String, String}(
    zip(asjp_languages.Glottocode, asjp_languages.longname)
)
Dict{String, String} with 5077 entries:
"vili1238" => "NC.BANTOID.VILI_3"
"avar1256" => "NDa.AVAR_ANDIC_TSEZIC.AVAR_ZAKATALY"
"krun1240" => "AuA.BAHNARIC.BRAO_KRUNG"
"gwii1239" => "KK.KHOE_KWADI.GWI"
"sout2797" => "NC.GUR.SOUTHERN_TOUSSIAN"
"kadu1253" => "ST.BURMESE_LOLO.KADUO"
"kili1268" => "Hok.YUMAN.KILIWA"
"hiww1237" => "An.OCEANIC.HIW"
"mart1256" => "PN.WESTERN_PAMA_NYUNGAN.YULPARIJA"
"cham1312" => "An.CHAMORRO.CHAMORRO"
"akwa1248" => "NC.BANTOID.AKWA"
"kups1238" => "ESu.NILOTIC.KUPSAPIINY"
"towe1240" => "Pau.WESTERN_PAUWASI.TOWEI"
"trum1247" => "Tru.TRUMAI.TRUMAI"
"yimc1240" => "ST.KUKI_CHIN.NAGA_YIMCHUNGRU"
"east2346" => "ST.BODIC.EASTERN_TAMANG"
"lagw1237" => "AA.BIU_MANDARA.LAGWAN"
"duru1249" => "NC.BANTOID.DURUMA"
"leco1242" => "Lek.LEKO.LEKO"
⋮ => ⋮
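For example, using an entry visible in the output above:
glottocode2longname["cham1312"]                # "An.CHAMORRO.CHAMORRO"
longname2glottocode["An.CHAMORRO.CHAMORRO"]    # "cham1312"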
Prune the world tree to the relevant languages.
tree_taxa = intersect(ml_tree.get_leaf_names(), asjp_languages.longname)
ml_tree.prune([ml_tree & x for x in tree_taxa if x ∈ asjp_languages.longname])
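One caveat worth noting: by default, ete3's prune discards the lengths of collapsed internal branches, and branch lengths feed the chronos dating step later. ete3 offers a keyword to preserve root-to-leaf distances; this is a hedged suggestion, not the original call:
# Optional variant (assumption): keep root-to-leaf distances intact while pruning.
# ml_tree.prune([ml_tree & x for x in tree_taxa], preserve_branch_length=true)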
Get the character matrix from OSF.
This is necessary because the ASJP world tree uses ASJP doculect identifiers, and there are often several ASJP doculects per Glottocode. For each Glottocode, we pick the doculect with the fewest missing data.
file_id = "3em9h"
url = "https://api.osf.io/v2/files/ $ (file_id)/"
response = HTTP.get (url)
data = JSON.parse (String (response.body))
download_url = data["data" ]["links" ]["download" ]
world_sc_ = DataFrame (
hcat (
split .(
split (read (Base .download (download_url), String ), " \n " )[2 : end ]
)...
) |> permutedims, : auto)
rename! (world_sc_, : x1 => : longname, : x2 => : characters)
# Split each character string into one column per character.
world_sc = @pipe world_sc_.characters |>
    mapslices(x -> split.(x, ""), _, dims=1) |>
    hcat(_...) |>
    permutedims |>
    DataFrame(_, :auto) |>
    insertcols!(_, 1, :longname => world_sc_.longname)
7432×1641 DataFrame
  Row │ longname              x1  x2  x3  x4  ⋯
    1 │ AA.DIZOID.NAO         0   0   0   0   ⋯
    2 │ AuA.KHASIAN.KHASI     0   0   0   0   ⋯
    3 │ AuA.KHASIAN.KHASI_2   0   0   0   0   ⋯
    ⋮ │          ⋮            ⋮   ⋮   ⋮   ⋮   ⋱
 7431 │ AA.WEST_CHADIC.DERA   -   -   -   -   ⋯
 7432 │ NC.KAINJI.SEGEMUK     -   -   -   -   ⋯
(binary sound-class characters; “-” marks missing data; remaining rows and columns omitted)
Select the doculects with the fewest missing data.
insertcols!(world_sc, 1, :Glottocode => [longname2glottocode[row.longname] for row in eachrow(world_sc)])
best_languages = @pipe world_sc |>
    DataFrame(
        longname = _.longname,
        Glottocode = _.Glottocode,
        nGaps = map(x -> sum(Array(x) .== "-"), eachrow(_))
    ) |>
    sort(_, :nGaps) |>
    unique(_, :Glottocode).longname
4261-element Vector{SubString{String}}:
"AuA.KHASIAN.KHASI"
"Hok.YUMAN.YAVAPAI"
"Iwa.IWAIDJAN.IWAIDJA"
"ST.BODIC.BUNAN"
"ST.BODIC.EASTERN_BALTI"
"ST.BODIC.GHACHOK"
"ST.BODIC.HELAMBU_SHERPA"
"ST.BODIC.KAGATE"
"ST.BODIC.LHASA_TIBETAN"
"ST.BODIC.LOWA"
⋮
"TNG.BINANDEREAN.GAINA"
"NDe.ATHAPASKAN.HAN"
"TNG.BINANDEREAN.OROKAIVA_SOSE"
"ESu.NILOTIC.SOGOO"
"NC.BANTOID.KOSHIN"
"CSu.BONGO_BAGIRMI.GULA_SARA"
"An.GREATER_CENTRAL_PHILIPPINE.MANDAYAN_ISLAM_PISO"
"An.OCEANIC.PENRHYN"
"AA.BIU_MANDARA.VEMGO_MABAS_2"
Prune the tree to the selected doculects and relabel its leaves with Glottocodes.
ml_tree.prune([ml_tree & x for x ∈ best_languages])
for l in ml_tree.get_leaves()
    l.name = longname2glottocode[l.name]
end
Select the typological data rows that match a leaf in the tree.
tree_taxa = ml_tree.get_leaf_names()
filter!(row -> row.Glottocode ∈ tree_taxa, d1)
filter!(row -> row.Glottocode ∈ tree_taxa, d2)
CSV.write("../data/affix_adposition.csv", d1)
CSV.write("../data/soundpop.csv", d2)
Create two trees – one for each dataset.
tree1 = ml_tree.copy()
tree1.prune([tree1 & x for x ∈ d1.Glottocode])
tree2 = ml_tree.copy()
tree2.prune([tree2 & x for x ∈ d2.Glottocode])
tree1.write(format=1, outfile="tree1.tre")
tree2.write(format=1, outfile="tree2.tre")
Make the trees ultrametric with ape's chronos (penalized-likelihood dating), after guarding against zero-length branches and extreme branch-length ranges.
R"""
library(ape)
# Load rooted tree
tree1 <- read.tree("tree1.tre")
tree2 <- read.tree("tree2.tre")
# Resolve zero-length internal nodes
tree1 <- di2multi(tree1)
tree2 <- di2multi(tree2)
# Replace very small or zero branch lengths
tree1 $ edge.length[tree1 $ edge.length <= 1e-3] <- 1e-3
tree2 $ edge.length[tree2 $ edge.length <= 1e-3] <- 1e-3
# Scale branch lengths to avoid massive numerical ranges
max_depth1 <- max(node.depth.edgelength(tree1))
scale_factor1 <- 1 / max_depth1
tree1 $ edge.length <- tree1 $ edge.length * scale_factor1
max_depth2 <- max(node.depth.edgelength(tree2))
scale_factor2 <- 1 / max_depth2
tree2 $ edge.length <- tree2 $ edge.length * scale_factor2
# Combine trees
ultra_tree1 <- chronos(
tree1,
lambda = 10, # strong smoothing
model = "correlated", # more stable than "relaxed"
control = chronos.control(epsilon = 1e-6, iter.max = 1000, eval.max = 5000)
)
write.tree(ultra_tree1, "../data/affix_adposition.tre")
ultra_tree2 <- chronos(
tree2,
lambda = 10, # strong smoothing
model = "correlated", # more stable than "relaxed"
control = chronos.control(epsilon = 1e-6, iter.max = 1000, eval.max = 5000)
)
write.tree(ultra_tree2, "../data/soundpop.tre")
"""
Setting initial dates...
Fitting in progress... get a first set of estimates
(Penalised) log-lik = -85345.23
Optimising rates... dates... -85345.23
Optimising rates... dates... -296.365
Optimising rates... dates... -249.5508
Optimising rates... dates... -247.7283
Optimising rates... dates... -247.5299
Optimising rates... dates... -247.5265
Optimising rates... dates... -247.5251
Optimising rates... dates... -247.5244
Optimising rates... dates... -247.5238
Optimising rates... dates... -247.5236
Optimising rates... dates... -247.5235
Optimising rates... dates... -247.5234
log-Lik = -242.3904
PHIIC = 3409.12
┌ Warning: RCall.jl: Warning: function evaluation limit reached without convergence (9)
└ @ RCall /localscratch/nwsja01/.julia/packages/RCall/0ggIQ/src/io.jl:172
Setting initial dates...
Fitting in progress... get a first set of estimates
(Penalised) log-lik = -9404449
Optimising rates... dates... -9404449
Optimising rates... dates... -2993.863
Optimising rates... dates... -1094.443
Optimising rates... dates... -740.5919
Optimising rates... dates... -684.9216
Optimising rates... dates... -665.2583
Optimising rates... dates... -658.893
Optimising rates... dates... -656.8436
Optimising rates... dates... -656.0729
Optimising rates... dates... -656.0729
log-Lik = -655.0175
PHIIC = 10350.64
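As an illustrative check (not in the original notebook), ape can confirm that the dated trees are indeed ultrametric:
R"""
# Illustrative check: both dated trees should now be ultrametric.
c(is.ultrametric(read.tree("../data/affix_adposition.tre")),
  is.ultrametric(read.tree("../data/soundpop.tre")))
"""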
Make copies of the trees that use longnames as leaf labels (for human readability).
tree1 = ete3.Tree("../data/affix_adposition.tre")
tree2 = ete3.Tree("../data/soundpop.tre")
for l in tree1.get_leaves()
    l.name = glottocode2longname[l.name]
end
for l in tree2.get_leaves()
    l.name = glottocode2longname[l.name]
end
tree1.write(format=1, outfile="../data/affix_adposition_longnames.tre")
tree2.write(format=1, outfile="../data/soundpop_longnames.tre")
Clean up.
rm ("../data/asjp-v20" , recursive= true , force= true )
rm ("../data/wals-v2020.4" , recursive= true , force= true )
rm ("phoible.csv" , force= true )
rm ("mltree.tre" , force= true )
rm ("mltree.tre.rooted" , force= true )
rm ("mltree.tre.rooted.head" , force= true )
rm ("tree1.tre" , force= true )
rm ("tree2.tre" , force= true )