Prepare data

Setting the working directory and activating the local Julia environment.

# Change to the directory containing this file, then into the sibling `code`
# directory, so that the relative paths used below resolve from there.
cd(@__DIR__)
cd("../code/")
# Activate the project found in the current directory and install the
# dependencies it declares.
using Pkg
Pkg.activate(".")
Pkg.instantiate()
# Packages used by the rest of this notebook.
using CSV, DataFrames, Pipe
using ProgressMeter
  Activating project at `~/projects/research/asjp20world_tree/code`

Downloading ASJP from Zenodo.

# Scratch directory for the downloaded archive; it is removed again below.
mkpath("tmp")

# Local copies of the ASJP CLDF tables that the rest of the notebook reads.
languagesF = "../data/languages.csv"
formsF = "../data/forms.csv"
parametersF = "../data/asjp_parameters.csv"

# Download and unpack the archive unless all three tables are already present.
# The parameters table is read later as well, so it has to be part of the check;
# otherwise a missing parameters file would never be re-fetched.
if !all(isfile, (languagesF, formsF, parametersF))
      asjpZip = download(
            "https://zenodo.org/record/7079637/files/lexibank/asjp-v20.zip",
            "tmp/asjp-v20.zip",
      )
      run(`unzip -o $asjpZip -d tmp/`)
      cldfDir = "tmp/lexibank-asjp-f0f1d0d/cldf"
      # Make sure the destination directory exists before copying into it.
      mkpath(dirname(formsF))
      # force = true: overwrite a table that is already there instead of
      # erroring when only some of the three files were missing.
      cp(joinpath(cldfDir, "forms.csv"), formsF; force = true)
      cp(joinpath(cldfDir, "languages.csv"), languagesF; force = true)
      cp(joinpath(cldfDir, "parameters.csv"), parametersF; force = true)
end
# Removing the scratch directory recursively also deletes everything that was
# unpacked into it, so no separate per-file cleanup is needed.
rm("tmp", recursive = true)

Loading ASJP cldf files into DataFrames.

# Read the three CLDF tables, one DataFrame per file.
forms, languages, parameters =
      map(path -> CSV.read(path, DataFrame), (formsF, languagesF, parametersF))

# Drop (in place) every language row that lacks any of the three
# classification strings.
classificationCols =
      [:classification_wals, :classification_ethnologue, :classification_glottolog]
dropmissing!(languages, classificationCols)
9912×18 DataFrame
9887 rows omitted
Row ID Name Glottocode Glottolog_Name ISO639P3code Macroarea Latitude Longitude Family classification_wals classification_ethnologue classification_glottolog recently_extinct long_extinct year_of_extinction code_wals code_iso transcribers
String String String15? String? String3? String15? Float64? Float64? String31? String String String Bool Bool Int64? String3? String3? String?
1 A51_BAFIA_MAJA A51_BAFIA_MAJA lefa1242 Lefa lfa Africa 5.1 11.2 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia false false missing missing lfa Ann-Katrin Wett
2 A51_BAFIA_TUMI_TINGON A51_BAFIA_TUMI_TINGON lefa1242 Lefa lfa Africa 5.1 11.2 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia false false missing missing lfa Ann-Katrin Wett
3 A51_BAFIA_ZAKAAN A51_BAFIA_ZAKAAN lefa1242 Lefa lfa Africa 5.1 11.2 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia false false missing missing lfa Ann-Katrin Wett
4 A53_BAFIA_RIKPA A53_BAFIA_RIKPA bafi1243 Bafia ksf Africa 5.0 11.17 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.53) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia false false missing bfi ksf Ann-Katrin Wett
5 A54_BAFIA_NJANTI A54_BAFIA_NJANTI tibe1274 Tibea ngy Africa 5.3 11.3 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.54) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50) false false missing missing ngy Ann-Katrin Wett
6 A60_GUNU A60_GUNU nugu1242 Nugunu (Cameroon) yas Africa 4.58 11.25 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.622) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60),Mmala-Elip-Gunu,Elip-Gunu false false missing gun yas Ann-Katrin Wett
7 A60_MMAALA A60_MMAALA mmaa1238 Mmaala mmu Africa 4.5 11.08 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.62) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60),Mmala-Elip-Gunu false false missing missing mmu Ann-Katrin Wett
8 A61_NGORO_ASOM A61_NGORO_ASOM tuki1240 Tuki bag Africa 4.58 11.5 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.601) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Sanaga-WestMbam(A.40),Sanaga(A.60) false false missing tki bag Ann-Katrin Wett
9 A62_KALONGE A62_KALONGE yang1293 Yangben yav Africa 4.43 11.08 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.62) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60) false false missing missing yav Ann-Katrin Wett
10 A72a_EWONDO A72a_EWONDO ewon1239 Ewondo ewo Africa 4.0 12.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Ewondo-Fang(A.72) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Basaa-Yaunde(A40-70),Yaunde-Fang(A.70),Ewondo-Bebele false false missing ewo ewo Ann-Katrin Wett
11 AASAX AASAX aasa1238 Aasax aas Africa -4.04 37.16 Afro-Asiatic AA.SOUTHERN_CUSHITIC Afro-Asiatic,Cushitic,South Afro-Asiatic,Cushitic,SouthCushitic true false 2010 missing aas Darja Appelganz
12 ABAGA ABAGA abag1245 Abaga abg Papunesia -6.17 145.67 Nuclear Trans New Guinea TNG.SIANE-YAGARIA Trans-NewGuinea,Madang,Kalam-Kobon Nuclear_Trans_New_Guinea,Kainantu-Goroka,Goroka,NuclearGoroka,Siane-Yagaria,Kamano-Yagaria,UnclassifiedKamano-Yagaria false false missing missing abg Matthew S. Dryer and Søren Wichmann
13 ABANYOM ABANYOM aban1242 Abanyom abm Africa 6.29 8.63 Atlantic-Congo NC.EKOID-MBE Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,Ekoid,Bakor Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,Ekoid-Mbe,Ekoid,Bakor-Ejagham,Bakor,NorthernBakor,Abanyom-Nkem-Nkum false false missing missing abm Guillaume Segerer and Søren Wichmann
9901 ZOOMBO_4 ZOOMBO_4 koon1244 South-Central Koongo kng Africa -5.0 15.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,H,Kikongo(H.16) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Central-WesternBantu,West-CoastalBantu,Nzadic,Lweric,Dingic,Loange-Atlantic,KLCExtended,KikongoLanguageCluster,NuclearKLC,Kikongoic,KambakunyicKikongo,KilaadicKikongo,Central-SouthernKikongo,SoutheasternKikongo,SouthernKikongo,Koongo-Kituba false false missing fio kng Ann-Katrin Wett
9902 ZOQUE_FRANCISCO_LEON ZOQUE_FRANCISCO_LEON fran1266 Francisco León Zoque zos North America 17.33 -93.25 Mixe-Zoque MZ.MIXE-ZOQUE Mixe-Zoquean,Zoquean,ChiapasZoquean Mixe-Zoque,Zoque,ChiapasZoque false false missing zfl zos Søren Wichmann and Viveka Velupillai
9903 ZOQUE_RAYON ZOQUE_RAYON rayo1235 Rayón Zoque zor North America 17.08 -93.0 Mixe-Zoque MZ.MIXE-ZOQUE Mixe-Zoquean,Zoquean,ChiapasZoquean,NortheastZoque Mixe-Zoque,Zoque,ChiapasZoque false false missing zqr zor Søren Wichmann and Viveka Velupillai
9904 ZOROP ZOROP yafi1240 Yafi wfg Papunesia -3.42 140.92 Pauwasi Pau.EASTERN_PAUWASI Pauwasi,Eastern Pauwasi,EasternPauwasi false false missing missing wfg Matthew S. Dryer
9905 ZUGUNUK_KALASHA ZUGUNUK_KALASHA kala1372 Chitral Kalasha kls Eurasia 35.49 71.7 Indo-European IE.INDIC Indo-European,Indo-Iranian,Indo-Aryan,OuterLanguages,Northwestern,Dardic,Chitral Indo-European,ClassicalIndo-European,Indo-Iranian,Indo-Aryan,Indo-AryanNorthwesternzone,Chitral false false missing klh kls missing
9906 ZULGO ZULGO zulg1242 Zulgo-Gemzek gnd Africa 10.83 14.08 Afro-Asiatic AA.BIU-MANDARA Afro-Asiatic,Chadic,Biu-Mandara,A,A.5 Afro-Asiatic,Chadic,Biu-Mandara,NorthBiu-Mandara,Margi-Mandara-Mofu,Mofuic,Meri false false missing missing gnd Ann-Katrin Wett
9907 ZULU ZULU zulu1248 Zulu zul Africa -30.0 30.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa false false missing zno zul Cecil H. Brown
9908 ZULU_2 ZULU_2 zulu1248 Zulu zul Africa -30.0 30.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa false false missing zno zul Ann-Katrin Wett
9909 ZULU_NKANDLA ZULU_NKANDLA zulu1248 Zulu zul Africa -30.0 30.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa false false missing zno zul Ann-Katrin Wett
9910 ZUMBUN ZUMBUN zumb1240 Zumbun jmb Africa 10.8 10.0 Afro-Asiatic AA.WEST_CHADIC Afro-Asiatic,Chadic,West,B,B.2 Afro-Asiatic,Chadic,WestChadic,WestChadicB,WestChadicB.2 false false missing missing jmb Julia Bischoffberger
9911 ZUNI ZUNI zuni1245 Zuni zun North America 35.08 -108.83 Zuni Zun.ZUNI Languageisolate_zun Zuni false false missing zun zun Cecil H. Brown
9912 ZWAY ZWAY zayy1238 Zay zwa Africa 7.93 38.83 Afro-Asiatic AA.SEMITIC Afro-Asiatic,Semitic,South,Ethiopian,South,Transversal,Harari-EastGurage Afro-Asiatic,Semitic,WestSemitic,Ethiosemitic,SouthEthiopic,TransversalSouthEthiopic,Harari-EastGurage false false missing missing zwa Cecil H. Brown and Dmitry Egorov

Here is a helper function that removes all diacritics from ASJP strings.

"""
    cleanASJP(word)

Return a simplified copy of the ASJP transcription `word`.

Two substitutions are applied in order:

1. every space, `*`, `~` and `"` is deleted;
2. every run of three characters followed by a literal `\$` is replaced by the
   middle character of the three (the `\$` itself is dropped as well).
"""
function cleanASJP(word)
      withoutMarks = replace(word, r"[ \*~\"]" => "")
      return replace(withoutMarks, r"(.)(.)(.)\$" => s"\2")
end
cleanASJP (generic function with 1 method)

Combining the cldf components into a single long word list and saving it to disk.

# Long word list: one row per form, annotated with its Concepticon concept,
# the simplified transcription and a space-separated token string.
full_asjp = let
      formCols = select(forms, :Language_ID, :Form, :Parameter_ID)
      withConcepts = innerjoin(formCols, parameters, on = :Parameter_ID => :ID)
      wordlist = select(
            withConcepts,
            :Language_ID, :Form, :Concepticon_ID, :Concepticon_Gloss,
      )
      wordlist = insertcols(wordlist, :simplified => cleanASJP.(wordlist.Form))
      # Splitting on the empty string yields one piece per character.
      spaced = [join(string.(split(w, "")), " ") for w in wordlist.simplified]
      insertcols(wordlist, :tokens => spaced)
end

CSV.write("../data/asjp_full_long.csv", full_asjp)
"../data/asjp_full_long.csv"

Adding longname, i.e. [WALS family].[WALS genus].[ASJP name]

# longname = "<classification_wals>.<Name>", with every "-" turned into "_".
languages[!, :longname] = replace.(
      string.(languages.classification_wals, ".", languages.Name),
      "-" => "_",
)
9912-element Vector{String}:
 "NC.BANTU.A51_BAFIA_MAJA"
 "NC.BANTU.A51_BAFIA_TUMI_TINGON"
 "NC.BANTU.A51_BAFIA_ZAKAAN"
 "NC.BANTU.A53_BAFIA_RIKPA"
 "NC.BANTU.A54_BAFIA_NJANTI"
 "NC.BANTU.A60_GUNU"
 "NC.BANTU.A60_MMAALA"
 "NC.BANTU.A61_NGORO_ASOM"
 "NC.BANTU.A62_KALONGE"
 "NC.BANTU.A72a_EWONDO"
 "AA.SOUTHERN_CUSHITIC.AASAX"
 "TNG.SIANE_YAGARIA.ABAGA"
 "NC.EKOID_MBE.ABANYOM"
 ⋮
 "NC.BANTU.ZOOMBO_4"
 "MZ.MIXE_ZOQUE.ZOQUE_FRANCISCO_LEON"
 "MZ.MIXE_ZOQUE.ZOQUE_RAYON"
 "Pau.EASTERN_PAUWASI.ZOROP"
 "IE.INDIC.ZUGUNUK_KALASHA"
 "AA.BIU_MANDARA.ZULGO"
 "NC.BANTU.ZULU"
 "NC.BANTU.ZULU_2"
 "NC.BANTU.ZULU_NKANDLA"
 "AA.WEST_CHADIC.ZUMBUN"
 "Zun.ZUNI.ZUNI"
 "AA.SEMITIC.ZWAY"

Adding column with transcription without diacritics:

# Simplified transcription of every form, computed from the `Value` column.
forms[!, :simplified] = [cleanASJP(value) for value in forms.Value]

# Attach the concept metadata (matched on the parameter ID) to every form.
forms = innerjoin(forms, parameters; on = :Parameter_ID => :ID)
482118×18 DataFrame
482093 rows omitted
Row ID Local_ID Language_ID Parameter_ID Value Form Segments Comment Source Cognacy Loan Graphemes Profile gloss_in_source simplified Name Concepticon_ID Concepticon_Gloss
String Missing String Int64 String31 String31 String String? String15? Missing Bool String String7 String15 String String15 Int64 String31
1 A51_BAFIA_MAJA-11-1 missing A51_BAFIA_MAJA 11 fo7 fo7 f o ʔ missing 2 missing false ^ f o 7 $ default one fo7 *one 1493 ONE
2 A51_BAFIA_MAJA-12-1 missing A51_BAFIA_MAJA 12 b"yE b"yE b’ j a missing 2 missing false ^ b" y E $ default two byE *two 1498 TWO
3 A51_BAFIA_MAJA-18-1 missing A51_BAFIA_MAJA 18 mum mum m u m missing 2 missing false ^ m u m $ default person mum *person 683 PERSON
4 A51_BAFIA_MAJA-18-2 missing A51_BAFIA_MAJA 18 b"um b"um b’ u m missing 2 missing false ^ b" u m $ default person bum *person 683 PERSON
5 A51_BAFIA_MAJA-19-1 missing A51_BAFIA_MAJA 19 zEy zEy z a j missing 2 missing false ^ z E y $ default fish zEy *fish 227 FISH
6 A51_BAFIA_MAJA-21-1 missing A51_BAFIA_MAJA 21 b3 b3 b ə missing 2 missing false ^ b 3 $ default dog b3 *dog 2009 DOG
7 A51_BAFIA_MAJA-22-1 missing A51_BAFIA_MAJA 22 TEy TEy c a j missing 2 missing false ^ T E y $ default louse TEy *louse 1392 LOUSE
8 A51_BAFIA_MAJA-22-2 missing A51_BAFIA_MAJA 22 b"yey b"yey b’ j e j missing 2 missing false ^ b" y e y $ default louse byey *louse 1392 LOUSE
9 A51_BAFIA_MAJA-23-1 missing A51_BAFIA_MAJA 23 t3 t3 t ə missing 2 missing false ^ t 3 $ default tree t3 *tree 906 TREE
10 A51_BAFIA_MAJA-25-1 missing A51_BAFIA_MAJA 25 fyeyN fyeyN f j e j ŋ missing 2 missing false ^ f y e y N $ default leaf fyeyN *leaf 628 LEAF
11 A51_BAFIA_MAJA-25-2 missing A51_BAFIA_MAJA 25 fyey fyey f j e j missing 2 missing false ^ f y e y $ default leaf fyey *leaf 628 LEAF
12 A51_BAFIA_MAJA-25-3 missing A51_BAFIA_MAJA 25 tyey tyey t j e j missing 2 missing false ^ t y e y $ default leaf tyey *leaf 628 LEAF
13 A51_BAFIA_MAJA-28-1 missing A51_BAFIA_MAJA 28 kwokw3 kwokw3 k w o k w ə missing 2 missing false ^ k w o k w 3 $ default skin kwokw3 *skin 763 SKIN
482107 ZWAY-53-1 missing ZWAY 53 gobut gobut g o b u t missing 10279 missing false ^ g o b u t $ default liver gobut *liver 1224 LIVER
482108 ZWAY-54-1 missing ZWAY 54 sETin sETin s a c i n missing 10279 missing false ^ s E T i n $ default drink sETin *drink 1401 DRINK
482109 ZWAY-66-1 missing ZWAY 66 mEt"t"En mEt"t"En m a tʼ tʼ a n missing 10279 missing false ^ m E t" t" E n $ default come mEttEn *come 1446 COME
482110 ZWAY-72-1 missing ZWAY 72 Erit Erit a r i t missing 10279 missing false ^ E r i t $ default sun Erit *sun 1343 SUN
482111 ZWAY-74-1 missing ZWAY 74 kokEb kokEb k o k a b missing 10279 missing false ^ k o k E b $ default star kokEb *star 1430 STAR
482112 ZWAY-75-1 missing ZWAY 75 mEi mEi m a i missing 10279 missing false ^ m E i $ default water mEi *water 948 WATER
482113 ZWAY-77-1 missing ZWAY 77 umon umon u m o n missing 10279 missing false ^ u m o n $ default stone umon *stone 857 STONE
482114 ZWAY-82-1 missing ZWAY 82 yirE yirE j i r a missing 10279 missing false ^ y i r E $ default fire yirE *fire 221 FIRE
482115 ZWAY-86-1 missing ZWAY 86 sEri sEri s a r i missing 10279 missing false ^ s E r i $ default mountain sEri *mountain 639 MOUNTAIN
482116 ZWAY-92-1 missing ZWAY 92 Erut Erut a r u t missing 10279 missing false ^ E r u t $ default night Erut *night 1233 NIGHT
482117 ZWAY-96-1 missing ZWAY 96 woirE woirE w o i r a missing 10279 missing false ^ w o i r E $ default new woirE *new 1231 NEW
482118 ZWAY-100-1 missing ZWAY 100 s3m s3m s ə m missing 10279 missing false ^ s 3 m $ default name s3m *name 1405 NAME

Removing languages whose WALS classification contains `Oth.`

# Keep only the doculects whose WALS classification does not contain "Oth.".
languages = filter(:classification_wals => wals -> !occursin("Oth.", wals), languages)
9835×19 DataFrame
9810 rows omitted
Row ID Name Glottocode Glottolog_Name ISO639P3code Macroarea Latitude Longitude Family classification_wals classification_ethnologue classification_glottolog recently_extinct long_extinct year_of_extinction code_wals code_iso transcribers longname
String String String15? String? String3? String15? Float64? Float64? String31? String String String Bool Bool Int64? String3? String3? String? String
1 A51_BAFIA_MAJA A51_BAFIA_MAJA lefa1242 Lefa lfa Africa 5.1 11.2 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia false false missing missing lfa Ann-Katrin Wett NC.BANTU.A51_BAFIA_MAJA
2 A51_BAFIA_TUMI_TINGON A51_BAFIA_TUMI_TINGON lefa1242 Lefa lfa Africa 5.1 11.2 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia false false missing missing lfa Ann-Katrin Wett NC.BANTU.A51_BAFIA_TUMI_TINGON
3 A51_BAFIA_ZAKAAN A51_BAFIA_ZAKAAN lefa1242 Lefa lfa Africa 5.1 11.2 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia false false missing missing lfa Ann-Katrin Wett NC.BANTU.A51_BAFIA_ZAKAAN
4 A53_BAFIA_RIKPA A53_BAFIA_RIKPA bafi1243 Bafia ksf Africa 5.0 11.17 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.53) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia false false missing bfi ksf Ann-Katrin Wett NC.BANTU.A53_BAFIA_RIKPA
5 A54_BAFIA_NJANTI A54_BAFIA_NJANTI tibe1274 Tibea ngy Africa 5.3 11.3 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.54) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50) false false missing missing ngy Ann-Katrin Wett NC.BANTU.A54_BAFIA_NJANTI
6 A60_GUNU A60_GUNU nugu1242 Nugunu (Cameroon) yas Africa 4.58 11.25 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.622) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60),Mmala-Elip-Gunu,Elip-Gunu false false missing gun yas Ann-Katrin Wett NC.BANTU.A60_GUNU
7 A60_MMAALA A60_MMAALA mmaa1238 Mmaala mmu Africa 4.5 11.08 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.62) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60),Mmala-Elip-Gunu false false missing missing mmu Ann-Katrin Wett NC.BANTU.A60_MMAALA
8 A61_NGORO_ASOM A61_NGORO_ASOM tuki1240 Tuki bag Africa 4.58 11.5 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.601) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Sanaga-WestMbam(A.40),Sanaga(A.60) false false missing tki bag Ann-Katrin Wett NC.BANTU.A61_NGORO_ASOM
9 A62_KALONGE A62_KALONGE yang1293 Yangben yav Africa 4.43 11.08 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.62) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60) false false missing missing yav Ann-Katrin Wett NC.BANTU.A62_KALONGE
10 A72a_EWONDO A72a_EWONDO ewon1239 Ewondo ewo Africa 4.0 12.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Ewondo-Fang(A.72) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Basaa-Yaunde(A40-70),Yaunde-Fang(A.70),Ewondo-Bebele false false missing ewo ewo Ann-Katrin Wett NC.BANTU.A72a_EWONDO
11 AASAX AASAX aasa1238 Aasax aas Africa -4.04 37.16 Afro-Asiatic AA.SOUTHERN_CUSHITIC Afro-Asiatic,Cushitic,South Afro-Asiatic,Cushitic,SouthCushitic true false 2010 missing aas Darja Appelganz AA.SOUTHERN_CUSHITIC.AASAX
12 ABAGA ABAGA abag1245 Abaga abg Papunesia -6.17 145.67 Nuclear Trans New Guinea TNG.SIANE-YAGARIA Trans-NewGuinea,Madang,Kalam-Kobon Nuclear_Trans_New_Guinea,Kainantu-Goroka,Goroka,NuclearGoroka,Siane-Yagaria,Kamano-Yagaria,UnclassifiedKamano-Yagaria false false missing missing abg Matthew S. Dryer and Søren Wichmann TNG.SIANE_YAGARIA.ABAGA
13 ABANYOM ABANYOM aban1242 Abanyom abm Africa 6.29 8.63 Atlantic-Congo NC.EKOID-MBE Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,Ekoid,Bakor Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,Ekoid-Mbe,Ekoid,Bakor-Ejagham,Bakor,NorthernBakor,Abanyom-Nkem-Nkum false false missing missing abm Guillaume Segerer and Søren Wichmann NC.EKOID_MBE.ABANYOM
9824 ZOOMBO_4 ZOOMBO_4 koon1244 South-Central Koongo kng Africa -5.0 15.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,H,Kikongo(H.16) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Central-WesternBantu,West-CoastalBantu,Nzadic,Lweric,Dingic,Loange-Atlantic,KLCExtended,KikongoLanguageCluster,NuclearKLC,Kikongoic,KambakunyicKikongo,KilaadicKikongo,Central-SouthernKikongo,SoutheasternKikongo,SouthernKikongo,Koongo-Kituba false false missing fio kng Ann-Katrin Wett NC.BANTU.ZOOMBO_4
9825 ZOQUE_FRANCISCO_LEON ZOQUE_FRANCISCO_LEON fran1266 Francisco León Zoque zos North America 17.33 -93.25 Mixe-Zoque MZ.MIXE-ZOQUE Mixe-Zoquean,Zoquean,ChiapasZoquean Mixe-Zoque,Zoque,ChiapasZoque false false missing zfl zos Søren Wichmann and Viveka Velupillai MZ.MIXE_ZOQUE.ZOQUE_FRANCISCO_LEON
9826 ZOQUE_RAYON ZOQUE_RAYON rayo1235 Rayón Zoque zor North America 17.08 -93.0 Mixe-Zoque MZ.MIXE-ZOQUE Mixe-Zoquean,Zoquean,ChiapasZoquean,NortheastZoque Mixe-Zoque,Zoque,ChiapasZoque false false missing zqr zor Søren Wichmann and Viveka Velupillai MZ.MIXE_ZOQUE.ZOQUE_RAYON
9827 ZOROP ZOROP yafi1240 Yafi wfg Papunesia -3.42 140.92 Pauwasi Pau.EASTERN_PAUWASI Pauwasi,Eastern Pauwasi,EasternPauwasi false false missing missing wfg Matthew S. Dryer Pau.EASTERN_PAUWASI.ZOROP
9828 ZUGUNUK_KALASHA ZUGUNUK_KALASHA kala1372 Chitral Kalasha kls Eurasia 35.49 71.7 Indo-European IE.INDIC Indo-European,Indo-Iranian,Indo-Aryan,OuterLanguages,Northwestern,Dardic,Chitral Indo-European,ClassicalIndo-European,Indo-Iranian,Indo-Aryan,Indo-AryanNorthwesternzone,Chitral false false missing klh kls missing IE.INDIC.ZUGUNUK_KALASHA
9829 ZULGO ZULGO zulg1242 Zulgo-Gemzek gnd Africa 10.83 14.08 Afro-Asiatic AA.BIU-MANDARA Afro-Asiatic,Chadic,Biu-Mandara,A,A.5 Afro-Asiatic,Chadic,Biu-Mandara,NorthBiu-Mandara,Margi-Mandara-Mofu,Mofuic,Meri false false missing missing gnd Ann-Katrin Wett AA.BIU_MANDARA.ZULGO
9830 ZULU ZULU zulu1248 Zulu zul Africa -30.0 30.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa false false missing zno zul Cecil H. Brown NC.BANTU.ZULU
9831 ZULU_2 ZULU_2 zulu1248 Zulu zul Africa -30.0 30.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa false false missing zno zul Ann-Katrin Wett NC.BANTU.ZULU_2
9832 ZULU_NKANDLA ZULU_NKANDLA zulu1248 Zulu zul Africa -30.0 30.0 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa false false missing zno zul Ann-Katrin Wett NC.BANTU.ZULU_NKANDLA
9833 ZUMBUN ZUMBUN zumb1240 Zumbun jmb Africa 10.8 10.0 Afro-Asiatic AA.WEST_CHADIC Afro-Asiatic,Chadic,West,B,B.2 Afro-Asiatic,Chadic,WestChadic,WestChadicB,WestChadicB.2 false false missing missing jmb Julia Bischoffberger AA.WEST_CHADIC.ZUMBUN
9834 ZUNI ZUNI zuni1245 Zuni zun North America 35.08 -108.83 Zuni Zun.ZUNI Languageisolate_zun Zuni false false missing zun zun Cecil H. Brown Zun.ZUNI.ZUNI
9835 ZWAY ZWAY zayy1238 Zay zwa Africa 7.93 38.83 Afro-Asiatic AA.SEMITIC Afro-Asiatic,Semitic,South,Ethiopian,South,Transversal,Harari-EastGurage Afro-Asiatic,Semitic,WestSemitic,Ethiosemitic,SouthEthiopic,TransversalSouthEthiopic,Harari-EastGurage false false missing missing zwa Cecil H. Brown and Dmitry Egorov AA.SEMITIC.ZWAY

Calculating for each concept the number of doculects where it appears and singling out the 40 concepts with the best coverage.

# Number of distinct doculects attesting each concept, best-covered first.
conceptCoverage = @pipe forms |>
      unique(_, [:Language_ID, :Concepticon_Gloss]) |>
      groupby(_, :Concepticon_Gloss) |>
      combine(nrow, _) |>
      sort(_, :nrow, rev = true)

# The 40 best-covered concepts. `first(v, 40)` returns fewer elements instead
# of throwing a BoundsError when the table has fewer than 40 concepts.
concepts = first(conceptCoverage.Concepticon_Gloss, 40)

# Keep only the forms that belong to one of the selected concepts. The
# membership test is written with `in` (the operator was missing here), and
# the `Set` makes each lookup constant-time.
forms40 = forms[in.(forms.Concepticon_Gloss, Ref(Set(concepts))), :]
371630×18 DataFrame
371605 rows omitted
Row ID Local_ID Language_ID Parameter_ID Value Form Segments Comment Source Cognacy Loan Graphemes Profile gloss_in_source simplified Name Concepticon_ID Concepticon_Gloss
String Missing String Int64 String31 String31 String String? String15? Missing Bool String String7 String15 String String15 Int64 String31
1 A51_BAFIA_MAJA-11-1 missing A51_BAFIA_MAJA 11 fo7 fo7 f o ʔ missing 2 missing false ^ f o 7 $ default one fo7 *one 1493 ONE
2 A51_BAFIA_MAJA-12-1 missing A51_BAFIA_MAJA 12 b"yE b"yE b’ j a missing 2 missing false ^ b" y E $ default two byE *two 1498 TWO
3 A51_BAFIA_MAJA-18-1 missing A51_BAFIA_MAJA 18 mum mum m u m missing 2 missing false ^ m u m $ default person mum *person 683 PERSON
4 A51_BAFIA_MAJA-18-2 missing A51_BAFIA_MAJA 18 b"um b"um b’ u m missing 2 missing false ^ b" u m $ default person bum *person 683 PERSON
5 A51_BAFIA_MAJA-19-1 missing A51_BAFIA_MAJA 19 zEy zEy z a j missing 2 missing false ^ z E y $ default fish zEy *fish 227 FISH
6 A51_BAFIA_MAJA-21-1 missing A51_BAFIA_MAJA 21 b3 b3 b ə missing 2 missing false ^ b 3 $ default dog b3 *dog 2009 DOG
7 A51_BAFIA_MAJA-22-1 missing A51_BAFIA_MAJA 22 TEy TEy c a j missing 2 missing false ^ T E y $ default louse TEy *louse 1392 LOUSE
8 A51_BAFIA_MAJA-22-2 missing A51_BAFIA_MAJA 22 b"yey b"yey b’ j e j missing 2 missing false ^ b" y e y $ default louse byey *louse 1392 LOUSE
9 A51_BAFIA_MAJA-23-1 missing A51_BAFIA_MAJA 23 t3 t3 t ə missing 2 missing false ^ t 3 $ default tree t3 *tree 906 TREE
10 A51_BAFIA_MAJA-25-1 missing A51_BAFIA_MAJA 25 fyeyN fyeyN f j e j ŋ missing 2 missing false ^ f y e y N $ default leaf fyeyN *leaf 628 LEAF
11 A51_BAFIA_MAJA-25-2 missing A51_BAFIA_MAJA 25 fyey fyey f j e j missing 2 missing false ^ f y e y $ default leaf fyey *leaf 628 LEAF
12 A51_BAFIA_MAJA-25-3 missing A51_BAFIA_MAJA 25 tyey tyey t j e j missing 2 missing false ^ t y e y $ default leaf tyey *leaf 628 LEAF
13 A51_BAFIA_MAJA-28-1 missing A51_BAFIA_MAJA 28 kwokw3 kwokw3 k w o k w ə missing 2 missing false ^ k w o k w 3 $ default skin kwokw3 *skin 763 SKIN
371619 ZWAY-53-1 missing ZWAY 53 gobut gobut g o b u t missing 10279 missing false ^ g o b u t $ default liver gobut *liver 1224 LIVER
371620 ZWAY-54-1 missing ZWAY 54 sETin sETin s a c i n missing 10279 missing false ^ s E T i n $ default drink sETin *drink 1401 DRINK
371621 ZWAY-66-1 missing ZWAY 66 mEt"t"En mEt"t"En m a tʼ tʼ a n missing 10279 missing false ^ m E t" t" E n $ default come mEttEn *come 1446 COME
371622 ZWAY-72-1 missing ZWAY 72 Erit Erit a r i t missing 10279 missing false ^ E r i t $ default sun Erit *sun 1343 SUN
371623 ZWAY-74-1 missing ZWAY 74 kokEb kokEb k o k a b missing 10279 missing false ^ k o k E b $ default star kokEb *star 1430 STAR
371624 ZWAY-75-1 missing ZWAY 75 mEi mEi m a i missing 10279 missing false ^ m E i $ default water mEi *water 948 WATER
371625 ZWAY-77-1 missing ZWAY 77 umon umon u m o n missing 10279 missing false ^ u m o n $ default stone umon *stone 857 STONE
371626 ZWAY-82-1 missing ZWAY 82 yirE yirE j i r a missing 10279 missing false ^ y i r E $ default fire yirE *fire 221 FIRE
371627 ZWAY-86-1 missing ZWAY 86 sEri sEri s a r i missing 10279 missing false ^ s E r i $ default mountain sEri *mountain 639 MOUNTAIN
371628 ZWAY-92-1 missing ZWAY 92 Erut Erut a r u t missing 10279 missing false ^ E r u t $ default night Erut *night 1233 NIGHT
371629 ZWAY-96-1 missing ZWAY 96 woirE woirE w o i r a missing 10279 missing false ^ w o i r E $ default new woirE *new 1231 NEW
371630 ZWAY-100-1 missing ZWAY 100 s3m s3m s ə m missing 10279 missing false ^ s 3 m $ default name s3m *name 1405 NAME

Computing the number of covered concepts for each language. Only doculects with at least 30 concept entries are considered further.

# Number of distinct selected concepts attested by each doculect, best first.
languageCoverage = @pipe forms40 |>
      unique(_, [:Language_ID, :Concepticon_Gloss]) |>
      groupby(_, :Language_ID) |>
      combine(nrow, _) |>
      sort(_, :nrow, rev = true)

# Doculects that attest at least 30 of the selected concepts.
doculects = languageCoverage.Language_ID[languageCoverage.nrow .>= 30]

# Keep only the forms of those doculects. The membership test is written with
# `in` (the operator was missing here); the `Set` avoids a linear scan of
# `doculects` for every row.
forms40 = forms40[in.(forms40.Language_ID, Ref(Set(doculects))), :]
315843×18 DataFrame
315818 rows omitted
Row ID Local_ID Language_ID Parameter_ID Value Form Segments Comment Source Cognacy Loan Graphemes Profile gloss_in_source simplified Name Concepticon_ID Concepticon_Gloss
String Missing String Int64 String31 String31 String String? String15? Missing Bool String String7 String15 String String15 Int64 String31
1 A51_BAFIA_MAJA-11-1 missing A51_BAFIA_MAJA 11 fo7 fo7 f o ʔ missing 2 missing false ^ f o 7 $ default one fo7 *one 1493 ONE
2 A51_BAFIA_MAJA-12-1 missing A51_BAFIA_MAJA 12 b"yE b"yE b’ j a missing 2 missing false ^ b" y E $ default two byE *two 1498 TWO
3 A51_BAFIA_MAJA-18-1 missing A51_BAFIA_MAJA 18 mum mum m u m missing 2 missing false ^ m u m $ default person mum *person 683 PERSON
4 A51_BAFIA_MAJA-18-2 missing A51_BAFIA_MAJA 18 b"um b"um b’ u m missing 2 missing false ^ b" u m $ default person bum *person 683 PERSON
5 A51_BAFIA_MAJA-19-1 missing A51_BAFIA_MAJA 19 zEy zEy z a j missing 2 missing false ^ z E y $ default fish zEy *fish 227 FISH
6 A51_BAFIA_MAJA-21-1 missing A51_BAFIA_MAJA 21 b3 b3 b ə missing 2 missing false ^ b 3 $ default dog b3 *dog 2009 DOG
7 A51_BAFIA_MAJA-22-1 missing A51_BAFIA_MAJA 22 TEy TEy c a j missing 2 missing false ^ T E y $ default louse TEy *louse 1392 LOUSE
8 A51_BAFIA_MAJA-22-2 missing A51_BAFIA_MAJA 22 b"yey b"yey b’ j e j missing 2 missing false ^ b" y e y $ default louse byey *louse 1392 LOUSE
9 A51_BAFIA_MAJA-23-1 missing A51_BAFIA_MAJA 23 t3 t3 t ə missing 2 missing false ^ t 3 $ default tree t3 *tree 906 TREE
10 A51_BAFIA_MAJA-25-1 missing A51_BAFIA_MAJA 25 fyeyN fyeyN f j e j ŋ missing 2 missing false ^ f y e y N $ default leaf fyeyN *leaf 628 LEAF
11 A51_BAFIA_MAJA-25-2 missing A51_BAFIA_MAJA 25 fyey fyey f j e j missing 2 missing false ^ f y e y $ default leaf fyey *leaf 628 LEAF
12 A51_BAFIA_MAJA-25-3 missing A51_BAFIA_MAJA 25 tyey tyey t j e j missing 2 missing false ^ t y e y $ default leaf tyey *leaf 628 LEAF
13 A51_BAFIA_MAJA-28-1 missing A51_BAFIA_MAJA 28 kwokw3 kwokw3 k w o k w ə missing 2 missing false ^ k w o k w 3 $ default skin kwokw3 *skin 763 SKIN
315832 ZWAY-53-1 missing ZWAY 53 gobut gobut g o b u t missing 10279 missing false ^ g o b u t $ default liver gobut *liver 1224 LIVER
315833 ZWAY-54-1 missing ZWAY 54 sETin sETin s a c i n missing 10279 missing false ^ s E T i n $ default drink sETin *drink 1401 DRINK
315834 ZWAY-66-1 missing ZWAY 66 mEt"t"En mEt"t"En m a tʼ tʼ a n missing 10279 missing false ^ m E t" t" E n $ default come mEttEn *come 1446 COME
315835 ZWAY-72-1 missing ZWAY 72 Erit Erit a r i t missing 10279 missing false ^ E r i t $ default sun Erit *sun 1343 SUN
315836 ZWAY-74-1 missing ZWAY 74 kokEb kokEb k o k a b missing 10279 missing false ^ k o k E b $ default star kokEb *star 1430 STAR
315837 ZWAY-75-1 missing ZWAY 75 mEi mEi m a i missing 10279 missing false ^ m E i $ default water mEi *water 948 WATER
315838 ZWAY-77-1 missing ZWAY 77 umon umon u m o n missing 10279 missing false ^ u m o n $ default stone umon *stone 857 STONE
315839 ZWAY-82-1 missing ZWAY 82 yirE yirE j i r a missing 10279 missing false ^ y i r E $ default fire yirE *fire 221 FIRE
315840 ZWAY-86-1 missing ZWAY 86 sEri sEri s a r i missing 10279 missing false ^ s E r i $ default mountain sEri *mountain 639 MOUNTAIN
315841 ZWAY-92-1 missing ZWAY 92 Erut Erut a r u t missing 10279 missing false ^ E r u t $ default night Erut *night 1233 NIGHT
315842 ZWAY-96-1 missing ZWAY 96 woirE woirE w o i r a missing 10279 missing false ^ w o i r E $ default new woirE *new 1231 NEW
315843 ZWAY-100-1 missing ZWAY 100 s3m s3m s ə m missing 10279 missing false ^ s 3 m $ default name s3m *name 1405 NAME

Creating the word list with these restrictions and throwing out PROTO languages (those whose language ID contains "PROTO").

# Attach each language's `longname` to its forms by matching the forms'
# `Language_ID` against the languages' `Name`, then keep only the rows whose
# `Language_ID` does not contain the substring "PROTO".
asjpLong = filter(
      :Language_ID => (id -> !occursin("PROTO", id)),
      innerjoin(
            forms40,
            select(languages, :Name, :longname);
            on = :Language_ID => :Name,
      ),
)
306824×19 DataFrame
306799 rows omitted
Row ID Local_ID Language_ID Parameter_ID Value Form Segments Comment Source Cognacy Loan Graphemes Profile gloss_in_source simplified Name Concepticon_ID Concepticon_Gloss longname
String Missing String Int64 String31 String31 String String? String15? Missing Bool String String7 String15 String String15 Int64 String31 String
1 A51_BAFIA_MAJA-11-1 missing A51_BAFIA_MAJA 11 fo7 fo7 f o ʔ missing 2 missing false ^ f o 7 $ default one fo7 *one 1493 ONE NC.BANTU.A51_BAFIA_MAJA
2 A51_BAFIA_MAJA-12-1 missing A51_BAFIA_MAJA 12 b"yE b"yE b’ j a missing 2 missing false ^ b" y E $ default two byE *two 1498 TWO NC.BANTU.A51_BAFIA_MAJA
3 A51_BAFIA_MAJA-18-1 missing A51_BAFIA_MAJA 18 mum mum m u m missing 2 missing false ^ m u m $ default person mum *person 683 PERSON NC.BANTU.A51_BAFIA_MAJA
4 A51_BAFIA_MAJA-18-2 missing A51_BAFIA_MAJA 18 b"um b"um b’ u m missing 2 missing false ^ b" u m $ default person bum *person 683 PERSON NC.BANTU.A51_BAFIA_MAJA
5 A51_BAFIA_MAJA-19-1 missing A51_BAFIA_MAJA 19 zEy zEy z a j missing 2 missing false ^ z E y $ default fish zEy *fish 227 FISH NC.BANTU.A51_BAFIA_MAJA
6 A51_BAFIA_MAJA-21-1 missing A51_BAFIA_MAJA 21 b3 b3 b ə missing 2 missing false ^ b 3 $ default dog b3 *dog 2009 DOG NC.BANTU.A51_BAFIA_MAJA
7 A51_BAFIA_MAJA-22-1 missing A51_BAFIA_MAJA 22 TEy TEy c a j missing 2 missing false ^ T E y $ default louse TEy *louse 1392 LOUSE NC.BANTU.A51_BAFIA_MAJA
8 A51_BAFIA_MAJA-22-2 missing A51_BAFIA_MAJA 22 b"yey b"yey b’ j e j missing 2 missing false ^ b" y e y $ default louse byey *louse 1392 LOUSE NC.BANTU.A51_BAFIA_MAJA
9 A51_BAFIA_MAJA-23-1 missing A51_BAFIA_MAJA 23 t3 t3 t ə missing 2 missing false ^ t 3 $ default tree t3 *tree 906 TREE NC.BANTU.A51_BAFIA_MAJA
10 A51_BAFIA_MAJA-25-1 missing A51_BAFIA_MAJA 25 fyeyN fyeyN f j e j ŋ missing 2 missing false ^ f y e y N $ default leaf fyeyN *leaf 628 LEAF NC.BANTU.A51_BAFIA_MAJA
11 A51_BAFIA_MAJA-25-2 missing A51_BAFIA_MAJA 25 fyey fyey f j e j missing 2 missing false ^ f y e y $ default leaf fyey *leaf 628 LEAF NC.BANTU.A51_BAFIA_MAJA
12 A51_BAFIA_MAJA-25-3 missing A51_BAFIA_MAJA 25 tyey tyey t j e j missing 2 missing false ^ t y e y $ default leaf tyey *leaf 628 LEAF NC.BANTU.A51_BAFIA_MAJA
13 A51_BAFIA_MAJA-28-1 missing A51_BAFIA_MAJA 28 kwokw3 kwokw3 k w o k w ə missing 2 missing false ^ k w o k w 3 $ default skin kwokw3 *skin 763 SKIN NC.BANTU.A51_BAFIA_MAJA
306813 ZWAY-53-1 missing ZWAY 53 gobut gobut g o b u t missing 10279 missing false ^ g o b u t $ default liver gobut *liver 1224 LIVER AA.SEMITIC.ZWAY
306814 ZWAY-54-1 missing ZWAY 54 sETin sETin s a c i n missing 10279 missing false ^ s E T i n $ default drink sETin *drink 1401 DRINK AA.SEMITIC.ZWAY
306815 ZWAY-66-1 missing ZWAY 66 mEt"t"En mEt"t"En m a tʼ tʼ a n missing 10279 missing false ^ m E t" t" E n $ default come mEttEn *come 1446 COME AA.SEMITIC.ZWAY
306816 ZWAY-72-1 missing ZWAY 72 Erit Erit a r i t missing 10279 missing false ^ E r i t $ default sun Erit *sun 1343 SUN AA.SEMITIC.ZWAY
306817 ZWAY-74-1 missing ZWAY 74 kokEb kokEb k o k a b missing 10279 missing false ^ k o k E b $ default star kokEb *star 1430 STAR AA.SEMITIC.ZWAY
306818 ZWAY-75-1 missing ZWAY 75 mEi mEi m a i missing 10279 missing false ^ m E i $ default water mEi *water 948 WATER AA.SEMITIC.ZWAY
306819 ZWAY-77-1 missing ZWAY 77 umon umon u m o n missing 10279 missing false ^ u m o n $ default stone umon *stone 857 STONE AA.SEMITIC.ZWAY
306820 ZWAY-82-1 missing ZWAY 82 yirE yirE j i r a missing 10279 missing false ^ y i r E $ default fire yirE *fire 221 FIRE AA.SEMITIC.ZWAY
306821 ZWAY-86-1 missing ZWAY 86 sEri sEri s a r i missing 10279 missing false ^ s E r i $ default mountain sEri *mountain 639 MOUNTAIN AA.SEMITIC.ZWAY
306822 ZWAY-92-1 missing ZWAY 92 Erut Erut a r u t missing 10279 missing false ^ E r u t $ default night Erut *night 1233 NIGHT AA.SEMITIC.ZWAY
306823 ZWAY-96-1 missing ZWAY 96 woirE woirE w o i r a missing 10279 missing false ^ w o i r E $ default new woirE *new 1231 NEW AA.SEMITIC.ZWAY
306824 ZWAY-100-1 missing ZWAY 100 s3m s3m s ə m missing 10279 missing false ^ s 3 m $ default name s3m *name 1405 NAME AA.SEMITIC.ZWAY

Reformatting the word list into wide format and saving it to disk.

# Build the wide table: one row per `longname`, one column per
# `Concepticon_Gloss`. All `simplified` forms sharing a (language, concept)
# pair are collapsed into a single "-"-separated string (held in the
# intermediate column `x1`) before unstacking.
asjpWide = unstack(
      combine(
            groupby(asjpLong, [:longname, :Concepticon_Gloss]),
            :simplified => (words -> join(words, "-")) => :x1,
      ),
      :longname,
      :Concepticon_Gloss,
      :x1,
)

# Save the wide word list to disk.
CSV.write("../data/asjp20wide.csv", asjpWide)
"../data/asjp20wide.csv"