# Environment setup: switch into the project's code directory, activate the
# Julia project found there and load the packages used below.
# Start from the directory containing this file so the relative path below
# resolves the same way regardless of where Julia was launched.
cd(@__DIR__)
cd("../code/")
using Pkg
# Activate the project in the current directory (../code relative to this file)
# and install whatever its manifest requires.
Pkg.activate(".")
Pkg.instantiate()
# These must be loaded after activation so they resolve against the local project.
using CSV, DataFrames, Pipe
# NOTE(review): ProgressMeter is not used in the visible part of this file —
# presumably needed further down; confirm.
using ProgressMeter
Activating project at `~/projects/research/asjp20world_tree/code`
Setting the working directory and activating the local Julia environment.
# Environment setup: switch into the project's code directory, activate the
# Julia project found there and load the packages used below.
# NOTE(review): these statements are a verbatim repeat of the first seven lines
# of the file; `cd(@__DIR__)` resets the directory first, so running them a
# second time ends in the same working directory.
cd(@__DIR__)
cd("../code/")
using Pkg
# Activate the project in the current directory and install its dependencies.
Pkg.activate(".")
Pkg.instantiate()
using CSV, DataFrames, Pipe
using ProgressMeter
Activating project at `~/projects/research/asjp20world_tree/code`
Downloading ASJP from Zenodo.
# Download the ASJP v20 CLDF release from Zenodo unless every CSV file that is
# read below is already present in ../data.
mkpath("tmp")
languagesF = "../data/languages.csv"
formsF = "../data/forms.csv"
parametersF = "../data/asjp_parameters.csv"
# The parameters file is read later as well, so it has to take part in the
# "already downloaded" check alongside the languages and forms files.
if !all(isfile, (languagesF, formsF, parametersF))
    asjpZip = download(
        "https://zenodo.org/record/7079637/files/lexibank/asjp-v20.zip",
        "tmp/asjp-v20.zip",
    )
    # Relies on an external `unzip` executable being available on the PATH.
    run(`unzip -o $asjpZip -d tmp/`)
    # Make sure the destination directory exists before copying into it.
    mkpath(dirname(formsF))
    # `force = true` overwrites files left over from an earlier, partial run;
    # without it `cp` throws when the destination already exists.
    cp("tmp/lexibank-asjp-f0f1d0d/cldf/forms.csv", formsF; force = true)
    cp("tmp/lexibank-asjp-f0f1d0d/cldf/languages.csv", languagesF; force = true)
    cp("tmp/lexibank-asjp-f0f1d0d/cldf/parameters.csv", parametersF; force = true)
end
# A recursive `rm` deletes the scratch directory together with the archive and
# everything unpacked into it, so no per-entry cleanup loop is needed.
rm("tmp", recursive = true)
Loading the ASJP CLDF files into DataFrames.
# Read the three CLDF tables from disk into DataFrames.
languages = CSV.read(languagesF, DataFrame)
forms = CSV.read(formsF, DataFrame)
parameters = CSV.read(parametersF, DataFrame)
# Drop, in place, every language row that has a missing value in any of the
# three classification columns.
dropmissing!(languages, [
    :classification_wals,
    :classification_ethnologue,
    :classification_glottolog,
])
Row | ID | Name | Glottocode | Glottolog_Name | ISO639P3code | Macroarea | Latitude | Longitude | Family | classification_wals | classification_ethnologue | classification_glottolog | recently_extinct | long_extinct | year_of_extinction | code_wals | code_iso | transcribers |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
String | String | String15? | String? | String3? | String15? | Float64? | Float64? | String31? | String | String | String | Bool | Bool | Int64? | String3? | String3? | String? | |
1 | A51_BAFIA_MAJA | A51_BAFIA_MAJA | lefa1242 | Lefa | lfa | Africa | 5.1 | 11.2 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia | false | false | missing | missing | lfa | Ann-Katrin Wett |
2 | A51_BAFIA_TUMI_TINGON | A51_BAFIA_TUMI_TINGON | lefa1242 | Lefa | lfa | Africa | 5.1 | 11.2 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia | false | false | missing | missing | lfa | Ann-Katrin Wett |
3 | A51_BAFIA_ZAKAAN | A51_BAFIA_ZAKAAN | lefa1242 | Lefa | lfa | Africa | 5.1 | 11.2 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia | false | false | missing | missing | lfa | Ann-Katrin Wett |
4 | A53_BAFIA_RIKPA | A53_BAFIA_RIKPA | bafi1243 | Bafia | ksf | Africa | 5.0 | 11.17 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.53) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia | false | false | missing | bfi | ksf | Ann-Katrin Wett |
5 | A54_BAFIA_NJANTI | A54_BAFIA_NJANTI | tibe1274 | Tibea | ngy | Africa | 5.3 | 11.3 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.54) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50) | false | false | missing | missing | ngy | Ann-Katrin Wett |
6 | A60_GUNU | A60_GUNU | nugu1242 | Nugunu (Cameroon) | yas | Africa | 4.58 | 11.25 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.622) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60),Mmala-Elip-Gunu,Elip-Gunu | false | false | missing | gun | yas | Ann-Katrin Wett |
7 | A60_MMAALA | A60_MMAALA | mmaa1238 | Mmaala | mmu | Africa | 4.5 | 11.08 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.62) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60),Mmala-Elip-Gunu | false | false | missing | missing | mmu | Ann-Katrin Wett |
8 | A61_NGORO_ASOM | A61_NGORO_ASOM | tuki1240 | Tuki | bag | Africa | 4.58 | 11.5 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.601) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Sanaga-WestMbam(A.40),Sanaga(A.60) | false | false | missing | tki | bag | Ann-Katrin Wett |
9 | A62_KALONGE | A62_KALONGE | yang1293 | Yangben | yav | Africa | 4.43 | 11.08 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.62) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60) | false | false | missing | missing | yav | Ann-Katrin Wett |
10 | A72a_EWONDO | A72a_EWONDO | ewon1239 | Ewondo | ewo | Africa | 4.0 | 12.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Ewondo-Fang(A.72) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Basaa-Yaunde(A40-70),Yaunde-Fang(A.70),Ewondo-Bebele | false | false | missing | ewo | ewo | Ann-Katrin Wett |
11 | AASAX | AASAX | aasa1238 | Aasax | aas | Africa | -4.04 | 37.16 | Afro-Asiatic | AA.SOUTHERN_CUSHITIC | Afro-Asiatic,Cushitic,South | Afro-Asiatic,Cushitic,SouthCushitic | true | false | 2010 | missing | aas | Darja Appelganz |
12 | ABAGA | ABAGA | abag1245 | Abaga | abg | Papunesia | -6.17 | 145.67 | Nuclear Trans New Guinea | TNG.SIANE-YAGARIA | Trans-NewGuinea,Madang,Kalam-Kobon | Nuclear_Trans_New_Guinea,Kainantu-Goroka,Goroka,NuclearGoroka,Siane-Yagaria,Kamano-Yagaria,UnclassifiedKamano-Yagaria | false | false | missing | missing | abg | Matthew S. Dryer and Søren Wichmann |
13 | ABANYOM | ABANYOM | aban1242 | Abanyom | abm | Africa | 6.29 | 8.63 | Atlantic-Congo | NC.EKOID-MBE | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,Ekoid,Bakor | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,Ekoid-Mbe,Ekoid,Bakor-Ejagham,Bakor,NorthernBakor,Abanyom-Nkem-Nkum | false | false | missing | missing | abm | Guillaume Segerer and Søren Wichmann |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
9901 | ZOOMBO_4 | ZOOMBO_4 | koon1244 | South-Central Koongo | kng | Africa | -5.0 | 15.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,H,Kikongo(H.16) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Central-WesternBantu,West-CoastalBantu,Nzadic,Lweric,Dingic,Loange-Atlantic,KLCExtended,KikongoLanguageCluster,NuclearKLC,Kikongoic,KambakunyicKikongo,KilaadicKikongo,Central-SouthernKikongo,SoutheasternKikongo,SouthernKikongo,Koongo-Kituba | false | false | missing | fio | kng | Ann-Katrin Wett |
9902 | ZOQUE_FRANCISCO_LEON | ZOQUE_FRANCISCO_LEON | fran1266 | Francisco León Zoque | zos | North America | 17.33 | -93.25 | Mixe-Zoque | MZ.MIXE-ZOQUE | Mixe-Zoquean,Zoquean,ChiapasZoquean | Mixe-Zoque,Zoque,ChiapasZoque | false | false | missing | zfl | zos | Søren Wichmann and Viveka Velupillai |
9903 | ZOQUE_RAYON | ZOQUE_RAYON | rayo1235 | Rayón Zoque | zor | North America | 17.08 | -93.0 | Mixe-Zoque | MZ.MIXE-ZOQUE | Mixe-Zoquean,Zoquean,ChiapasZoquean,NortheastZoque | Mixe-Zoque,Zoque,ChiapasZoque | false | false | missing | zqr | zor | Søren Wichmann and Viveka Velupillai |
9904 | ZOROP | ZOROP | yafi1240 | Yafi | wfg | Papunesia | -3.42 | 140.92 | Pauwasi | Pau.EASTERN_PAUWASI | Pauwasi,Eastern | Pauwasi,EasternPauwasi | false | false | missing | missing | wfg | Matthew S. Dryer |
9905 | ZUGUNUK_KALASHA | ZUGUNUK_KALASHA | kala1372 | Chitral Kalasha | kls | Eurasia | 35.49 | 71.7 | Indo-European | IE.INDIC | Indo-European,Indo-Iranian,Indo-Aryan,OuterLanguages,Northwestern,Dardic,Chitral | Indo-European,ClassicalIndo-European,Indo-Iranian,Indo-Aryan,Indo-AryanNorthwesternzone,Chitral | false | false | missing | klh | kls | missing |
9906 | ZULGO | ZULGO | zulg1242 | Zulgo-Gemzek | gnd | Africa | 10.83 | 14.08 | Afro-Asiatic | AA.BIU-MANDARA | Afro-Asiatic,Chadic,Biu-Mandara,A,A.5 | Afro-Asiatic,Chadic,Biu-Mandara,NorthBiu-Mandara,Margi-Mandara-Mofu,Mofuic,Meri | false | false | missing | missing | gnd | Ann-Katrin Wett |
9907 | ZULU | ZULU | zulu1248 | Zulu | zul | Africa | -30.0 | 30.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa | false | false | missing | zno | zul | Cecil H. Brown |
9908 | ZULU_2 | ZULU_2 | zulu1248 | Zulu | zul | Africa | -30.0 | 30.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa | false | false | missing | zno | zul | Ann-Katrin Wett |
9909 | ZULU_NKANDLA | ZULU_NKANDLA | zulu1248 | Zulu | zul | Africa | -30.0 | 30.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa | false | false | missing | zno | zul | Ann-Katrin Wett |
9910 | ZUMBUN | ZUMBUN | zumb1240 | Zumbun | jmb | Africa | 10.8 | 10.0 | Afro-Asiatic | AA.WEST_CHADIC | Afro-Asiatic,Chadic,West,B,B.2 | Afro-Asiatic,Chadic,WestChadic,WestChadicB,WestChadicB.2 | false | false | missing | missing | jmb | Julia Bischoffberger |
9911 | ZUNI | ZUNI | zuni1245 | Zuni | zun | North America | 35.08 | -108.83 | Zuni | Zun.ZUNI | Languageisolate_zun | Zuni | false | false | missing | zun | zun | Cecil H. Brown |
9912 | ZWAY | ZWAY | zayy1238 | Zay | zwa | Africa | 7.93 | 38.83 | Afro-Asiatic | AA.SEMITIC | Afro-Asiatic,Semitic,South,Ethiopian,South,Transversal,Harari-EastGurage | Afro-Asiatic,Semitic,WestSemitic,Ethiosemitic,SouthEthiopic,TransversalSouthEthiopic,Harari-EastGurage | false | false | missing | missing | zwa | Cecil H. Brown and Dmitry Egorov |
Here is a helper function that removes all diacritics from ASJP strings.
"""
    cleanASJP(word)

Return a simplified copy of the ASJP transcription `word`.

First every space, asterisk, tilde and double quote is deleted. Then each run of
three characters that is followed by a dollar sign is replaced, together with
the dollar sign, by the middle character of the three.
"""
function cleanASJP(word)
    stripped = replace(word, r"[ \*~\"]" => "")
    return replace(stripped, r"(.)(.)(.)\$" => s"\2")
end
cleanASJP (generic function with 1 method)
Combining the CLDF components into a single long word list and saving it to disk.
# Build the long-format word list: one row per form, annotated with its
# Concepticon concept, the simplified transcription and a space-separated
# token string, then write it to disk.
full_asjp = let
    # Attach the concept metadata to each form via its parameter ID.
    wordlist = innerjoin(
        select(forms, :Language_ID, :Form, :Parameter_ID),
        parameters;
        on = :Parameter_ID => :ID,
    )
    wordlist = select(
        wordlist, :Language_ID, :Form, :Concepticon_ID, :Concepticon_Gloss
    )
    wordlist = insertcols(wordlist, :simplified => cleanASJP.(wordlist.Form))
    # Joining a string's characters with " " yields one token per character.
    insertcols(
        wordlist,
        :tokens => map(w -> join(w, " "), wordlist.simplified),
    )
end
CSV.write("../data/asjp_full_long.csv", full_asjp)
"../data/asjp_full_long.csv"
Adding `longname`, i.e. [WALS family].[WALS genus].[ASJP name], to the languages table.
# Build "<classification_wals>.<Name>" for every language and turn every hyphen
# in the combined string into an underscore.
languages[!, :longname] = map(languages.classification_wals, languages.Name) do wals, name
    replace(string(wals, ".", name), "-" => "_")
end
9912-element Vector{String}:
"NC.BANTU.A51_BAFIA_MAJA"
"NC.BANTU.A51_BAFIA_TUMI_TINGON"
"NC.BANTU.A51_BAFIA_ZAKAAN"
"NC.BANTU.A53_BAFIA_RIKPA"
"NC.BANTU.A54_BAFIA_NJANTI"
"NC.BANTU.A60_GUNU"
"NC.BANTU.A60_MMAALA"
"NC.BANTU.A61_NGORO_ASOM"
"NC.BANTU.A62_KALONGE"
"NC.BANTU.A72a_EWONDO"
"AA.SOUTHERN_CUSHITIC.AASAX"
"TNG.SIANE_YAGARIA.ABAGA"
"NC.EKOID_MBE.ABANYOM"
⋮
"NC.BANTU.ZOOMBO_4"
"MZ.MIXE_ZOQUE.ZOQUE_FRANCISCO_LEON"
"MZ.MIXE_ZOQUE.ZOQUE_RAYON"
"Pau.EASTERN_PAUWASI.ZOROP"
"IE.INDIC.ZUGUNUK_KALASHA"
"AA.BIU_MANDARA.ZULGO"
"NC.BANTU.ZULU"
"NC.BANTU.ZULU_2"
"NC.BANTU.ZULU_NKANDLA"
"AA.WEST_CHADIC.ZUMBUN"
"Zun.ZUNI.ZUNI"
"AA.SEMITIC.ZWAY"
Adding column with transcription without diacritics:
# Store the simplified transcription of each raw value as a new column, then
# attach the concept metadata from `parameters` to every form.
forms.simplified = cleanASJP.(forms.Value)
forms = innerjoin(forms, parameters; on = :Parameter_ID => :ID)
Row | ID | Local_ID | Language_ID | Parameter_ID | Value | Form | Segments | Comment | Source | Cognacy | Loan | Graphemes | Profile | gloss_in_source | simplified | Name | Concepticon_ID | Concepticon_Gloss |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
String | Missing | String | Int64 | String31 | String31 | String | String? | String15? | Missing | Bool | String | String7 | String15 | String | String15 | Int64 | String31 | |
1 | A51_BAFIA_MAJA-11-1 | missing | A51_BAFIA_MAJA | 11 | fo7 | fo7 | f o ʔ | missing | 2 | missing | false | ^ f o 7 $ | default | one | fo7 | *one | 1493 | ONE |
2 | A51_BAFIA_MAJA-12-1 | missing | A51_BAFIA_MAJA | 12 | b"yE | b"yE | b’ j a | missing | 2 | missing | false | ^ b" y E $ | default | two | byE | *two | 1498 | TWO |
3 | A51_BAFIA_MAJA-18-1 | missing | A51_BAFIA_MAJA | 18 | mum | mum | m u m | missing | 2 | missing | false | ^ m u m $ | default | person | mum | *person | 683 | PERSON |
4 | A51_BAFIA_MAJA-18-2 | missing | A51_BAFIA_MAJA | 18 | b"um | b"um | b’ u m | missing | 2 | missing | false | ^ b" u m $ | default | person | bum | *person | 683 | PERSON |
5 | A51_BAFIA_MAJA-19-1 | missing | A51_BAFIA_MAJA | 19 | zEy | zEy | z a j | missing | 2 | missing | false | ^ z E y $ | default | fish | zEy | *fish | 227 | FISH |
6 | A51_BAFIA_MAJA-21-1 | missing | A51_BAFIA_MAJA | 21 | b3 | b3 | b ə | missing | 2 | missing | false | ^ b 3 $ | default | dog | b3 | *dog | 2009 | DOG |
7 | A51_BAFIA_MAJA-22-1 | missing | A51_BAFIA_MAJA | 22 | TEy | TEy | c a j | missing | 2 | missing | false | ^ T E y $ | default | louse | TEy | *louse | 1392 | LOUSE |
8 | A51_BAFIA_MAJA-22-2 | missing | A51_BAFIA_MAJA | 22 | b"yey | b"yey | b’ j e j | missing | 2 | missing | false | ^ b" y e y $ | default | louse | byey | *louse | 1392 | LOUSE |
9 | A51_BAFIA_MAJA-23-1 | missing | A51_BAFIA_MAJA | 23 | t3 | t3 | t ə | missing | 2 | missing | false | ^ t 3 $ | default | tree | t3 | *tree | 906 | TREE |
10 | A51_BAFIA_MAJA-25-1 | missing | A51_BAFIA_MAJA | 25 | fyeyN | fyeyN | f j e j ŋ | missing | 2 | missing | false | ^ f y e y N $ | default | leaf | fyeyN | *leaf | 628 | LEAF |
11 | A51_BAFIA_MAJA-25-2 | missing | A51_BAFIA_MAJA | 25 | fyey | fyey | f j e j | missing | 2 | missing | false | ^ f y e y $ | default | leaf | fyey | *leaf | 628 | LEAF |
12 | A51_BAFIA_MAJA-25-3 | missing | A51_BAFIA_MAJA | 25 | tyey | tyey | t j e j | missing | 2 | missing | false | ^ t y e y $ | default | leaf | tyey | *leaf | 628 | LEAF |
13 | A51_BAFIA_MAJA-28-1 | missing | A51_BAFIA_MAJA | 28 | kwokw3 | kwokw3 | k w o k w ə | missing | 2 | missing | false | ^ k w o k w 3 $ | default | skin | kwokw3 | *skin | 763 | SKIN |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
482107 | ZWAY-53-1 | missing | ZWAY | 53 | gobut | gobut | g o b u t | missing | 10279 | missing | false | ^ g o b u t $ | default | liver | gobut | *liver | 1224 | LIVER |
482108 | ZWAY-54-1 | missing | ZWAY | 54 | sETin | sETin | s a c i n | missing | 10279 | missing | false | ^ s E T i n $ | default | drink | sETin | *drink | 1401 | DRINK |
482109 | ZWAY-66-1 | missing | ZWAY | 66 | mEt"t"En | mEt"t"En | m a tʼ tʼ a n | missing | 10279 | missing | false | ^ m E t" t" E n $ | default | come | mEttEn | *come | 1446 | COME |
482110 | ZWAY-72-1 | missing | ZWAY | 72 | Erit | Erit | a r i t | missing | 10279 | missing | false | ^ E r i t $ | default | sun | Erit | *sun | 1343 | SUN |
482111 | ZWAY-74-1 | missing | ZWAY | 74 | kokEb | kokEb | k o k a b | missing | 10279 | missing | false | ^ k o k E b $ | default | star | kokEb | *star | 1430 | STAR |
482112 | ZWAY-75-1 | missing | ZWAY | 75 | mEi | mEi | m a i | missing | 10279 | missing | false | ^ m E i $ | default | water | mEi | *water | 948 | WATER |
482113 | ZWAY-77-1 | missing | ZWAY | 77 | umon | umon | u m o n | missing | 10279 | missing | false | ^ u m o n $ | default | stone | umon | *stone | 857 | STONE |
482114 | ZWAY-82-1 | missing | ZWAY | 82 | yirE | yirE | j i r a | missing | 10279 | missing | false | ^ y i r E $ | default | fire | yirE | *fire | 221 | FIRE |
482115 | ZWAY-86-1 | missing | ZWAY | 86 | sEri | sEri | s a r i | missing | 10279 | missing | false | ^ s E r i $ | default | mountain | sEri | *mountain | 639 | MOUNTAIN |
482116 | ZWAY-92-1 | missing | ZWAY | 92 | Erut | Erut | a r u t | missing | 10279 | missing | false | ^ E r u t $ | default | night | Erut | *night | 1233 | NIGHT |
482117 | ZWAY-96-1 | missing | ZWAY | 96 | woirE | woirE | w o i r a | missing | 10279 | missing | false | ^ w o i r E $ | default | new | woirE | *new | 1231 | NEW |
482118 | ZWAY-100-1 | missing | ZWAY | 100 | s3m | s3m | s ə m | missing | 10279 | missing | false | ^ s 3 m $ | default | name | s3m | *name | 1405 | NAME |
Removing the doculects whose WALS family code is `Oth`.
# Display the languages table.
# NOTE(review): the table printed here has 9835 rows, whereas `longname` was
# computed for 9912 rows, yet no filtering statement is visible in this part of
# the file — confirm where the `Oth` rows are actually dropped.
languages
Row | ID | Name | Glottocode | Glottolog_Name | ISO639P3code | Macroarea | Latitude | Longitude | Family | classification_wals | classification_ethnologue | classification_glottolog | recently_extinct | long_extinct | year_of_extinction | code_wals | code_iso | transcribers | longname |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
String | String | String15? | String? | String3? | String15? | Float64? | Float64? | String31? | String | String | String | Bool | Bool | Int64? | String3? | String3? | String? | String | |
1 | A51_BAFIA_MAJA | A51_BAFIA_MAJA | lefa1242 | Lefa | lfa | Africa | 5.1 | 11.2 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia | false | false | missing | missing | lfa | Ann-Katrin Wett | NC.BANTU.A51_BAFIA_MAJA |
2 | A51_BAFIA_TUMI_TINGON | A51_BAFIA_TUMI_TINGON | lefa1242 | Lefa | lfa | Africa | 5.1 | 11.2 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia | false | false | missing | missing | lfa | Ann-Katrin Wett | NC.BANTU.A51_BAFIA_TUMI_TINGON |
3 | A51_BAFIA_ZAKAAN | A51_BAFIA_ZAKAAN | lefa1242 | Lefa | lfa | Africa | 5.1 | 11.2 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia | false | false | missing | missing | lfa | Ann-Katrin Wett | NC.BANTU.A51_BAFIA_ZAKAAN |
4 | A53_BAFIA_RIKPA | A53_BAFIA_RIKPA | bafi1243 | Bafia | ksf | Africa | 5.0 | 11.17 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.53) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia | false | false | missing | bfi | ksf | Ann-Katrin Wett | NC.BANTU.A53_BAFIA_RIKPA |
5 | A54_BAFIA_NJANTI | A54_BAFIA_NJANTI | tibe1274 | Tibea | ngy | Africa | 5.3 | 11.3 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.54) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50) | false | false | missing | missing | ngy | Ann-Katrin Wett | NC.BANTU.A54_BAFIA_NJANTI |
6 | A60_GUNU | A60_GUNU | nugu1242 | Nugunu (Cameroon) | yas | Africa | 4.58 | 11.25 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.622) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60),Mmala-Elip-Gunu,Elip-Gunu | false | false | missing | gun | yas | Ann-Katrin Wett | NC.BANTU.A60_GUNU |
7 | A60_MMAALA | A60_MMAALA | mmaa1238 | Mmaala | mmu | Africa | 4.5 | 11.08 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.62) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60),Mmala-Elip-Gunu | false | false | missing | missing | mmu | Ann-Katrin Wett | NC.BANTU.A60_MMAALA |
8 | A61_NGORO_ASOM | A61_NGORO_ASOM | tuki1240 | Tuki | bag | Africa | 4.58 | 11.5 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.601) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Sanaga-WestMbam(A.40),Sanaga(A.60) | false | false | missing | tki | bag | Ann-Katrin Wett | NC.BANTU.A61_NGORO_ASOM |
9 | A62_KALONGE | A62_KALONGE | yang1293 | Yangben | yav | Africa | 4.43 | 11.08 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.62) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60) | false | false | missing | missing | yav | Ann-Katrin Wett | NC.BANTU.A62_KALONGE |
10 | A72a_EWONDO | A72a_EWONDO | ewon1239 | Ewondo | ewo | Africa | 4.0 | 12.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Ewondo-Fang(A.72) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Basaa-Yaunde(A40-70),Yaunde-Fang(A.70),Ewondo-Bebele | false | false | missing | ewo | ewo | Ann-Katrin Wett | NC.BANTU.A72a_EWONDO |
11 | AASAX | AASAX | aasa1238 | Aasax | aas | Africa | -4.04 | 37.16 | Afro-Asiatic | AA.SOUTHERN_CUSHITIC | Afro-Asiatic,Cushitic,South | Afro-Asiatic,Cushitic,SouthCushitic | true | false | 2010 | missing | aas | Darja Appelganz | AA.SOUTHERN_CUSHITIC.AASAX |
12 | ABAGA | ABAGA | abag1245 | Abaga | abg | Papunesia | -6.17 | 145.67 | Nuclear Trans New Guinea | TNG.SIANE-YAGARIA | Trans-NewGuinea,Madang,Kalam-Kobon | Nuclear_Trans_New_Guinea,Kainantu-Goroka,Goroka,NuclearGoroka,Siane-Yagaria,Kamano-Yagaria,UnclassifiedKamano-Yagaria | false | false | missing | missing | abg | Matthew S. Dryer and Søren Wichmann | TNG.SIANE_YAGARIA.ABAGA |
13 | ABANYOM | ABANYOM | aban1242 | Abanyom | abm | Africa | 6.29 | 8.63 | Atlantic-Congo | NC.EKOID-MBE | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,Ekoid,Bakor | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,Ekoid-Mbe,Ekoid,Bakor-Ejagham,Bakor,NorthernBakor,Abanyom-Nkem-Nkum | false | false | missing | missing | abm | Guillaume Segerer and Søren Wichmann | NC.EKOID_MBE.ABANYOM |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
9824 | ZOOMBO_4 | ZOOMBO_4 | koon1244 | South-Central Koongo | kng | Africa | -5.0 | 15.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,H,Kikongo(H.16) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Central-WesternBantu,West-CoastalBantu,Nzadic,Lweric,Dingic,Loange-Atlantic,KLCExtended,KikongoLanguageCluster,NuclearKLC,Kikongoic,KambakunyicKikongo,KilaadicKikongo,Central-SouthernKikongo,SoutheasternKikongo,SouthernKikongo,Koongo-Kituba | false | false | missing | fio | kng | Ann-Katrin Wett | NC.BANTU.ZOOMBO_4 |
9825 | ZOQUE_FRANCISCO_LEON | ZOQUE_FRANCISCO_LEON | fran1266 | Francisco León Zoque | zos | North America | 17.33 | -93.25 | Mixe-Zoque | MZ.MIXE-ZOQUE | Mixe-Zoquean,Zoquean,ChiapasZoquean | Mixe-Zoque,Zoque,ChiapasZoque | false | false | missing | zfl | zos | Søren Wichmann and Viveka Velupillai | MZ.MIXE_ZOQUE.ZOQUE_FRANCISCO_LEON |
9826 | ZOQUE_RAYON | ZOQUE_RAYON | rayo1235 | Rayón Zoque | zor | North America | 17.08 | -93.0 | Mixe-Zoque | MZ.MIXE-ZOQUE | Mixe-Zoquean,Zoquean,ChiapasZoquean,NortheastZoque | Mixe-Zoque,Zoque,ChiapasZoque | false | false | missing | zqr | zor | Søren Wichmann and Viveka Velupillai | MZ.MIXE_ZOQUE.ZOQUE_RAYON |
9827 | ZOROP | ZOROP | yafi1240 | Yafi | wfg | Papunesia | -3.42 | 140.92 | Pauwasi | Pau.EASTERN_PAUWASI | Pauwasi,Eastern | Pauwasi,EasternPauwasi | false | false | missing | missing | wfg | Matthew S. Dryer | Pau.EASTERN_PAUWASI.ZOROP |
9828 | ZUGUNUK_KALASHA | ZUGUNUK_KALASHA | kala1372 | Chitral Kalasha | kls | Eurasia | 35.49 | 71.7 | Indo-European | IE.INDIC | Indo-European,Indo-Iranian,Indo-Aryan,OuterLanguages,Northwestern,Dardic,Chitral | Indo-European,ClassicalIndo-European,Indo-Iranian,Indo-Aryan,Indo-AryanNorthwesternzone,Chitral | false | false | missing | klh | kls | missing | IE.INDIC.ZUGUNUK_KALASHA |
9829 | ZULGO | ZULGO | zulg1242 | Zulgo-Gemzek | gnd | Africa | 10.83 | 14.08 | Afro-Asiatic | AA.BIU-MANDARA | Afro-Asiatic,Chadic,Biu-Mandara,A,A.5 | Afro-Asiatic,Chadic,Biu-Mandara,NorthBiu-Mandara,Margi-Mandara-Mofu,Mofuic,Meri | false | false | missing | missing | gnd | Ann-Katrin Wett | AA.BIU_MANDARA.ZULGO |
9830 | ZULU | ZULU | zulu1248 | Zulu | zul | Africa | -30.0 | 30.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa | false | false | missing | zno | zul | Cecil H. Brown | NC.BANTU.ZULU |
9831 | ZULU_2 | ZULU_2 | zulu1248 | Zulu | zul | Africa | -30.0 | 30.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa | false | false | missing | zno | zul | Ann-Katrin Wett | NC.BANTU.ZULU_2 |
9832 | ZULU_NKANDLA | ZULU_NKANDLA | zulu1248 | Zulu | zul | Africa | -30.0 | 30.0 | Atlantic-Congo | NC.BANTU | Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Central,S,Nguni(S.42) | Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,EastBantu,SouthernBantu-Makua,Nguni-Tsonga,Nguni(S.40),SouthernNdebele-Lowland,Zulu-Xhosa | false | false | missing | zno | zul | Ann-Katrin Wett | NC.BANTU.ZULU_NKANDLA |
9833 | ZUMBUN | ZUMBUN | zumb1240 | Zumbun | jmb | Africa | 10.8 | 10.0 | Afro-Asiatic | AA.WEST_CHADIC | Afro-Asiatic,Chadic,West,B,B.2 | Afro-Asiatic,Chadic,WestChadic,WestChadicB,WestChadicB.2 | false | false | missing | missing | jmb | Julia Bischoffberger | AA.WEST_CHADIC.ZUMBUN |
9834 | ZUNI | ZUNI | zuni1245 | Zuni | zun | North America | 35.08 | -108.83 | Zuni | Zun.ZUNI | Languageisolate_zun | Zuni | false | false | missing | zun | zun | Cecil H. Brown | Zun.ZUNI.ZUNI |
9835 | ZWAY | ZWAY | zayy1238 | Zay | zwa | Africa | 7.93 | 38.83 | Afro-Asiatic | AA.SEMITIC | Afro-Asiatic,Semitic,South,Ethiopian,South,Transversal,Harari-EastGurage | Afro-Asiatic,Semitic,WestSemitic,Ethiosemitic,SouthEthiopic,TransversalSouthEthiopic,Harari-EastGurage | false | false | missing | missing | zwa | Cecil H. Brown and Dmitry Egorov | AA.SEMITIC.ZWAY |
Calculating for each concept the number of doculects where it appears and singling out the 40 concepts with the best coverage.
# Count, for every concept, the number of distinct doculects that have at least
# one form for it, ordered from best to worst coverage.
conceptCoverage = sort(
    combine(
        groupby(
            unique(forms, [:Language_ID, :Concepticon_Gloss]),
            :Concepticon_Gloss,
        ),
        nrow,
    ),
    :nrow;
    rev = true,
)
# Keep the 40 best-covered concepts and restrict the forms to them.
concepts = conceptCoverage.Concepticon_Gloss[1:40]
forms40 = forms[in.(forms.Concepticon_Gloss, Ref(concepts)), :]
Row | ID | Local_ID | Language_ID | Parameter_ID | Value | Form | Segments | Comment | Source | Cognacy | Loan | Graphemes | Profile | gloss_in_source | simplified | Name | Concepticon_ID | Concepticon_Gloss |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
String | Missing | String | Int64 | String31 | String31 | String | String? | String15? | Missing | Bool | String | String7 | String15 | String | String15 | Int64 | String31 | |
1 | A51_BAFIA_MAJA-11-1 | missing | A51_BAFIA_MAJA | 11 | fo7 | fo7 | f o ʔ | missing | 2 | missing | false | ^ f o 7 $ | default | one | fo7 | *one | 1493 | ONE |
2 | A51_BAFIA_MAJA-12-1 | missing | A51_BAFIA_MAJA | 12 | b"yE | b"yE | b’ j a | missing | 2 | missing | false | ^ b" y E $ | default | two | byE | *two | 1498 | TWO |
3 | A51_BAFIA_MAJA-18-1 | missing | A51_BAFIA_MAJA | 18 | mum | mum | m u m | missing | 2 | missing | false | ^ m u m $ | default | person | mum | *person | 683 | PERSON |
4 | A51_BAFIA_MAJA-18-2 | missing | A51_BAFIA_MAJA | 18 | b"um | b"um | b’ u m | missing | 2 | missing | false | ^ b" u m $ | default | person | bum | *person | 683 | PERSON |
5 | A51_BAFIA_MAJA-19-1 | missing | A51_BAFIA_MAJA | 19 | zEy | zEy | z a j | missing | 2 | missing | false | ^ z E y $ | default | fish | zEy | *fish | 227 | FISH |
6 | A51_BAFIA_MAJA-21-1 | missing | A51_BAFIA_MAJA | 21 | b3 | b3 | b ə | missing | 2 | missing | false | ^ b 3 $ | default | dog | b3 | *dog | 2009 | DOG |
7 | A51_BAFIA_MAJA-22-1 | missing | A51_BAFIA_MAJA | 22 | TEy | TEy | c a j | missing | 2 | missing | false | ^ T E y $ | default | louse | TEy | *louse | 1392 | LOUSE |
8 | A51_BAFIA_MAJA-22-2 | missing | A51_BAFIA_MAJA | 22 | b"yey | b"yey | b’ j e j | missing | 2 | missing | false | ^ b" y e y $ | default | louse | byey | *louse | 1392 | LOUSE |
9 | A51_BAFIA_MAJA-23-1 | missing | A51_BAFIA_MAJA | 23 | t3 | t3 | t ə | missing | 2 | missing | false | ^ t 3 $ | default | tree | t3 | *tree | 906 | TREE |
10 | A51_BAFIA_MAJA-25-1 | missing | A51_BAFIA_MAJA | 25 | fyeyN | fyeyN | f j e j ŋ | missing | 2 | missing | false | ^ f y e y N $ | default | leaf | fyeyN | *leaf | 628 | LEAF |
11 | A51_BAFIA_MAJA-25-2 | missing | A51_BAFIA_MAJA | 25 | fyey | fyey | f j e j | missing | 2 | missing | false | ^ f y e y $ | default | leaf | fyey | *leaf | 628 | LEAF |
12 | A51_BAFIA_MAJA-25-3 | missing | A51_BAFIA_MAJA | 25 | tyey | tyey | t j e j | missing | 2 | missing | false | ^ t y e y $ | default | leaf | tyey | *leaf | 628 | LEAF |
13 | A51_BAFIA_MAJA-28-1 | missing | A51_BAFIA_MAJA | 28 | kwokw3 | kwokw3 | k w o k w ə | missing | 2 | missing | false | ^ k w o k w 3 $ | default | skin | kwokw3 | *skin | 763 | SKIN |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
371619 | ZWAY-53-1 | missing | ZWAY | 53 | gobut | gobut | g o b u t | missing | 10279 | missing | false | ^ g o b u t $ | default | liver | gobut | *liver | 1224 | LIVER |
371620 | ZWAY-54-1 | missing | ZWAY | 54 | sETin | sETin | s a c i n | missing | 10279 | missing | false | ^ s E T i n $ | default | drink | sETin | *drink | 1401 | DRINK |
371621 | ZWAY-66-1 | missing | ZWAY | 66 | mEt"t"En | mEt"t"En | m a tʼ tʼ a n | missing | 10279 | missing | false | ^ m E t" t" E n $ | default | come | mEttEn | *come | 1446 | COME |
371622 | ZWAY-72-1 | missing | ZWAY | 72 | Erit | Erit | a r i t | missing | 10279 | missing | false | ^ E r i t $ | default | sun | Erit | *sun | 1343 | SUN |
371623 | ZWAY-74-1 | missing | ZWAY | 74 | kokEb | kokEb | k o k a b | missing | 10279 | missing | false | ^ k o k E b $ | default | star | kokEb | *star | 1430 | STAR |
371624 | ZWAY-75-1 | missing | ZWAY | 75 | mEi | mEi | m a i | missing | 10279 | missing | false | ^ m E i $ | default | water | mEi | *water | 948 | WATER |
371625 | ZWAY-77-1 | missing | ZWAY | 77 | umon | umon | u m o n | missing | 10279 | missing | false | ^ u m o n $ | default | stone | umon | *stone | 857 | STONE |
371626 | ZWAY-82-1 | missing | ZWAY | 82 | yirE | yirE | j i r a | missing | 10279 | missing | false | ^ y i r E $ | default | fire | yirE | *fire | 221 | FIRE |
371627 | ZWAY-86-1 | missing | ZWAY | 86 | sEri | sEri | s a r i | missing | 10279 | missing | false | ^ s E r i $ | default | mountain | sEri | *mountain | 639 | MOUNTAIN |
371628 | ZWAY-92-1 | missing | ZWAY | 92 | Erut | Erut | a r u t | missing | 10279 | missing | false | ^ E r u t $ | default | night | Erut | *night | 1233 | NIGHT |
371629 | ZWAY-96-1 | missing | ZWAY | 96 | woirE | woirE | w o i r a | missing | 10279 | missing | false | ^ w o i r E $ | default | new | woirE | *new | 1231 | NEW |
371630 | ZWAY-100-1 | missing | ZWAY | 100 | s3m | s3m | s ə m | missing | 10279 | missing | false | ^ s 3 m $ | default | name | s3m | *name | 1405 | NAME |
Computing the number of covered concepts for each language. Only doculects with at least 30 concept entries are considered further.
# Count, for every doculect, the number of distinct concepts it has at least
# one form for. Rows are de-duplicated on (Language_ID, Concepticon_Gloss)
# first, so several forms for the same concept are counted only once. The
# result has one row per doculect with the count in column `nrow`, sorted
# from highest to lowest coverage.
languageCoverage = @pipe forms40 |>
    unique(_, [:Language_ID, :Concepticon_Gloss]) |>
    groupby(_, :Language_ID) |>
    combine(_, nrow) |>
    sort(_, :nrow, rev = true)

# IDs of the doculects that cover at least 30 concepts.
doculects = languageCoverage.Language_ID[languageCoverage.nrow .>= 30]

# Keep only the forms belonging to those doculects. Membership is tested
# against a `Set` (wrapped in `Ref` so it is not broadcast over) instead of
# scanning the `doculects` vector once per row; the selected rows and their
# order are unchanged.
forms40 = forms40[in.(forms40.Language_ID, Ref(Set(doculects))), :]
Row | ID | Local_ID | Language_ID | Parameter_ID | Value | Form | Segments | Comment | Source | Cognacy | Loan | Graphemes | Profile | gloss_in_source | simplified | Name | Concepticon_ID | Concepticon_Gloss |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
String | Missing | String | Int64 | String31 | String31 | String | String? | String15? | Missing | Bool | String | String7 | String15 | String | String15 | Int64 | String31 | |
1 | A51_BAFIA_MAJA-11-1 | missing | A51_BAFIA_MAJA | 11 | fo7 | fo7 | f o ʔ | missing | 2 | missing | false | ^ f o 7 $ | default | one | fo7 | *one | 1493 | ONE |
2 | A51_BAFIA_MAJA-12-1 | missing | A51_BAFIA_MAJA | 12 | b"yE | b"yE | b’ j a | missing | 2 | missing | false | ^ b" y E $ | default | two | byE | *two | 1498 | TWO |
3 | A51_BAFIA_MAJA-18-1 | missing | A51_BAFIA_MAJA | 18 | mum | mum | m u m | missing | 2 | missing | false | ^ m u m $ | default | person | mum | *person | 683 | PERSON |
4 | A51_BAFIA_MAJA-18-2 | missing | A51_BAFIA_MAJA | 18 | b"um | b"um | b’ u m | missing | 2 | missing | false | ^ b" u m $ | default | person | bum | *person | 683 | PERSON |
5 | A51_BAFIA_MAJA-19-1 | missing | A51_BAFIA_MAJA | 19 | zEy | zEy | z a j | missing | 2 | missing | false | ^ z E y $ | default | fish | zEy | *fish | 227 | FISH |
6 | A51_BAFIA_MAJA-21-1 | missing | A51_BAFIA_MAJA | 21 | b3 | b3 | b ə | missing | 2 | missing | false | ^ b 3 $ | default | dog | b3 | *dog | 2009 | DOG |
7 | A51_BAFIA_MAJA-22-1 | missing | A51_BAFIA_MAJA | 22 | TEy | TEy | c a j | missing | 2 | missing | false | ^ T E y $ | default | louse | TEy | *louse | 1392 | LOUSE |
8 | A51_BAFIA_MAJA-22-2 | missing | A51_BAFIA_MAJA | 22 | b"yey | b"yey | b’ j e j | missing | 2 | missing | false | ^ b" y e y $ | default | louse | byey | *louse | 1392 | LOUSE |
9 | A51_BAFIA_MAJA-23-1 | missing | A51_BAFIA_MAJA | 23 | t3 | t3 | t ə | missing | 2 | missing | false | ^ t 3 $ | default | tree | t3 | *tree | 906 | TREE |
10 | A51_BAFIA_MAJA-25-1 | missing | A51_BAFIA_MAJA | 25 | fyeyN | fyeyN | f j e j ŋ | missing | 2 | missing | false | ^ f y e y N $ | default | leaf | fyeyN | *leaf | 628 | LEAF |
11 | A51_BAFIA_MAJA-25-2 | missing | A51_BAFIA_MAJA | 25 | fyey | fyey | f j e j | missing | 2 | missing | false | ^ f y e y $ | default | leaf | fyey | *leaf | 628 | LEAF |
12 | A51_BAFIA_MAJA-25-3 | missing | A51_BAFIA_MAJA | 25 | tyey | tyey | t j e j | missing | 2 | missing | false | ^ t y e y $ | default | leaf | tyey | *leaf | 628 | LEAF |
13 | A51_BAFIA_MAJA-28-1 | missing | A51_BAFIA_MAJA | 28 | kwokw3 | kwokw3 | k w o k w ə | missing | 2 | missing | false | ^ k w o k w 3 $ | default | skin | kwokw3 | *skin | 763 | SKIN |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
315832 | ZWAY-53-1 | missing | ZWAY | 53 | gobut | gobut | g o b u t | missing | 10279 | missing | false | ^ g o b u t $ | default | liver | gobut | *liver | 1224 | LIVER |
315833 | ZWAY-54-1 | missing | ZWAY | 54 | sETin | sETin | s a c i n | missing | 10279 | missing | false | ^ s E T i n $ | default | drink | sETin | *drink | 1401 | DRINK |
315834 | ZWAY-66-1 | missing | ZWAY | 66 | mEt"t"En | mEt"t"En | m a tʼ tʼ a n | missing | 10279 | missing | false | ^ m E t" t" E n $ | default | come | mEttEn | *come | 1446 | COME |
315835 | ZWAY-72-1 | missing | ZWAY | 72 | Erit | Erit | a r i t | missing | 10279 | missing | false | ^ E r i t $ | default | sun | Erit | *sun | 1343 | SUN |
315836 | ZWAY-74-1 | missing | ZWAY | 74 | kokEb | kokEb | k o k a b | missing | 10279 | missing | false | ^ k o k E b $ | default | star | kokEb | *star | 1430 | STAR |
315837 | ZWAY-75-1 | missing | ZWAY | 75 | mEi | mEi | m a i | missing | 10279 | missing | false | ^ m E i $ | default | water | mEi | *water | 948 | WATER |
315838 | ZWAY-77-1 | missing | ZWAY | 77 | umon | umon | u m o n | missing | 10279 | missing | false | ^ u m o n $ | default | stone | umon | *stone | 857 | STONE |
315839 | ZWAY-82-1 | missing | ZWAY | 82 | yirE | yirE | j i r a | missing | 10279 | missing | false | ^ y i r E $ | default | fire | yirE | *fire | 221 | FIRE |
315840 | ZWAY-86-1 | missing | ZWAY | 86 | sEri | sEri | s a r i | missing | 10279 | missing | false | ^ s E r i $ | default | mountain | sEri | *mountain | 639 | MOUNTAIN |
315841 | ZWAY-92-1 | missing | ZWAY | 92 | Erut | Erut | a r u t | missing | 10279 | missing | false | ^ E r u t $ | default | night | Erut | *night | 1233 | NIGHT |
315842 | ZWAY-96-1 | missing | ZWAY | 96 | woirE | woirE | w o i r a | missing | 10279 | missing | false | ^ w o i r E $ | default | new | woirE | *new | 1231 | NEW |
315843 | ZWAY-100-1 | missing | ZWAY | 100 | s3m | s3m | s ə m | missing | 10279 | missing | false | ^ s 3 m $ | default | name | s3m | *name | 1405 | NAME |
Creating the word list with these restrictions by attaching each doculect's
`longname`; doculects whose ID contains "PROTO" are thrown out.
# Attach the `longname` of every doculect to its forms (matching the forms'
# `Language_ID` against the languages' `Name`), then discard every row whose
# `Language_ID` contains the substring "PROTO".
asjpLong = filter(
    :Language_ID => id -> !occursin("PROTO", id),
    innerjoin(
        forms40,
        select(languages, [:Name, :longname]);
        on = :Language_ID => :Name,
    ),
)
Row | ID | Local_ID | Language_ID | Parameter_ID | Value | Form | Segments | Comment | Source | Cognacy | Loan | Graphemes | Profile | gloss_in_source | simplified | Name | Concepticon_ID | Concepticon_Gloss | longname |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
String | Missing | String | Int64 | String31 | String31 | String | String? | String15? | Missing | Bool | String | String7 | String15 | String | String15 | Int64 | String31 | String | |
1 | A51_BAFIA_MAJA-11-1 | missing | A51_BAFIA_MAJA | 11 | fo7 | fo7 | f o ʔ | missing | 2 | missing | false | ^ f o 7 $ | default | one | fo7 | *one | 1493 | ONE | NC.BANTU.A51_BAFIA_MAJA |
2 | A51_BAFIA_MAJA-12-1 | missing | A51_BAFIA_MAJA | 12 | b"yE | b"yE | b’ j a | missing | 2 | missing | false | ^ b" y E $ | default | two | byE | *two | 1498 | TWO | NC.BANTU.A51_BAFIA_MAJA |
3 | A51_BAFIA_MAJA-18-1 | missing | A51_BAFIA_MAJA | 18 | mum | mum | m u m | missing | 2 | missing | false | ^ m u m $ | default | person | mum | *person | 683 | PERSON | NC.BANTU.A51_BAFIA_MAJA |
4 | A51_BAFIA_MAJA-18-2 | missing | A51_BAFIA_MAJA | 18 | b"um | b"um | b’ u m | missing | 2 | missing | false | ^ b" u m $ | default | person | bum | *person | 683 | PERSON | NC.BANTU.A51_BAFIA_MAJA |
5 | A51_BAFIA_MAJA-19-1 | missing | A51_BAFIA_MAJA | 19 | zEy | zEy | z a j | missing | 2 | missing | false | ^ z E y $ | default | fish | zEy | *fish | 227 | FISH | NC.BANTU.A51_BAFIA_MAJA |
6 | A51_BAFIA_MAJA-21-1 | missing | A51_BAFIA_MAJA | 21 | b3 | b3 | b ə | missing | 2 | missing | false | ^ b 3 $ | default | dog | b3 | *dog | 2009 | DOG | NC.BANTU.A51_BAFIA_MAJA |
7 | A51_BAFIA_MAJA-22-1 | missing | A51_BAFIA_MAJA | 22 | TEy | TEy | c a j | missing | 2 | missing | false | ^ T E y $ | default | louse | TEy | *louse | 1392 | LOUSE | NC.BANTU.A51_BAFIA_MAJA |
8 | A51_BAFIA_MAJA-22-2 | missing | A51_BAFIA_MAJA | 22 | b"yey | b"yey | b’ j e j | missing | 2 | missing | false | ^ b" y e y $ | default | louse | byey | *louse | 1392 | LOUSE | NC.BANTU.A51_BAFIA_MAJA |
9 | A51_BAFIA_MAJA-23-1 | missing | A51_BAFIA_MAJA | 23 | t3 | t3 | t ə | missing | 2 | missing | false | ^ t 3 $ | default | tree | t3 | *tree | 906 | TREE | NC.BANTU.A51_BAFIA_MAJA |
10 | A51_BAFIA_MAJA-25-1 | missing | A51_BAFIA_MAJA | 25 | fyeyN | fyeyN | f j e j ŋ | missing | 2 | missing | false | ^ f y e y N $ | default | leaf | fyeyN | *leaf | 628 | LEAF | NC.BANTU.A51_BAFIA_MAJA |
11 | A51_BAFIA_MAJA-25-2 | missing | A51_BAFIA_MAJA | 25 | fyey | fyey | f j e j | missing | 2 | missing | false | ^ f y e y $ | default | leaf | fyey | *leaf | 628 | LEAF | NC.BANTU.A51_BAFIA_MAJA |
12 | A51_BAFIA_MAJA-25-3 | missing | A51_BAFIA_MAJA | 25 | tyey | tyey | t j e j | missing | 2 | missing | false | ^ t y e y $ | default | leaf | tyey | *leaf | 628 | LEAF | NC.BANTU.A51_BAFIA_MAJA |
13 | A51_BAFIA_MAJA-28-1 | missing | A51_BAFIA_MAJA | 28 | kwokw3 | kwokw3 | k w o k w ə | missing | 2 | missing | false | ^ k w o k w 3 $ | default | skin | kwokw3 | *skin | 763 | SKIN | NC.BANTU.A51_BAFIA_MAJA |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
306813 | ZWAY-53-1 | missing | ZWAY | 53 | gobut | gobut | g o b u t | missing | 10279 | missing | false | ^ g o b u t $ | default | liver | gobut | *liver | 1224 | LIVER | AA.SEMITIC.ZWAY |
306814 | ZWAY-54-1 | missing | ZWAY | 54 | sETin | sETin | s a c i n | missing | 10279 | missing | false | ^ s E T i n $ | default | drink | sETin | *drink | 1401 | DRINK | AA.SEMITIC.ZWAY |
306815 | ZWAY-66-1 | missing | ZWAY | 66 | mEt"t"En | mEt"t"En | m a tʼ tʼ a n | missing | 10279 | missing | false | ^ m E t" t" E n $ | default | come | mEttEn | *come | 1446 | COME | AA.SEMITIC.ZWAY |
306816 | ZWAY-72-1 | missing | ZWAY | 72 | Erit | Erit | a r i t | missing | 10279 | missing | false | ^ E r i t $ | default | sun | Erit | *sun | 1343 | SUN | AA.SEMITIC.ZWAY |
306817 | ZWAY-74-1 | missing | ZWAY | 74 | kokEb | kokEb | k o k a b | missing | 10279 | missing | false | ^ k o k E b $ | default | star | kokEb | *star | 1430 | STAR | AA.SEMITIC.ZWAY |
306818 | ZWAY-75-1 | missing | ZWAY | 75 | mEi | mEi | m a i | missing | 10279 | missing | false | ^ m E i $ | default | water | mEi | *water | 948 | WATER | AA.SEMITIC.ZWAY |
306819 | ZWAY-77-1 | missing | ZWAY | 77 | umon | umon | u m o n | missing | 10279 | missing | false | ^ u m o n $ | default | stone | umon | *stone | 857 | STONE | AA.SEMITIC.ZWAY |
306820 | ZWAY-82-1 | missing | ZWAY | 82 | yirE | yirE | j i r a | missing | 10279 | missing | false | ^ y i r E $ | default | fire | yirE | *fire | 221 | FIRE | AA.SEMITIC.ZWAY |
306821 | ZWAY-86-1 | missing | ZWAY | 86 | sEri | sEri | s a r i | missing | 10279 | missing | false | ^ s E r i $ | default | mountain | sEri | *mountain | 639 | MOUNTAIN | AA.SEMITIC.ZWAY |
306822 | ZWAY-92-1 | missing | ZWAY | 92 | Erut | Erut | a r u t | missing | 10279 | missing | false | ^ E r u t $ | default | night | Erut | *night | 1233 | NIGHT | AA.SEMITIC.ZWAY |
306823 | ZWAY-96-1 | missing | ZWAY | 96 | woirE | woirE | w o i r a | missing | 10279 | missing | false | ^ w o i r E $ | default | new | woirE | *new | 1231 | NEW | AA.SEMITIC.ZWAY |
306824 | ZWAY-100-1 | missing | ZWAY | 100 | s3m | s3m | s ə m | missing | 10279 | missing | false | ^ s 3 m $ | default | name | s3m | *name | 1405 | NAME | AA.SEMITIC.ZWAY |
Reformatting the word list into wide format and saving it to disk.