Computational Typology
  • Home
  • About

Creating tree plots

load packages

library(repr)
options(repr.plot.width=15, repr.plot.height=15)
library(ape)
library(ggtree)
library(tidyverse)
library(RColorBrewer)
library(phangorn)
library(ggtreeExtra)
library(ggnewscale)
library(phytools)
library(gridExtra)
library(forcats)
library(cowplot)
ggtree v3.14.0 Learn more at https://yulab-smu.top/contribution-tree-data/

Please cite:

Guangchuang Yu.  Data Integration, Manipulation and Visualization of
Phylogenetic Trees (1st edition). Chapman and Hall/CRC. 2022,
doi:10.1201/9781003279242, ISBN: 9781032233574


Attaching package: ‘ggtree’


The following object is masked from ‘package:ape’:

    rotate


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ tidyr::expand() masks ggtree::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::where()  masks ape::where()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
ggtreeExtra v1.16.0 For help: https://yulab-smu.top/treedata-book/

If you use the ggtree package suite in published research, please cite
the appropriate paper(s):

S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
visualization of richly annotated phylogenetic data. Molecular Biology
and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166 

Loading required package: maps


Attaching package: ‘maps’


The following object is masked from ‘package:purrr’:

    map



Attaching package: ‘gridExtra’


The following object is masked from ‘package:dplyr’:

    combine



Attaching package: ‘cowplot’


The following object is masked from ‘package:lubridate’:

    stamp

load the tree

tree <- read.tree("../data/affix_adposition_longnames.tre")

load the meta data

asjp_languages = read_csv("../data/asjp_languages.csv")
Rows: 8953 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Name, Glottocode, longname

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

download ASJP

# Pakete laden
if (!requireNamespace("utils")) install.packages("utils")
if (!requireNamespace("curl")) install.packages("curl")

dir.create("tmp", showWarnings = FALSE)

languagesF <- "../data/asjp19_languages.csv"

if (!(file.exists(languagesF))) {
  # Herunterladen
  asjp_url <- "https://zenodo.org/record/3843469/files/lexibank/asjp-v19.1.zip"
  asjp_zip <- "tmp/asjp-19.1.zip"
  download.file(asjp_url, asjp_zip, mode = "wb")

  # Entpacken
  unzip(asjp_zip, exdir = "tmp")

  # Kopieren der benötigten Dateien
  file.copy("tmp/lexibank-asjp-0c18d44/cldf/languages.csv", languagesF, overwrite = TRUE)

  # Temporäre Dateien entfernen
  unlink("tmp", recursive = TRUE)
}
Loading required namespace: curl
asjp19_languages <- read_csv(languagesF)
asjp19_languages %>% head()
Rows: 10168 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): ID, Name, Glottocode, Glottolog_Name, ISO639P3code, Macroarea, Fam...
dbl  (3): Latitude, Longitude, year_of_extinction
lgl  (2): recently_extinct, long_extinct

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
A tibble: 6 × 18
ID Name Glottocode Glottolog_Name ISO639P3code Macroarea Latitude Longitude Family classification_wals classification_ethnologue classification_glottolog recently_extinct long_extinct year_of_extinction code_wals code_iso transcribers
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <lgl> <lgl> <dbl> <chr> <chr> <chr>
A51_BAFIA_MAJA A51_BAFIA_MAJA lefa1242 Lefa lfa Africa 5.10 11.20 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia FALSE FALSE NA NA lfa Ann-Katrin Wett
A51_BAFIA_TUMI_TINGON A51_BAFIA_TUMI_TINGON lefa1242 Lefa lfa Africa 5.10 11.20 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia FALSE FALSE NA NA lfa Ann-Katrin Wett
A51_BAFIA_ZAKAAN A51_BAFIA_ZAKAAN lefa1242 Lefa lfa Africa 5.10 11.20 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.51) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia FALSE FALSE NA NA lfa Ann-Katrin Wett
A53_BAFIA_RIKPA A53_BAFIA_RIKPA bafi1243 Bafia ksf Africa 5.00 11.17 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.53) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50),NuclearBafia(A.50),Lefa-Bafia FALSE FALSE NA bfi ksf Ann-Katrin Wett
A54_BAFIA_NJANTI A54_BAFIA_NJANTI tibe1274 Tibea ngy Africa 5.30 11.30 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Bafia(A.54) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,BantuA-B10-B20-B30,Bafia(A.50) FALSE FALSE NA NA ngy Ann-Katrin Wett
A60_GUNU A60_GUNU nugu1242 Nugunu (Cameroon) yas Africa 4.58 11.25 Atlantic-Congo NC.BANTU Niger-Congo,Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,Southern,NarrowBantu,Northwest,A,Sanaga(A.622) Atlantic-Congo,Volta-Congo,Benue-Congo,Bantoid,SouthernBantoid,NarrowBantu,Mbam-Bubi-Jarawan,Mbam,NuclearMbam,Bati-Mbure-Yambassa,Mbure-Yambassa,Yambassa(A.60),Mmala-Elip-Gunu,Elip-Gunu FALSE FALSE NA gun yas Ann-Katrin Wett

create metadata tibble

glottolog_names <- asjp19_languages %>%
    select(Glottocode, Glottolog_Name) %>%
    distinct()
meta <- asjp19_languages %>%
    drop_na(Glottocode) %>%
    select(Glottocode, Family, Macroarea, Latitude, Longitude) %>%
    distinct(Glottocode, .keep_all = TRUE) %>%
    right_join(asjp_languages, by = "Glottocode") %>%
    mutate(label = longname) %>%
    distinct(label, .keep_all = TRUE) %>%
    left_join(glottolog_names, by = "Glottocode")
meta %>% print(width = Inf)
# A tibble: 8,953 × 9
   Glottocode Family         Macroarea Latitude Longitude Name                 
   <chr>      <chr>          <chr>        <dbl>     <dbl> <chr>                
 1 lefa1242   Atlantic-Congo Africa        5.1       11.2 A51_BAFIA_MAJA       
 2 lefa1242   Atlantic-Congo Africa        5.1       11.2 A51_BAFIA_TUMI_TINGON
 3 lefa1242   Atlantic-Congo Africa        5.1       11.2 A51_BAFIA_ZAKAAN     
 4 lefa1242   Atlantic-Congo Africa        5.1       11.2 LEFA                 
 5 bafi1243   Atlantic-Congo Africa        5         11.2 A53_BAFIA_RIKPA      
 6 bafi1243   Atlantic-Congo Africa        5         11.2 BAFIA                
 7 bafi1243   Atlantic-Congo Africa        5         11.2 BAFIA_ROPE           
 8 tibe1274   Atlantic-Congo Africa        5.3       11.3 A54_BAFIA_NJANTI     
 9 tibe1274   Atlantic-Congo Africa        5.3       11.3 TIBEA                
10 nugu1242   Atlantic-Congo Africa        4.58      11.2 A60_GUNU             
   longname                         label                           
   <chr>                            <chr>                           
 1 NC.BANTOID.A51_BAFIA_MAJA        NC.BANTOID.A51_BAFIA_MAJA       
 2 NC.BANTOID.A51_BAFIA_TUMI_TINGON NC.BANTOID.A51_BAFIA_TUMI_TINGON
 3 NC.BANTOID.A51_BAFIA_ZAKAAN      NC.BANTOID.A51_BAFIA_ZAKAAN     
 4 NC.BANTOID.LEFA                  NC.BANTOID.LEFA                 
 5 NC.BANTOID.A53_BAFIA_RIKPA       NC.BANTOID.A53_BAFIA_RIKPA      
 6 NC.BANTOID.BAFIA                 NC.BANTOID.BAFIA                
 7 NC.BANTOID.BAFIA_ROPE            NC.BANTOID.BAFIA_ROPE           
 8 NC.BANTOID.A54_BAFIA_NJANTI      NC.BANTOID.A54_BAFIA_NJANTI     
 9 NC.BANTOID.TIBEA                 NC.BANTOID.TIBEA                
10 NC.BANTOID.A60_GUNU              NC.BANTOID.A60_GUNU             
   Glottolog_Name   
   <chr>            
 1 Lefa             
 2 Lefa             
 3 Lefa             
 4 Lefa             
 5 Bafia            
 6 Bafia            
 7 Bafia            
 8 Tibea            
 9 Tibea            
10 Nugunu (Cameroon)
# ℹ 8,943 more rows

prune tree

tree <- drop.tip(tree, setdiff(tree$tip.label, meta$label))

create tibble with node data

node_family_df <- tibble(node = 1:length(tree$tip.label), Family = meta$Family[match(tree$tip.label, meta$label)])

assign family labels to internal nodes

# Traverse the tree in postorder
postorder_nodes <- postorder(tree)

# Iterate through each node in postorder
for (node in postorder_nodes) {
    # Check if the node is an internal node
    if (node > length(tree$tip.label)) {
        # Get the descendants of the node
        descendants <- Descendants(tree, node, type = "tips")[[1]]
        
        # Get the families of the descendant tips, ignoring NA values
        descendant_families <- node_family_df %>%
            filter(node %in% descendants, !is.na(Family)) %>%
            pull(Family) %>%
            unique()
        
        # Assign the family value for the internal node
        family_value <- if (length(descendant_families) == 1) descendant_families else NA
        
        # Add a row for the internal node
        node_family_df <- node_family_df %>%
            add_row(node = node, Family = family_value)
    }
}

find 10 largest language families

family_freqs <- node_family_df %>%
    drop_na(Family) %>%
    group_by(Family) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) 

top10 <- family_freqs$Family[1:10]
node_family_df <- node_family_df %>%
    mutate(Family_top10 = ifelse(Family %in% top10, Family, "Other"))
node_family_df <- node_family_df %>%
  mutate(Family_top10 = ifelse(is.na(Family_top10), "Other", Family_top10)) %>%
  mutate(Family_top10 = factor(Family_top10, levels = c(as.character(top10), "Other")))
top10
  1. 'Austronesian'
  2. 'Indo-European'
  3. 'Atlantic-Congo'
  4. 'Afro-Asiatic'
  5. 'Sino-Tibetan'
  6. 'Nilotic'
  7. 'Nuclear Trans New Guinea'
  8. 'Uralic'
  9. 'Uto-Aztecan'
  10. 'Arawakan'

fix colorcode for the 10 largest families

#palette <- c(brewer.pal(8, "Dark2"), brewer.pal(3, "Set1")[1:2], "gray")
#names(palette) <- c(top10, "Other")
palette <- c(
  "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
  "#0072B2", "#D55E00", "#CC79A7", "#999999", 
  "#A6761D", "#E41A1C", "gray"
)
names(palette) <- c(top10, "Other")
palette_no_other <- palette[names(palette) != "Other"]

create tree plot

# Turn off the legend in the tree plot
p_tree <- ggtree(tree, layout = "circular", open.angle = 10, ladderize = TRUE, branch.length = "branch.length") %<+% node_family_df +
  geom_tree(aes(color = Family_top10), size = 1, show.legend = FALSE) +
  scale_color_manual(
    values = palette, 
    na.translate = FALSE
  ) +
  theme(
    plot.margin = margin(10, 10, 10, 10)
  )

# Create dummy data
legend_df <- data.frame(Family = names(palette_no_other))

# Use a constant x value to trigger fill-based legend
dummy_plot <- ggplot(legend_df, aes(x = 1, fill = Family)) +
  geom_bar() +
  scale_fill_manual(values = palette_no_other, name = "Family") +
  theme_void() +
  theme(
    legend.position = "right",
    legend.margin = margin(0, 0, 0, 0)
  )

# Extract the legend
p_legend <- cowplot::get_legend(dummy_plot)
Warning message in get_plot_component(plot, "guide-box"):
“Multiple components found; returning the first one. To return all, use `return_all = TRUE`.”
# Combine tree plot and legend with the legend overlapping the tree
p <- cowplot::ggdraw() +
  cowplot::draw_plot(p_tree, x = 0, y = 0, width = 1, height = 1) +  # Full tree plot
  cowplot::draw_plot(p_legend, x = 0.3, y = 0.25, width = 0.3, height = 0.5)  # Adjust legend position and size

# Save or display
print(p)

svg("../img/asjp_tree.svg", width = 12, height = 12, onefile = TRUE, family = "sans", bg = "white")
print(p)
dev.off()
pdf: 2

extract Indo-European languages and subtree

indoeuropean_labels <- meta %>%
    filter(Family == "Indo-European") %>%
    filter(label %in% tree$tip.label) %>%
    pull(label)

indoeuropean_subtree <- keep.tip(tree, indoeuropean_labels)
indoeuropean_labels <- gsub("_2", "", indoeuropean_labels)
indoeuropean_subtree$tip.label <- gsub("_2", "", indoeuropean_subtree$tip.label)
indoeuropean_labels <- gsub("_3", "", indoeuropean_labels)
indoeuropean_subtree$tip.label <- gsub("_3", "", indoeuropean_subtree$tip.label)
indoeuropean_labels <- sub("ITALIAN.*", "ITALIAN", indoeuropean_labels)
indoeuropean_subtree$tip.label <- sub("ITALIAN.*", "ITALIAN", indoeuropean_subtree$tip.label)
root_height <- max(nodeHeights(indoeuropean_subtree))
indoeuropean_subtree$edge.length <- indoeuropean_subtree$edge.length / root_height
indoeuropean_df <- tibble(
    longname = indoeuropean_subtree$tip.label,
    genus = factor(sapply(strsplit(indoeuropean_subtree$tip.label, "\\."), `[`, 2)),
    label = sapply(strsplit(indoeuropean_subtree$tip.label, "\\."), `[`, 3),
)
indoeuropean_subtree$tip.label <- sapply(strsplit(indoeuropean_subtree$tip.label, "\\."), `[`, 3)
indoeuropean_df <- indoeuropean_df %>%
  select(label, genus) %>%
  distinct() %>%
  mutate(genus = factor(genus))

assign genus labels to internal nodes

node_genus_df <- tibble(node = 1:length(indoeuropean_subtree$tip.label), genus = indoeuropean_df$genus[match(indoeuropean_subtree$tip.label, indoeuropean_df$label)])
# Traverse the tree in postorder
postorder_nodes <- postorder(indoeuropean_subtree)

# Iterate through each node in postorder
for (node in postorder_nodes) {
    # Check if the node is an internal node
    if (node > length(indoeuropean_subtree$tip.label)) {
        # Get the descendants of the node
        descendants <- Descendants(indoeuropean_subtree, node, type = "tips")[[1]]
        
        # Get the genera of the descendant tips, ignoring NA values
        descendant_genera <- node_genus_df %>%
            filter(node %in% descendants, !is.na(genus)) %>%
            pull(genus) %>%
            unique()
        
        # Assign the genus value for the internal node
        genus_value <- if (length(descendant_genera) == 1) descendant_genera else NA
        
        # Add a row for the internal node
        node_genus_df <- node_genus_df %>%
            add_row(node = node, genus = genus_value)
    }
}
node_genus_df <- node_genus_df %>%
  mutate(genus = factor(genus))
get_mother_genus <- function(nd) {
    if (!(nd %in% indoeuropean_subtree$edge[, 2])) {
        return(NA)
    }
    mother <- indoeuropean_subtree$edge[indoeuropean_subtree$edge[, 2] == nd, 1]
    
    
    # Check if the parent node exists in node_genus_df
    if (!(mother %in% node_genus_df$node)) {
        return(NA)
    }
    
    # Retrieve the genus of the parent node
    mother_genus <- node_genus_df$genus[node_genus_df$node == mother]
    
    # Return the genus as a character (handle factors properly)
    return(as.character(mother_genus))
}
Nnodes = max(indoeuropean_subtree$edge)

mother_genus_df <- tibble(node = 1:Nnodes, genus = sapply(1:Nnodes, get_mother_genus))
i <- 55
for (i in 1:length(indoeuropean_subtree$tip.label)) {
    gn <- indoeuropean_df$genus[indoeuropean_df$label == indoeuropean_subtree$tip.label[i]]
    if (length(gn) > 0) {
        mother_genus_df$genus[mother_genus_df$node == i] <- as.character(gn)
    }
}
mother_genus_df$genus[is.na(mother_genus_df$genus)] <- "Other"

Capitalize genus names

mother_genus_df <- mother_genus_df %>%
    mutate(genus = str_to_title(genus))

indoeuropean_df <- indoeuropean_df %>%
    mutate(genus = str_to_title(as.character(genus)))

color palette for genera

genus_palette <- c(
    ALBANIAN   = "#1B9E77",  # Dark green
    ARMENIAN   = "#D95F02",  # Dark orange
    CELTIC     = "#E7298A",  # Dark pink
    GERMANIC   = "#66A61E",  # Dark lime green
    INDIC      = "#A6761D",  # Brown
    IRANIAN    = "#666666",  # Gray
    ROMANCE    = "#D95F02",  # Dark orange (repeated for consistency)
    SLAVIC     = "#7570B3",   # Dark purple (repeated for consistency)
    Other      = "gray70"
)

names(genus_palette) <- str_to_title(names(genus_palette))
mother_genus_df$genus[mother_genus_df$genus == "Greek"] <- "Other"
mother_genus_df$genus[mother_genus_df$genus == "Baltic"] <- "Other"
name2gname <- meta %>% 
    select(Name, Glottolog_Name) %>% 
    distinct() %>% 
    drop_na() %>%
    deframe()
indoeuropean_subtree$tip.label <- name2gname[indoeuropean_subtree$tip.label]

create tree plot

p_indoeuropean <- ggtree(indoeuropean_subtree, layout = "rectangular", branch.length = "branch.length") %<+% mother_genus_df +
    geom_tree(aes(color = genus), size = 2) +
    geom_tiplab(size = 5, hjust = 0, align = TRUE, linesize = 0.3, offset = 0.02) +
    scale_color_manual(
        values = genus_palette, 
        name = "Genus", 
        na.translate = FALSE, 
        breaks = names(genus_palette)[names(genus_palette) != "Other"]  # Exclude "Other" from the legend
    ) +
    theme_tree2() +
    theme(
        legend.position = "inside",
        legend.position.inside = c(0.05, 0.95),
        legend.justification = c("left", "top"),
        legend.background = element_rect(fill = "white", color = "black"),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 12),
        plot.margin = margin(5, 5, 5, 5),
        
        # Remove scale bar / axis elements
        axis.line = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        axis.title = element_blank()
    ) + xlim(0, 1.25)
p_indoeuropean

svg("../img/indoeuropean_tree.svg", width = 12, height = 13, onefile = TRUE, family = "sans", bg = "white")
print(p_indoeuropean)
dev.off()
pdf: 2

save mapping from ASJP names to Glottolog names

saveRDS(name2gname, "../data/name2gname.rds")