############################################################################### ############ MORPHOMETRIC ANALYSIS OF DACTYLOBIOTUS R SCRIPT ################## ############################################################################### ########### Set general parameters for the analysis ########################### filename = "Data_PCA.xlsx" excel_sheet_animals_abs = 1 excel_sheet_animals_pt = 2 excel_sheet_eggs = 3 taxa_label = "Species" # Column containing the grouping factor (species) column_trait_ID = "_TRT" # String in the trait names used to extract them max_missing_trait = 33 # % of maximum missing data in a trait, if a trait has more missing data it is excluded from analyses ############################################################################### ## Load the libraries required library(readxl) # To read excel files library(ggplot2) # For plotting library(ggforce) # For plotting library(patchwork) # For plotting library(plyr) # For data manipulation and plotting library(pcaMethods) # for PCA with missing data ### Custom functions plot_scores = function(x, grouping){ require(plyr) require(ggplot2) cbPalette <- c("#F0E442", "#0072B2", "#D55E00", "#009E73","#CC79A7", "#999999", "#E69F00", "#56B4E9") scores = x@scores[,1:2] data_plot = cbind(data.frame(group = grouping), scores) var_expl = round(x@R2*100,2) # Create hulls for each group find_hull <- function(df) df[chull(df$PC1, df$PC2), ] hulls_groups <- ddply(data_plot, "Species", find_hull) # Plot the scores plot_scores = ggplot(data_plot)+ theme_bw()+ geom_shape(data = hulls_groups, aes(x=PC1,y=PC2,col=Species, fill = Species), alpha=0.05, radius = unit(0.025, 'npc'), expand = unit(0.025, 'npc'))+ geom_point(aes(x=PC1, y=PC2, fill = Species), size = 5, pch = 21, alpha = 0.75, col = "black")+ xlab(paste0("PC1 ", var_expl[1],"%"))+ ylab(paste0("PC2 ", var_expl[2],"%"))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = "bottom")+ coord_fixed(ratio = diff(range(data_plot$PC1))/diff(range(data_plot$PC2)))+ scale_fill_manual(values=cbPalette)+ scale_color_manual(values=cbPalette) return(plot_scores) } plot_loadings = function(x, claws_separate = T){ require(plyr) require(ggplot2) loadings = data.frame(x@loadings[,1:2]) loadings$variable = rownames(loadings) if(claws_separate == T){loadings$claw = grepl("Claw",loadings$variable, fixed = T)} if(claws_separate == F){loadings$claw = rep(FALSE, nrow(loadings))} var_expl = round(x@R2*100,2) max_loading = max(abs(loadings[,c("PC1","PC2")]))*1.1 plot_loadings = ggplot(loadings)+ theme_bw()+ theme(panel.grid=element_blank(),axis.title.x=element_text(size=10),axis.title.y=element_text(size=10))+ geom_hline(aes(yintercept=0),col="grey")+ geom_vline(aes(xintercept=0),col="grey")+ annotate("path",x=cos(seq(0,2*pi,length.out=100))*max_loading,y=sin(seq(0,2*pi,length.out=100))*max_loading,col="grey", alpha = 0)+ coord_fixed(ratio=1)+ geom_segment(aes(x=0,y=0,xend=PC1,yend=PC2, color = claw), show.legend = F, arrow = arrow(length = unit(0.02, "npc")), linewidth=unit(0.6, "npc"))+ geom_text(data = loadings[loadings$claw == F,], aes(x=PC1,y=PC2,label=variable),size=unit(4, "npc"))+ xlab(paste("PC1",var_expl[1],"%",sep=" "))+ ylab(paste("PC2",var_expl[2],"%",sep=" "))+ scale_color_manual(values = setNames(c("black","darkgrey"), c(FALSE, TRUE))) return(plot_loadings) } #### # Load the data AnimalsAb <- read_xlsx(filename, sheet = excel_sheet_animals_abs) AnimalsPt <- read_xlsx(filename, sheet = excel_sheet_animals_pt) Eggs <- read_xlsx(filename, sheet = excel_sheet_eggs) # Get species labels AnimalsLabsAb <- AnimalsAb[,taxa_label] AnimalsLabsPt <- AnimalsPt[,taxa_label] EggsLabs <- Eggs[,taxa_label] # Extract columns with traits AnimalsAb_traitcols = grepl(column_trait_ID, colnames(AnimalsAb), fixed = TRUE) AnimalsPt_traitcols = grepl(column_trait_ID, colnames(AnimalsPt), fixed = TRUE) Eggs_traitcols = grepl(column_trait_ID, colnames(Eggs), fixed = TRUE) # Extract tables with only traits and remove the trait marker string from their column names AnimalsAb_traits = AnimalsAb[,AnimalsAb_traitcols]; colnames(AnimalsAb_traits) = gsub(column_trait_ID,"",colnames(AnimalsAb_traits), fixed = T) AnimalsPt_traits = AnimalsPt[,AnimalsPt_traitcols]; colnames(AnimalsPt_traits) = gsub(column_trait_ID,"",colnames(AnimalsPt_traits), fixed = T) Eggs_traits = Eggs[,Eggs_traitcols]; colnames(Eggs_traits) = gsub(column_trait_ID,"",colnames(Eggs_traits), fixed = T) # Remove traits with too many missing data AnimalsAb_traits_tokeep = apply(AnimalsAb_traits, MARGIN = 2, FUN = function(x){(sum(is.na(x))/length(x))*100}) <= max_missing_trait AnimalsAb_traits = AnimalsAb_traits[,AnimalsAb_traits_tokeep] AnimalsPt_traits_tokeep = apply(AnimalsPt_traits, MARGIN = 2, FUN = function(x){(sum(is.na(x))/length(x))*100}) <= max_missing_trait AnimalsPt_traits = AnimalsPt_traits[,AnimalsPt_traits_tokeep] Eggs_traits_tokeep = apply(Eggs_traits, MARGIN = 2, FUN = function(x){(sum(is.na(x))/length(x))*100}) <= max_missing_trait Eggs_traits = Eggs_traits[,Eggs_traits_tokeep] # Scale the traits AnimalsAb_traits_scaled = data.frame(scale(AnimalsAb_traits)) AnimalsPt_traits_scaled = data.frame(scale(AnimalsPt_traits)) Eggs_traits_scaled = data.frame(scale(Eggs_traits)) # Perform PCA with missing data AnimalsAb_pca = pca(AnimalsAb_traits_scaled, # Data to analyze. nPcs = ncol(AnimalsAb_traits_scaled), # PC to retain (the same as the number of the original variable). seed = 12345, # To reproduce the analysis perfectly. method = "nipals") # An iterative fast method which is applicable also to data with missing values. AnimalsPt_pca = pca(AnimalsPt_traits_scaled, nPcs = ncol(AnimalsPt_traits_scaled), seed = 12345, method = "nipals") Eggs_pca = pca(Eggs_traits_scaled, nPcs = ncol(Eggs_traits_scaled), seed = 12345, method = "nipals") # Check the explained variance AnimalsAb_r2 = AnimalsAb_pca@R2*100 # Extract explained variance by component as %. AnimalsAb_cumr2 = AnimalsAb_pca@R2cum*100 # Extract cumulative explained variance by component as %. AnimalsPt_r2 = AnimalsPt_pca@R2*100 AnimalsPt_cumr2 = AnimalsPt_pca@R2cum*100 Eggs_r2 = Eggs_pca@R2*100 Eggs_cumr2 = Eggs_pca@R2cum*100 # The first 2 PCs explain more than 50% of the variance so lets focus only on them ## PLOT THE PCAs plot_animals_abs_scores = plot_scores(AnimalsAb_pca, AnimalsLabsAb) + ggtitle("Animals - absolute values") plot_animals_abs_loadings = plot_loadings(AnimalsAb_pca,claws_separate = T) plot_animals_pt_scores = plot_scores(AnimalsPt_pca, AnimalsLabsPt) + ggtitle("Animals - pt values") plot_animals_pt_loadings = plot_loadings(AnimalsPt_pca,claws_separate = T) plot_eggs_scores = plot_scores(Eggs_pca, EggsLabs) + ggtitle("Eggs") plot_eggs_loadings = plot_loadings(Eggs_pca,claws_separate = F) # Assemble the plots #(plot_animals_abs_scores/plot_animals_abs_loadings)|(plot_animals_pt_scores/plot_animals_pt_loadings)|(plot_eggs_scores/plot_eggs_loadings) (plot_animals_abs_scores|plot_animals_abs_loadings)/(plot_animals_pt_scores|plot_animals_pt_loadings)/(plot_eggs_scores|plot_eggs_loadings) ggsave("plot_PCA.pdf", height = 12.5, width = 12.5) ggsave("plot_PCA.svg", height = 12.5, width = 12.5) ## PLOT TRAITS RANGES data_plot_animals_ab = cbind(AnimalsLabsAb,AnimalsAb_traits) data_plot_animals_ab = tidyr::pivot_longer(data_plot_animals_ab, 2:ncol(data_plot_animals_ab)) ggplot(data_plot_animals_ab)+ theme_bw()+ stat_boxplot(aes(x = Species, y = value), geom ='errorbar', coef=NULL, size = 1, col = "black")+ stat_summary(fun.y = mean, geom="point",aes(x = Species, y = value, fill = Species), shape = 21, size = 6)+ facet_wrap(.~name, scales = "free")+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())+ scale_fill_manual(values = c("#F0E442", "#0072B2"))+ xlab("") + ylab("") ggsave("plot_overlap_animals_absolute.pdf", height = 15, width = 15) data_plot_animals_pt = cbind(AnimalsLabsPt,AnimalsPt_traits) data_plot_animals_pt = tidyr::pivot_longer(data_plot_animals_pt, 2:ncol(data_plot_animals_pt)) ggplot(data_plot_animals_pt)+ theme_bw()+ stat_boxplot(aes(x = Species, y = value), geom ='errorbar', coef=NULL, size = 1, col = "black")+ stat_summary(fun.y = mean, geom="point",aes(x = Species, y = value, fill = Species), shape = 21, size = 6)+ facet_wrap(.~name, scales = "free")+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())+ scale_fill_manual(values = c("#F0E442", "#0072B2"))+ xlab("") + ylab("") ggsave("plot_overlap_animals_pt.pdf", height = 15, width = 15) data_plot_egg = cbind(EggsLabs,Eggs_traits) data_plot_egg = tidyr::pivot_longer(data_plot_egg, 2:ncol(data_plot_egg)) ggplot(data_plot_egg)+ theme_bw()+ stat_boxplot(aes(x = Species, y = value), geom ='errorbar', coef=NULL, size = 1, col = "black")+ stat_summary(fun.y = mean, geom="point",aes(x = Species, y = value, fill = Species), shape = 21, size = 6)+ facet_wrap(.~name, scales = "free")+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())+ scale_fill_manual(values = c("#F0E442", "#0072B2"))+ xlab("") + ylab("") ggsave("plot_overlap_eggs.pdf", height = 15, width = 15)