as described in https://github.com/jdonhauser/ampliconTraits using usearch v11.0.667 e.g.
usearch -sinaps ASVs.fasta -db genome_size10.fasta -attr genome_size10 -tabbedout genomeclassification.txt -strand plus
e.g for genome size
usearch -usearch_global dna-sequences.fasta -db genome_size10.fasta -strand plus -id 0.5 \
-maxaccepts 8 -maxrejects 128 -top_hit_only \
-userout genome_size10_topHit.txt -userfields query+target+id
library(vegan)
library(grid)
library(reshape2)
library(egg)
library(RColorBrewer)
library(viridis)
library(ggforce)
library(sp)
library(missMDA)
library(cluster)
library(usdm)
library(MASS)
library(corrplot)
library(ecospat)
library(randomForest)
library(fitdistrplus)
library(Hmisc)
library(rgdal)
library(raster)
library(dismo)
library(maptools)
library(terra)
library(viridis)
library(MicEnvMod)
library(stars)
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] stars_0.6-4 sf_1.0-12 abind_1.4-5
## [4] MicEnvMod_0.0.0.9000 maptools_1.1-4 dismo_1.3-9
## [7] raster_3.5-29 rgdal_1.6-4 Hmisc_4.7-1
## [10] Formula_1.2-4 fitdistrplus_1.1-8 survival_3.4-0
## [13] randomForest_4.7-1.1 ecospat_3.5.1 corrplot_0.92
## [16] MASS_7.3-57 usdm_2.1-6 terra_1.7-23
## [19] cluster_2.1.4 missMDA_1.18 sp_2.0-0
## [22] ggforce_0.4.1 viridis_0.6.4 viridisLite_0.4.2
## [25] RColorBrewer_1.1-3 egg_0.4.5 ggplot2_3.4.4
## [28] gridExtra_2.3 reshape2_1.4.4 vegan_2.6-2
## [31] lattice_0.20-45 permute_0.9-7
##
## loaded via a namespace (and not attached):
## [1] backports_1.4.1 plyr_1.8.7 splines_4.1.3
## [4] digest_0.6.29 foreach_1.5.2 htmltools_0.5.5
## [7] earth_5.3.2 fansi_1.0.3 magrittr_2.0.3
## [10] checkmate_2.1.0 doParallel_1.0.17 ks_1.14.0
## [13] jpeg_0.1-9 colorspace_2.1-0 ggrepel_0.9.1
## [16] xfun_0.32 dplyr_1.0.9 jsonlite_1.8.0
## [19] iterators_1.0.14 ape_5.6-2 glue_1.6.2
## [22] polyclip_1.10-4 PresenceAbsence_1.1.11 gtable_0.3.4
## [25] emmeans_1.8.7 scales_1.2.1 mvtnorm_1.1-3
## [28] DBI_1.1.3 Rcpp_1.0.10 plotrix_3.8-2
## [31] xtable_1.8-4 htmlTable_2.4.1 units_0.8-0
## [34] flashClust_1.01-2 foreign_0.8-82 proxy_0.4-27
## [37] mclust_6.0.0 DT_0.28 htmlwidgets_1.6.2
## [40] nabor_0.5.0 mice_3.15.0 pkgconfig_2.0.3
## [43] reshape_0.8.9 farver_2.1.1 nnet_7.3-17
## [46] multcompView_0.1-9 sass_0.4.2 deldir_1.0-6
## [49] utf8_1.2.2 tidyselect_1.1.2 rlang_1.1.1
## [52] munsell_0.5.0 TeachingDemos_2.12 tools_4.1.3
## [55] cachem_1.0.6 xgboost_1.6.0.1 cli_3.3.0
## [58] generics_0.1.3 ade4_1.7-19 broom_1.0.1
## [61] evaluate_0.16 stringr_1.4.1 fastmap_1.1.0
## [64] yaml_2.3.5 maxnet_0.1.4 knitr_1.40
## [67] purrr_0.3.4 nlme_3.1-159 pracma_2.4.2
## [70] leaps_3.1 biomod2_4.2-4 compiler_4.1.3
## [73] rstudioapi_0.14 png_0.1-7 e1071_1.7-11
## [76] tibble_3.2.1 tweenr_2.0.2 bslib_0.4.0
## [79] stringi_1.7.6 plotmo_3.6.2 poibin_1.5
## [82] Matrix_1.4-1 classInt_0.4-7 gbm_2.1.8.1
## [85] vctrs_0.6.1 pillar_1.9.0 lifecycle_1.0.4
## [88] jquerylib_0.1.4 estimability_1.4.1 data.table_1.14.2
## [91] R6_2.5.1 latticeExtra_0.6-30 KernSmooth_2.23-20
## [94] codetools_0.2-18 gtools_3.9.3 assertthat_0.2.1
## [97] withr_2.5.2 mgcv_1.8-40 parallel_4.1.3
## [100] rpart_4.1.16 tidyr_1.2.0 coda_0.19-4
## [103] class_7.3-20 rmarkdown_2.16 mda_0.5-3
## [106] pROC_1.18.0 scatterplot3d_0.3-44 base64enc_0.1-3
## [109] FactoMineR_2.8 interp_1.1-3
plot.theme1 <- theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "white",
colour = "black",
size = 0.5, linetype = "solid"),
panel.border= element_rect(fill=NA,size = 0.5, linetype = 'solid',colour = "black"),
axis.text.x = element_text(size=13),axis.text.y = element_text(size=13),legend.text = element_text(size=13),
axis.title = element_text(size=14),
legend.title = element_text(color = "black", size = 14),
strip.text.x = element_text(size=14),
strip.background = element_rect(colour="black", fill="white")
)
# colors for landcover
covLeg <- read.csv("./igbp_legend.csv")
# create color vector with the colors indicated in the legend table
colLC <- c()
for (i in 1:nrow(covLeg)){
colLC[i] <- rgb(covLeg[i,'r']/360,covLeg[i,'g']/360,covLeg[i,'b']/360)
}
names(colLC) <- covLeg$Name
### Function for aggregating asvs *********************************************************************** ####
#countab: counttable,taxa are rows
#taxo=taxonomy table
#col2matchcount vector of rownames or column in counttab to which order of taxonomy table should be matched
#col2matchtax:vector of rownames or column in taxo to match
#Taxlevel: taxonomic level on which should be aggregated
#Samp: sample data
#fac: factor in Sample of which the mean should be calculated
#Summarize: should rare taxa be summarized and represented as others
#sumlevel: abundance threshold below which taxa are summarkued as others
# long: transform to long format, default TRUE
AbuBarTable <- function(countab,taxo,col2matchcount,col2matchtax,Taxlevel,Samp,fac,Summarize=F,sumlevel,long=T){
#replace NA with unclassified
taxo <- as.data.frame(apply(taxo,2,function(x){
sapply(x,function(y){ifelse(is.na(y),"unclassified",y)})
}))
tax <- taxo[match(col2matchcount,col2matchtax),]
taxabu <- aggregate(countab,list(tax[,Taxlevel]),sum)
rownames(taxabu)<- taxabu[,1]
taxabu <- taxabu[,-1]
taxabu.rel <- apply(taxabu,2,function(x)x/sum(x))
taxabu.rel <- as.data.frame(t(taxabu.rel))
taxabu.rel <- taxabu.rel[rownames(Samp),]
for(i in 1:ncol(Samp)){
Samp[,i] <- as.character(Samp[,i])
}
taxabu.rel.mean <- aggregate(taxabu.rel,list(Samp[,fac]),mean)
colnames(taxabu.rel.mean)[1] <- fac
rownames(taxabu.rel.mean) <- taxabu.rel.mean[,1]
a=c()
for (i in colnames(taxabu.rel.mean)){
a[i] <- is.numeric(taxabu.rel.mean[,i])
}
taxabu.rel.mean <- taxabu.rel.mean[,a]
if(Summarize==T){
num=taxabu.rel.mean
num.l=as.list(as.data.frame(t(num)))
num.l=lapply(num.l,function(x){
names(x)=colnames(num)
Others=sum(x[which(x<sumlevel)])
x=x[-which(x<sumlevel)]
names(Others)="Others"
x=c(x,Others)
})
taxabu.rel.mean <- as.data.frame(do.call(rbind, lapply(num.l, "[",unique(as.vector(unlist(sapply(num.l,names)))))))
colnames(taxabu.rel.mean) <- unique(as.vector(unlist(sapply(num.l,names))))
taxabu.rel.mean <- apply(taxabu.rel.mean,2,function(x){
sapply(x,function(y){ifelse(is.na(y),0,y)})
})
}
a <- as.data.frame(Samp[!duplicated(Samp[,fac]),])
if(all(rownames(taxabu.rel.mean)==a[,fac])){
taxabu.rel.mean <- cbind(taxabu.rel.mean,a)
}else{
taxabu.rel.mean <- taxabu.rel.mean[match(a[,fac],rownames(taxabu.rel.mean)),]
taxabu.rel.mean <- cbind(taxabu.rel.mean,a)
}
require(reshape2)
if (long){
taxabu.rel.mean.long <- melt(taxabu.rel.mean)
a=unique(as.character(taxabu.rel.mean.long$variable))
if("Others"%in%taxabu.rel.mean.long$variable){
taxabu.rel.mean.long$variable <- factor(taxabu.rel.mean.long$variable,levels=c(a[-(which(a=="Others"))],"Others"))
}
taxabu.rel.mean.long[,fac] <- factor(taxabu.rel.mean.long[,fac],levels=unique(taxabu.rel.mean.long[,fac]))
return(taxabu.rel.mean.long)
}else{
return(taxabu.rel.mean)
}
}
load("./traitAnnotationProperties.RData" )
# based on mean of interval
# d1_lo5 has only one value for this dataset, omit
# continuous traits
cont <- c("d1_lo10","d1_lo20","d1_lo30","d1_up10","d1_up20","d1_up30","d1_up5","d2_lo10","d2_lo20","d2_lo30",
"d2_lo5","d2_up10","d2_up20","d2_up30","d2_up5","doubling_h10","doubling_h20","doubling_h30","doubling_h40","doubling_h50",
"doubling_h5","genome_size10","genome_size20","genome_size5","optimum_ph10","optimum_ph20",
"optimum_ph5","optimum_tmp10","optimum_tmp20","rRNA16S_genes_exact_int","rRNA16S_genes10",
"rRNA16S_genes5")
traits_WA <- list()
for (i in cont){
a <- traits[[i]]
rownames(a) <- a[,1]
# replace values with unclassified if bootstrap is < 70 or identity is < 80 or NA
a$Value <- ifelse(a$Bootstrap < 70 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
# extract lower and upper boundary of interval and calculate mean
for (j in unique(a$Value)){
if (j=='unclassified'){
a[a$Value==j,'lo'] <- NA
a[a$Value==j,'up'] <- NA
}else{
a[a$Value==j,'lo'] <- gsub('-.*','', j)
a[a$Value==j,'up'] <- gsub('.*-','', j)
}
}
a$lo <- as.numeric(a$lo)
a$up <- as.numeric(a$up)
# calculate mean
a$mean <- (a$lo+a$up)/2
abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
# calculated weighted average
for (j in unique(abu$Sample.ID)){
b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
# abundance relative to sum of classified
c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
rm(b,c)
}
# dereplicate, so each sample occurs only once
abu <- abu[!duplicated(abu$Sample.ID),]
traits_WA[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
traits_WA2 <- traits_WA
for (i in names(traits_WA2)){
traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}
traits_WA_df <- do.call(cbind, traits_WA2)
colnames(traits_WA_df) <- names(traits_WA2)
# d1lo
ecospat.cor.plot(traits_WA_df[,grep('d1_lo',colnames(traits_WA_df))])
# d1_up
ecospat.cor.plot(traits_WA_df[,grep('d1_up',colnames(traits_WA_df))][,c(4,1,2,3)])
# d2_lo
ecospat.cor.plot(traits_WA_df[,grep('d2_lo',colnames(traits_WA_df))][,c(4,1,2,3)])
# d2_up
ecospat.cor.plot(traits_WA_df[,grep('d2_up',colnames(traits_WA_df))][,c(4,1,2,3)])
#doubling_h
ecospat.cor.plot(traits_WA_df[,grep('doubling_h',colnames(traits_WA_df))][,c(6,1,2,3,4,5)])
#genome_size
ecospat.cor.plot(traits_WA_df[,grep('genome_size',colnames(traits_WA_df))][,c(3,1,2)])
#optimum_ph
ecospat.cor.plot(traits_WA_df[,grep('optimum_ph',colnames(traits_WA_df))][,c(3,1,2)])
#optimum_tmp
ecospat.cor.plot(traits_WA_df[,grep('optimum_tmp',colnames(traits_WA_df))])
#rRNA16S_genes
ecospat.cor.plot(traits_WA_df[,grep('rRNA16S_genes',colnames(traits_WA_df))][,c(3,1,2)])
with and without cutoff for sequence identity ### bootstrap > 70 considered classified, no seqID cutoff
traits_WA2 <- list()
for (i in cont){
a <- traits[[i]]
rownames(a) <- a[,1]
# replace values with unclassified if bootstrap is < 70
a$Value <- ifelse(a$Bootstrap < 70, 'unclassified', a$Value)
# extract lower and upper boundary of interval and calculate mean
for (j in unique(a$Value)){
if (j=='unclassified'){
a[a$Value==j,'lo'] <- NA
a[a$Value==j,'up'] <- NA
}else{
a[a$Value==j,'lo'] <- gsub('-.*','', j)
a[a$Value==j,'up'] <- gsub('.*-','', j)
}
}
a$lo <- as.numeric(a$lo)
a$up <- as.numeric(a$up)
# calculate mean
a$mean <- (a$lo+a$up)/2
abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
# calculated weighted average
for (j in unique(abu$Sample.ID)){
b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
# abundance relative to sum of classified
c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
rm(b,c)
}
# dereplicate, so each sample occurs only once
abu <- abu[!duplicated(abu$Sample.ID),]
traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}
traits_WA_df_70 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_70) <- names(traits_WA2)
traits_WA2 <- list()
for (i in cont){
a <- traits[[i]]
rownames(a) <- a[,1]
# replace values with unclassified if bootstrap is < 60 or identity is < 80 or NA
a$Value <- ifelse(a$Bootstrap < 60 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
# extract lower and upper boundary of interval and calculate mean
for (j in unique(a$Value)){
if (j=='unclassified'){
a[a$Value==j,'lo'] <- NA
a[a$Value==j,'up'] <- NA
}else{
a[a$Value==j,'lo'] <- gsub('-.*','', j)
a[a$Value==j,'up'] <- gsub('.*-','', j)
}
}
a$lo <- as.numeric(a$lo)
a$up <- as.numeric(a$up)
# calculate mean
a$mean <- (a$lo+a$up)/2
abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
# calculated weighted average
for (j in unique(abu$Sample.ID)){
b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
# abundance relative to sum of classified
c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
rm(b,c)
}
# dereplicate, so each sample occurs only once
abu <- abu[!duplicated(abu$Sample.ID),]
traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}
traits_WA_df_60_80 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_60_80) <- names(traits_WA2)
traits_WA2 <- list()
for (i in cont){
a <- traits[[i]]
rownames(a) <- a[,1]
# replace values with unclassified if bootstrap is < 60
a$Value <- ifelse(a$Bootstrap < 60, 'unclassified', a$Value)
# extract lower and upper boundary of interval and calculate mean
for (j in unique(a$Value)){
if (j=='unclassified'){
a[a$Value==j,'lo'] <- NA
a[a$Value==j,'up'] <- NA
}else{
a[a$Value==j,'lo'] <- gsub('-.*','', j)
a[a$Value==j,'up'] <- gsub('.*-','', j)
}
}
a$lo <- as.numeric(a$lo)
a$up <- as.numeric(a$up)
# calculate mean
a$mean <- (a$lo+a$up)/2
abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
# calculated weighted average
for (j in unique(abu$Sample.ID)){
b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
# abundance relative to sum of classified
c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
rm(b,c)
}
# dereplicate, so each sample occurs only once
abu <- abu[!duplicated(abu$Sample.ID),]
traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}
traits_WA_df_60 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_60) <- names(traits_WA2)
traits_WA2 <- list()
for (i in cont){
a <- traits[[i]]
rownames(a) <- a[,1]
# replace values with unclassified if bootstrap is < 50 or identity is < 80 or NA
a$Value <- ifelse(a$Bootstrap < 50 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
# extract lower and upper boundary of interval and calculate mean
# extract lower and upper boundary of interval and calculate mean
for (j in unique(a$Value)){
if (j=='unclassified'){
a[a$Value==j,'lo'] <- NA
a[a$Value==j,'up'] <- NA
}else{
a[a$Value==j,'lo'] <- gsub('-.*','', j)
a[a$Value==j,'up'] <- gsub('.*-','', j)
}
}
a$lo <- as.numeric(a$lo)
a$up <- as.numeric(a$up)
# calculate mean
a$mean <- (a$lo+a$up)/2
abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
# calculated weighted average
for (j in unique(abu$Sample.ID)){
b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
# abundance relative to sum of classified
c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
rm(b,c)
}
# dereplicate, so each sample occurs only once
abu <- abu[!duplicated(abu$Sample.ID),]
traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}
traits_WA_df_50_80 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_50_80) <- names(traits_WA2)
traits_WA2 <- list()
for (i in cont){
a <- traits[[i]]
rownames(a) <- a[,1]
# replace values with unclassified if bootstrap is < 50
a$Value <- ifelse(a$Bootstrap < 50, 'unclassified', a$Value)
# extract lower and upper boundary of interval and calculate mean
for (j in unique(a$Value)){
if (j=='unclassified'){
a[a$Value==j,'lo'] <- NA
a[a$Value==j,'up'] <- NA
}else{
a[a$Value==j,'lo'] <- gsub('-.*','', j)
a[a$Value==j,'up'] <- gsub('.*-','', j)
}
}
a$lo <- as.numeric(a$lo)
a$up <- as.numeric(a$up)
# calculate mean
a$mean <- (a$lo+a$up)/2
abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
# calculated weighted average
for (j in unique(abu$Sample.ID)){
b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
# abundance relative to sum of classified
c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
rm(b,c)
}
# dereplicate, so each sample occurs only once
abu <- abu[!duplicated(abu$Sample.ID),]
traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}
traits_WA_df_50 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_50) <- names(traits_WA2)
### add cutoff to column names and merge tables
colnames(traits_WA_df) <- paste0(colnames(traits_WA_df),'_7080')
colnames(traits_WA_df_70) <- paste0(colnames(traits_WA_df_70),'_70')
colnames(traits_WA_df_60_80) <- paste0(colnames(traits_WA_df_60_80),'_6080')
colnames(traits_WA_df_60) <- paste0(colnames(traits_WA_df_60),'_60')
colnames(traits_WA_df_50_80) <- paste0(colnames(traits_WA_df_50_80),'_5080')
colnames(traits_WA_df_50) <- paste0(colnames(traits_WA_df_50),'_50')
# merge tables
traits_WA_df_all <- cbind(traits_WA_df,traits_WA_df_70,traits_WA_df_60_80,traits_WA_df_60,traits_WA_df_50_80,
traits_WA_df_50)
# for one of the intervals where stable results were obtained
# d1lo20
ecospat.cor.plot(traits_WA_df_all[,grep('d1_lo20',colnames(traits_WA_df_all))])
# d1lo20 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d1_lo20.*80$',colnames(traits_WA_df_all))])
# d1up30
ecospat.cor.plot(traits_WA_df_all[,grep('d1_up30',colnames(traits_WA_df_all))])
# d1up30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d1_up30.*80$',colnames(traits_WA_df_all))])
# d2lo30
ecospat.cor.plot(traits_WA_df_all[,grep('d2_lo30',colnames(traits_WA_df_all))])
# d2lo30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d2_lo30.*80$',colnames(traits_WA_df_all))])
# d2up30
ecospat.cor.plot(traits_WA_df_all[,grep('d2_up30',colnames(traits_WA_df_all))])
# d2up30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d1_up30.*80$',colnames(traits_WA_df_all))])
# doubling_h30
ecospat.cor.plot(traits_WA_df_all[,grep('doubling_h30',colnames(traits_WA_df_all))])
# doubling_h30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('doubling_h30.*80$',colnames(traits_WA_df_all))])
# genome_size10
ecospat.cor.plot(traits_WA_df_all[,grep('genome_size10',colnames(traits_WA_df_all))])
# genome_size10 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('genome_size10.*80$',colnames(traits_WA_df_all))])
# optimum_ph20
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_ph20',colnames(traits_WA_df_all))])
# optimum_ph20 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_ph20.*80$',colnames(traits_WA_df_all))])
# optimum_tmp20
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_tmp20',colnames(traits_WA_df_all))])
# optimum_tmp20 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_tmp20.*80$',colnames(traits_WA_df_all))])
# rRNA16S_genes exact
ecospat.cor.plot(traits_WA_df_all[,grep('rRNA16S_genes_exact_int',colnames(traits_WA_df_all))])
# rRNA16S_genes exact - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('rRNA16S_genes_exact_int.*80$',colnames(traits_WA_df_all))])
# aggregate abundances of trait levels
sam <- sam[,c(1:2,4)] # subset to non-numeric
traits_abu <- list()
for (i in names(traits)){
a <- traits[[i]]
rownames(a) <- a[,1]
# replace values with unclassified if bootstrap is < 70 or identity is < 80 or NA
a$Value <- ifelse(a$Bootstrap < 70 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "Value", sam, "Sample.ID")
traits_abu[[i]] <- abu
}
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
# add name of trait
for (i in names(traits_abu)){
traits_abu[[i]]$trait <- i
}
traits_abu_df <- do.call(rbind, traits_abu)
# subset to unclassified
traits_abu_df <- traits_abu_df[traits_abu_df$variable=='unclassified',]
# subset with continuous traits
traits_abu_df_cont <- traits_abu_df[grep('[0-9]$|_int',traits_abu_df$trait),]
# add column with interval and trait without number
traits_abu_df_cont$trait2 <- gsub('[0-9]*$','',traits_abu_df_cont$trait)
traits_abu_df_cont$intervals <- gsub('.*_','',traits_abu_df_cont$trait)
traits_abu_df_cont$intervals <- gsub('[a-z]*','',traits_abu_df_cont$intervals)
traits_abu_df_cont[traits_abu_df_cont$trait=="rRNA16S_genes_exact_int",'intervals'] <- 'exact'
traits_abu_df_cont[traits_abu_df_cont$trait2=="rRNA16S_genes_exact_int",'trait2'] <- 'rRNA16S_genes'
# order intervals
traits_abu_df_cont$intervals <- factor(traits_abu_df_cont$intervals, levels = c("5", "10", "20", "30", "40", "50", "exact"))
# boxplot for continuous variables as a function of interval
ggplot(traits_abu_df_cont, aes(x = intervals, y = value)) + geom_boxplot(outlier.size = 0.3) +
facet_grid(~trait2, scales = 'free', space = 'free') + plot.theme1 +
geom_jitter(size = 0.3, alpha = 0.3, color = 'blue') +
labs(y = '% unclassified', x = 'number of intervals') +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
# subset with categorigal traits
traits_abu_df_cat <- traits_abu_df[-(grep('[0-9]$|exact',traits_abu_df$trait)),]
# boxplot for categorical variables
ggplot(traits_abu_df_cat, aes(x = trait, y = value)) + geom_boxplot(outlier.size = 0.3) +
plot.theme1 +
geom_jitter(size = 0.3, alpha = 0.3, color = 'blue') +
labs(y = '% unclassified', x = 'Trait') +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
traits_df <- do.call(rbind,traits)
# subset with continuous traits
traits_df_cont <- traits_df[grep('[0-9]$|_int',traits_df$Trait),]
# add column with interval and trait without number
traits_df_cont$trait2 <- gsub('[0-9]*$','',traits_df_cont$Trait)
traits_df_cont$intervals <- gsub('.*_','',traits_df_cont$Trait)
traits_df_cont$intervals <- gsub('[a-z]*','',traits_df_cont$intervals)
traits_df_cont[traits_df_cont$Trait=="rRNA16S_genes_exact_int",'intervals'] <- 'exact'
traits_df_cont[traits_df_cont$trait2=="rRNA16S_genes_exact_int",'trait2'] <- 'rRNA16S_genes'
# order intervals
traits_df_cont$intervals <- factor(traits_df_cont$intervals, levels = c("5", "10", "20", "30", "40", "50", "exact"))
# boxplot for continuous variables as a function of interval
ggplot(traits_df_cont, aes(x = intervals, y = Bootstrap)) + geom_boxplot(outlier.size = 0.3) +
facet_grid(~trait2, scales = 'free', space = 'free') + plot.theme1 +
# geom_jitter(size = 0.3, alpha = 0.1, color = 'blue') +
labs(y = 'Bootstrap value', x = 'number of intervals') +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
# for categorical traits
traits_df_cat <- traits_df[-(grep('[0-9]$|exact',traits_df$Trait)),]
ggplot(traits_df_cat, aes(x = Trait, y = Bootstrap)) + geom_boxplot(outlier.size = 0.3) +
plot.theme1 +
# geom_jitter(size = 0.3, alpha = 0.3, color = 'blue') +
labs(y = '% unclassified', x = 'Trait') +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
#retain only one interval for each continuous trait (seq ID with tophit should be the same for different intervals)
traits_df2 <- traits_df[traits_df$Trait%in%c("cell_shape","d1_lo10", "d1_up10","d2_lo10","d2_up10", "doubling_h10", "genome_size10",
"gram_stain", "metabolism", "motility", "optimum_ph10","optimum_tmp10",
"range_salinity", "range_tmp", "rRNA16S_genes_exact", "sporulation"),]
ggplot(traits_df2, aes(x = Trait, y = topHit_ID)) + geom_boxplot(outlier.size = 0.3) +
plot.theme1 +
# geom_jitter(size = 0.3, alpha = 0.1, color = 'blue') +
labs(y = '% sequence identity top hit', x = 'number of intervals') +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
# for each sample and trait: average of ASVs that occur in this sample (not weighted by abundance)
meanseqID <- data.frame()
for (i in unique(traits_df2$Trait)){
a <- traits_df2[traits_df2$Trait==i,]
for(j in colnames(asv)){
asvList <- rownames(asv[asv[,j]!=0,])
meanseqID[j,i] <- mean(a[a$ASV%in%asvList,'topHit_ID'],na.rm = T)
}
}
rm(asvList,a)
meanseqID <- melt(meanseqID)
## No id variables; using all as measure variables
ggplot(meanseqID, aes(x=variable,y=value)) + geom_boxplot(outlier.size = 0.3) +
plot.theme1 +
geom_jitter(size = 0.3, alpha = 0.1, color = 'blue') +
labs(y = 'mean % sequence identity \n top hit', x = 'number of intervals') +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
### bootstrap values as function of sequence identity for each trait
ggplot(traits_df, aes(x = topHit_ID, y = Bootstrap)) + geom_point(alpha = 0.2, size = 0.8) +
geom_smooth(formula = y ~ s(x)) +
plot.theme1 +
facet_wrap(~Trait)
## `geom_smooth()` using method = 'gam'
# violin plots for different intervals of sequence identity (in scatterplots it is hard to see patterns)
# 10bins of equal length => group by interval
traits_df$topHit_ID_int <- cut_interval(traits_df$topHit_ID, 10)
# reorder levels of trait
traits_df$Trait <- factor(traits_df$Trait, levels = c("cell_shape", "d1_lo5", "d1_lo10","d1_lo20","d1_lo30",
"d1_up5","d1_up10","d1_up20", "d1_up30",
"d2_lo5", "d2_lo10","d2_lo20", "d2_lo30",
"d2_up5", "d2_up10", "d2_up20", "d2_up30",
"doubling_h5", "doubling_h10", "doubling_h20","doubling_h30","doubling_h40", "doubling_h50",
"genome_size5", "genome_size10" , "genome_size20", "gram_stain", "metabolism","motility" ,
"optimum_ph5","optimum_ph10", "optimum_ph20","optimum_tmp10", "optimum_tmp20",
"range_salinity", "range_tmp",
"rRNA16S_genes5","rRNA16S_genes10", "rRNA16S_genes_exact_int", "rRNA16S_genes_exact", "sporulation"))
ggplot(traits_df, aes(x = topHit_ID, y = Bootstrap, group = topHit_ID_int)) + geom_violin(scale = "width", fill = "lightblue") +
plot.theme1 +
facet_wrap(~Trait)
final predictor tables provided (after imputing NAs, aggregating means of replicates and selection of non-colinear predictors); code for imputing NAs and selecting non-colinear variables shown for illustration
# code used for imputing missing values (NOT RUN HERE); samNum is table of numerical predictor variables
# estimate number of components
nb <- estim_ncpPCA(samNum, ncp.min=0, ncp.max=10, scale = TRUE)
# estimate values
samNumEst <- imputePCA(samNum, ncp=nb$ncp)
# extract table with values including estimates for NA
samNumEst <- samNumEst$completeObs
# select non-colinear predictors; (NOT RUN HERE); samNumEst.mean is table of numerical predictor variables
vifstep(samNumEst.mean)
# import data for prredictor and response variables
load("modelGenomeSize.RData")
vif(pred[,-15]) # all < 10
# make landcover factor
pred$landcover <- factor(pred$landcover, levels = rev(unique(pred$landcover)))
predDB$landcover <- factor(predDB$landcover, levels = rev(unique(predDB$landcover)))
# plot predictor variables
# continuous variables in long format to plot as boxplot
pred.long.cont <- melt(pred[,-15])
## No id variables; using all as measure variables
p1 <- ggplot(pred.long.cont, aes(x = variable, y = value)) + geom_boxplot() +
facet_wrap(~variable, scales = 'free', ncol = 8) + plot.theme1 +
geom_jitter( alpha = 0.3, color = 'blue') +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
p1 <- set_panel_size(p1, width = unit(0.5, "inch"), height = unit(2.7, "inch"))
# frequency for landcover
p2 <- ggplot(pred, aes(x=landcover, fill = landcover)) +
geom_histogram(stat = "count") +
stat_count(aes(y=..count.., label=..count..), geom="text", vjust=-.5) +
scale_fill_manual(values = colLC) +
plot.theme1 +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
p2 <- set_panel_size(p2, width = unit(3.5, "inch"), height = unit(2.8, "inch"))
grid.arrange(p1,p2, ncol=2)
# range of predictors
apply(pred,2,function(x){min(x)})
## Soil.temperature Water.activity Litter
## " 3.200" "0.4656000" " 0.0000"
## Litter.TC Soil.C.N pH.1.2
## "23.22000" "10.00800" "2.620"
## Silt Clay MAP
## " 0.75000" " 0.85000" " 45.34242"
## BIO5.TmaxHottestMonth BIO7.TannualRange BIO15.precSeasonality
## " 2.786956" "18.08999" "10.88828"
## WHC SOM.COMB landcover
## "-2.000000" " 0.34400" "Barren"
## Litter.C.N
## " 28.70625"
apply(pred,2,function(x){max(x)})
## Soil.temperature Water.activity Litter
## "45.144" "1.0138000" "1348.3700"
## Litter.TC Soil.C.N pH.1.2
## "50.35000" "89.99400" "8.116"
## Silt Clay MAP
## "57.40000" "40.25000" "1635.92255"
## BIO5.TmaxHottestMonth BIO7.TannualRange BIO15.precSeasonality
## "35.900002" "38.81161" "86.91499"
## WHC SOM.COMB landcover
## "23.785325" "59.11500" "Woody Savannas"
## Litter.C.N
## "133.90957"
#find best distribution to model indices
#vector with distributions to test
dist=c("norm", "pois", "exp", "gamma", "nbinom", "geom", "unif", "logis")
ModLogl <- list()
ModAIC <- list()
ModBIC <- list()
for (i in dist){
t <- fitdist(traits_WA_df[,'genome_size10'], i, method="mme")
# Log likelihood
ModLogl[[i]] <- t$loglik
# Akaike information criterion
ModAIC[[i]] <- t$aic
# Bic or SBC (Schwarz Bayesian criterion)
ModBIC[[i]] <- t$bic
rm(t)
}
# plots
t <- list()
for (i in dist){
t[[i]] <- fitdist(traits_WA_df$genome_size10, i, method="mme")
}
par(mfrow = c(2, 2))
plot.legend <- c(names(t)[grep("[0-9]",ModAIC)])
denscomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend, main = 'genome_size10')
qqcomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend)
cdfcomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend)
ppcomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend)
#create data.frames from lists
ModLogl <- as.data.frame(do.call(cbind,ModLogl))
ModAIC <- as.data.frame(do.call(cbind,ModAIC))
ModBIC <- as.data.frame(do.call(cbind,ModBIC))
ModDist <- rbind(ModLogl,ModAIC,ModBIC)
rownames(ModDist) <- c('Logl','AIC','BIC')
rm(ModLogl,ModAIC,ModBIC,t)
# normal gives best fit
set.seed(7)
all(rownames(pred)==rownames(traits_WA_df))
## [1] TRUE
rf1 <- randomForest(x = pred, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)
# Variance importance
p <- varImpPlotgg(rf1)
grid.arrange(p)
vi_rf1 <- varImpPlot(rf1, sort = T)
vi_rf1 <- vi_rf1[order(vi_rf1[,1],decreasing = T),]
# monovariate response plots
respM <- respMono(mod = rf1, pred = pred, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 4)
rm(respM)
# model performance: repeated split sampling
cv <- crossVal(pred = pred, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf1 <-cv$summary
set.seed(7)
full <- glm(traits_WA_df[,'genome_size10'] ~., data = pred, family = gaussian)
null <- glm(traits_WA_df[,'genome_size10'] ~1, data = pred, family = gaussian)
glmStep1 <- stepAIC(full, scope = list(upper = full, lower = null), direction = 'both')
## Start: AIC=2065.45
## traits_WA_df[, "genome_size10"] ~ Soil.temperature + Water.activity +
## Litter + Litter.TC + Soil.C.N + pH.1.2 + Silt + Clay + MAP +
## BIO5.TmaxHottestMonth + BIO7.TannualRange + BIO15.precSeasonality +
## WHC + SOM.COMB + landcover + Litter.C.N
##
## Df Deviance AIC
## - Soil.temperature 1 1.2721e+13 2063.5
## - MAP 1 1.2722e+13 2063.5
## - pH.1.2 1 1.2726e+13 2063.5
## - Litter.C.N 1 1.2726e+13 2063.5
## - BIO5.TmaxHottestMonth 1 1.2735e+13 2063.5
## - BIO7.TannualRange 1 1.2764e+13 2063.7
## - SOM.COMB 1 1.2766e+13 2063.7
## - Water.activity 1 1.2778e+13 2063.8
## - Litter.TC 1 1.2868e+13 2064.3
## - Clay 1 1.3009e+13 2065.0
## <none> 1.2721e+13 2065.4
## - Litter 1 1.3720e+13 2068.8
## - Silt 1 1.3767e+13 2069.0
## - Soil.C.N 1 1.3795e+13 2069.1
## - WHC 1 1.4283e+13 2071.6
## - BIO15.precSeasonality 1 1.4737e+13 2073.8
## - landcover 9 2.0384e+13 2080.5
##
## Step: AIC=2063.46
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC +
## Soil.C.N + pH.1.2 + Silt + Clay + MAP + BIO5.TmaxHottestMonth +
## BIO7.TannualRange + BIO15.precSeasonality + WHC + SOM.COMB +
## landcover + Litter.C.N
##
## Df Deviance AIC
## - MAP 1 1.2723e+13 2061.5
## - pH.1.2 1 1.2726e+13 2061.5
## - Litter.C.N 1 1.2726e+13 2061.5
## - BIO5.TmaxHottestMonth 1 1.2742e+13 2061.6
## - SOM.COMB 1 1.2772e+13 2061.7
## - BIO7.TannualRange 1 1.2774e+13 2061.8
## - Water.activity 1 1.2783e+13 2061.8
## - Litter.TC 1 1.2868e+13 2062.3
## - Clay 1 1.3012e+13 2063.0
## <none> 1.2721e+13 2063.5
## + Soil.temperature 1 1.2721e+13 2065.4
## - Litter 1 1.3721e+13 2066.8
## - Silt 1 1.3770e+13 2067.0
## - Soil.C.N 1 1.3947e+13 2067.9
## - WHC 1 1.4320e+13 2069.7
## - BIO15.precSeasonality 1 1.4785e+13 2072.0
## - landcover 9 2.0460e+13 2078.7
##
## Step: AIC=2061.47
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC +
## Soil.C.N + pH.1.2 + Silt + Clay + BIO5.TmaxHottestMonth +
## BIO7.TannualRange + BIO15.precSeasonality + WHC + SOM.COMB +
## landcover + Litter.C.N
##
## Df Deviance AIC
## - pH.1.2 1 1.2727e+13 2059.5
## - Litter.C.N 1 1.2728e+13 2059.5
## - BIO5.TmaxHottestMonth 1 1.2742e+13 2059.6
## - SOM.COMB 1 1.2774e+13 2059.8
## - Water.activity 1 1.2783e+13 2059.8
## - BIO7.TannualRange 1 1.2791e+13 2059.8
## - Litter.TC 1 1.2870e+13 2060.3
## - Clay 1 1.3050e+13 2061.2
## <none> 1.2723e+13 2061.5
## + MAP 1 1.2721e+13 2063.5
## + Soil.temperature 1 1.2722e+13 2063.5
## - Silt 1 1.3800e+13 2065.2
## - Litter 1 1.3808e+13 2065.2
## - Soil.C.N 1 1.4100e+13 2066.7
## - WHC 1 1.4335e+13 2067.8
## - BIO15.precSeasonality 1 1.4798e+13 2070.0
## - landcover 9 2.0929e+13 2078.3
##
## Step: AIC=2059.49
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC +
## Soil.C.N + Silt + Clay + BIO5.TmaxHottestMonth + BIO7.TannualRange +
## BIO15.precSeasonality + WHC + SOM.COMB + landcover + Litter.C.N
##
## Df Deviance AIC
## - Litter.C.N 1 1.2734e+13 2057.5
## - BIO5.TmaxHottestMonth 1 1.2746e+13 2057.6
## - SOM.COMB 1 1.2775e+13 2057.8
## - Water.activity 1 1.2784e+13 2057.8
## - BIO7.TannualRange 1 1.2799e+13 2057.9
## - Litter.TC 1 1.2890e+13 2058.4
## - Clay 1 1.3088e+13 2059.4
## <none> 1.2727e+13 2059.5
## + pH.1.2 1 1.2723e+13 2061.5
## + MAP 1 1.2726e+13 2061.5
## + Soil.temperature 1 1.2727e+13 2061.5
## - Litter 1 1.3808e+13 2063.2
## - Silt 1 1.3873e+13 2063.5
## - Soil.C.N 1 1.4239e+13 2065.3
## - WHC 1 1.4338e+13 2065.8
## - BIO15.precSeasonality 1 1.4867e+13 2068.4
## - landcover 9 2.4203e+13 2086.5
##
## Step: AIC=2057.52
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC +
## Soil.C.N + Silt + Clay + BIO5.TmaxHottestMonth + BIO7.TannualRange +
## BIO15.precSeasonality + WHC + SOM.COMB + landcover
##
## Df Deviance AIC
## - BIO5.TmaxHottestMonth 1 1.2751e+13 2055.6
## - SOM.COMB 1 1.2776e+13 2055.8
## - Water.activity 1 1.2795e+13 2055.9
## - BIO7.TannualRange 1 1.2806e+13 2055.9
## - Litter.TC 1 1.2891e+13 2056.4
## <none> 1.2734e+13 2057.5
## - Clay 1 1.3123e+13 2057.6
## + Litter.C.N 1 1.2727e+13 2059.5
## + pH.1.2 1 1.2728e+13 2059.5
## + MAP 1 1.2732e+13 2059.5
## + Soil.temperature 1 1.2733e+13 2059.5
## - Litter 1 1.3809e+13 2061.2
## - Silt 1 1.3875e+13 2061.5
## - Soil.C.N 1 1.4254e+13 2063.4
## - WHC 1 1.4338e+13 2063.8
## - BIO15.precSeasonality 1 1.4868e+13 2066.4
## - landcover 9 2.4205e+13 2084.5
##
## Step: AIC=2055.62
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC +
## Soil.C.N + Silt + Clay + BIO7.TannualRange + BIO15.precSeasonality +
## WHC + SOM.COMB + landcover
##
## Df Deviance AIC
## - SOM.COMB 1 1.2785e+13 2053.8
## - Water.activity 1 1.2796e+13 2053.9
## - BIO7.TannualRange 1 1.2812e+13 2053.9
## - Litter.TC 1 1.2907e+13 2054.5
## <none> 1.2751e+13 2055.6
## - Clay 1 1.3125e+13 2055.7
## + BIO5.TmaxHottestMonth 1 1.2734e+13 2057.5
## + Soil.temperature 1 1.2743e+13 2057.6
## + pH.1.2 1 1.2746e+13 2057.6
## + Litter.C.N 1 1.2746e+13 2057.6
## + MAP 1 1.2750e+13 2057.6
## - Litter 1 1.3809e+13 2059.2
## - Silt 1 1.3920e+13 2059.8
## - WHC 1 1.4453e+13 2062.4
## - Soil.C.N 1 1.4680e+13 2063.5
## - BIO15.precSeasonality 1 1.5690e+13 2068.1
## - landcover 9 2.4235e+13 2082.6
##
## Step: AIC=2053.81
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC +
## Soil.C.N + Silt + Clay + BIO7.TannualRange + BIO15.precSeasonality +
## WHC + landcover
##
## Df Deviance AIC
## - Water.activity 1 1.2845e+13 2052.1
## - BIO7.TannualRange 1 1.2847e+13 2052.2
## - Litter.TC 1 1.2953e+13 2052.7
## - Clay 1 1.3126e+13 2053.7
## <none> 1.2785e+13 2053.8
## + SOM.COMB 1 1.2751e+13 2055.6
## + BIO5.TmaxHottestMonth 1 1.2776e+13 2055.8
## + Litter.C.N 1 1.2784e+13 2055.8
## + MAP 1 1.2785e+13 2055.8
## + pH.1.2 1 1.2785e+13 2055.8
## + Soil.temperature 1 1.2785e+13 2055.8
## - Litter 1 1.3891e+13 2057.6
## - Silt 1 1.4019e+13 2058.3
## - WHC 1 1.4546e+13 2060.8
## - Soil.C.N 1 1.4701e+13 2061.6
## - BIO15.precSeasonality 1 1.5704e+13 2066.2
## - landcover 9 2.4837e+13 2082.3
##
## Step: AIC=2052.13
## traits_WA_df[, "genome_size10"] ~ Litter + Litter.TC + Soil.C.N +
## Silt + Clay + BIO7.TannualRange + BIO15.precSeasonality +
## WHC + landcover
##
## Df Deviance AIC
## - BIO7.TannualRange 1 1.2880e+13 2050.3
## - Litter.TC 1 1.3020e+13 2051.1
## - Clay 1 1.3131e+13 2051.7
## <none> 1.2845e+13 2052.1
## + Water.activity 1 1.2785e+13 2053.8
## + SOM.COMB 1 1.2796e+13 2053.9
## + Soil.temperature 1 1.2824e+13 2054.0
## + pH.1.2 1 1.2837e+13 2054.1
## + BIO5.TmaxHottestMonth 1 1.2838e+13 2054.1
## + Litter.C.N 1 1.2839e+13 2054.1
## + MAP 1 1.2844e+13 2054.1
## - Silt 1 1.4048e+13 2056.4
## - Litter 1 1.4367e+13 2058.0
## - WHC 1 1.4561e+13 2058.9
## - Soil.C.N 1 1.4704e+13 2059.6
## - BIO15.precSeasonality 1 1.6042e+13 2065.7
## - landcover 9 2.4951e+13 2080.6
##
## Step: AIC=2050.32
## traits_WA_df[, "genome_size10"] ~ Litter + Litter.TC + Soil.C.N +
## Silt + Clay + BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## - Litter.TC 1 1.3025e+13 2049.1
## - Clay 1 1.3149e+13 2049.8
## <none> 1.2880e+13 2050.3
## + SOM.COMB 1 1.2834e+13 2052.1
## + BIO7.TannualRange 1 1.2845e+13 2052.1
## + Water.activity 1 1.2847e+13 2052.2
## + Soil.temperature 1 1.2852e+13 2052.2
## + BIO5.TmaxHottestMonth 1 1.2873e+13 2052.3
## + Litter.C.N 1 1.2875e+13 2052.3
## + MAP 1 1.2877e+13 2052.3
## + pH.1.2 1 1.2878e+13 2052.3
## - Silt 1 1.4176e+13 2055.0
## - Litter 1 1.4524e+13 2056.7
## - WHC 1 1.4561e+13 2056.9
## - Soil.C.N 1 1.4830e+13 2058.2
## - BIO15.precSeasonality 1 1.6044e+13 2063.7
## - landcover 9 2.4956e+13 2078.6
##
## Step: AIC=2049.11
## traits_WA_df[, "genome_size10"] ~ Litter + Soil.C.N + Silt +
## Clay + BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## - Clay 1 1.3266e+13 2048.4
## <none> 1.3025e+13 2049.1
## + Litter.TC 1 1.2880e+13 2050.3
## + SOM.COMB 1 1.2965e+13 2050.8
## + Water.activity 1 1.2974e+13 2050.8
## + Soil.temperature 1 1.3004e+13 2051.0
## + BIO5.TmaxHottestMonth 1 1.3015e+13 2051.1
## + BIO7.TannualRange 1 1.3020e+13 2051.1
## + Litter.C.N 1 1.3023e+13 2051.1
## + pH.1.2 1 1.3025e+13 2051.1
## + MAP 1 1.3025e+13 2051.1
## - Silt 1 1.4314e+13 2053.7
## - WHC 1 1.4569e+13 2054.9
## - Litter 1 1.4808e+13 2056.1
## - Soil.C.N 1 1.5052e+13 2057.2
## - BIO15.precSeasonality 1 1.6044e+13 2061.7
## - landcover 9 2.5041e+13 2076.9
##
## Step: AIC=2048.39
## traits_WA_df[, "genome_size10"] ~ Litter + Soil.C.N + Silt +
## BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## <none> 1.3266e+13 2048.4
## + Clay 1 1.3025e+13 2049.1
## + Litter.TC 1 1.3149e+13 2049.8
## + MAP 1 1.3238e+13 2050.2
## + pH.1.2 1 1.3246e+13 2050.3
## + Water.activity 1 1.3259e+13 2050.4
## + Litter.C.N 1 1.3261e+13 2050.4
## + Soil.temperature 1 1.3261e+13 2050.4
## + BIO7.TannualRange 1 1.3264e+13 2050.4
## + SOM.COMB 1 1.3264e+13 2050.4
## + BIO5.TmaxHottestMonth 1 1.3265e+13 2050.4
## - WHC 1 1.4628e+13 2053.2
## - Silt 1 1.4916e+13 2054.6
## - Soil.C.N 1 1.5130e+13 2055.6
## - Litter 1 1.5469e+13 2057.2
## - BIO15.precSeasonality 1 1.6114e+13 2060.0
## - landcover 9 2.5566e+13 2076.3
# variables importance
varImp.glmstep1 <- VarImp.glm(mod = glmStep1, perm = 100, plot = T, pt = plot.theme1)
grid.arrange(varImp.glmstep1$varImpPlot)
vi_glmstep1<- varImp.glmstep1$varImpDat
# Monavariate response plot
respM <- respMono(mod = glmStep1, pred = pred, w = 2, h = 2, type = "step", scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)
rm(respM)
# repeated split sampling with final model from stepwise
cv <- crossVal(pred = glmStep1$model[,2:ncol(glmStep1$model)], resp = traits_WA_df, var = 'genome_size10', plot = F, type = "glm")
cv_glmstep1_fin <- cv$summary
# with stepwise selection for each run
cv <- crossVal.step(pred = pred, resp = traits_WA_df, var = 'genome_size10', plot = F, direction = "both")
cv_glmstep1_sel <- cv$summary
# frequency variables appear in model
Vars_glmstep1_sel <- cv$variablesFraction
# to reduce overfitting, use same number of variables as in glmstep
set.seed(7)
pred2 <- pred[,rownames(vi_rf1)[1:(ncol(glmStep1$model)-1)]]
rf2 <- randomForest(x = pred2, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)
# Variance importance
p <- varImpPlotgg(rf2)
grid.arrange(p)
vi_rf2 <- varImpPlot(rf2, sort = T)
# monovariate response plots
respM <- respMono(mod = rf2, pred = pred2, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)
rm(respM)
# model performance: repeated split sampling
cv <- crossVal(pred = pred2, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf2 <-cv$summary
# predictive accuracy
crossValSum <- as.data.frame(rbind(cv_rf1,cv_glmstep1_sel,cv_glmstep1_fin,cv_rf2))
crossValSum$modelType <- c('random forest all predictors', 'stepwise with var selection full','stepwise final model from full dataset',
'random forest most importants vars')
crossValSum
# for stepwise selection in cross validation: % each variable appears in model
Vars_glmstep1_sel <- as.data.frame(Vars_glmstep1_sel)
Vars_glmstep1_sel$Variable <- rownames(Vars_glmstep1_sel)
var_sum <- Vars_glmstep1_sel
var_sum
set.seed(7)
# random forest with all variables and glmstep
mods <- list(rf1,glmStep1)
names(mods) <- c('rf1','glmStep1')
weights <- c(crossValSum[c("cv_rf1","cv_glmstep1_fin"),'meanCor'])
names(weights) <- names(mods)
cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble1 <- cv$summary
# random forest with important variables only and glmstep
mods <- list(rf2,glmStep1)
names(mods) <- c('rf2','glmStep1')
weights <- c(crossValSum[c("cv_rf2","cv_glmstep1_fin"),'meanCor'])
names(weights) <- names(mods)
cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble2 <- cv$summary
# combine output from cross validation with different ensembles
crossValEns <- cbind(cv_ensemble1,cv_ensemble2)
crossValEns <- as.data.frame(t(crossValEns))
crossValEns$description <- c(
'rf with all varaibles and glmstep',
'rf with most important variables and glmstep')
crossValEns
# combine variable importance
# for rfs
vi_rf1 <- as.data.frame(vi_rf1)
vi_rf2 <- as.data.frame(vi_rf2)
vi_rf1[,'Variable']<- rownames(vi_rf1)
vi_rf2[,'Variable'] <- rownames(vi_rf2)
vi <- merge(vi_rf1,vi_rf2, by = 'Variable', all = T)
colnames(vi)[c(2,4)] <- c('rf1','rf2')
vi <- vi[,-grep('Node',colnames(vi))] # remove columns with node impurity
# glmstep
vi <- merge(vi,vi_glmstep1, by = 'Variable', all = T)
colnames(vi)[4:5] <- c('glmstep1.mean','glmstep1.sd') # sd from permutations when calculating variable importance
# mean and sd for random forest sets4 and 6
vi[,'rf1_2_mean'] <- apply(vi[,c('rf1','rf2')],1,function(x)mean(x,na.rm = T))
vi[,'rf1_2_sd'] <- apply(vi[,c('rf1','rf2')],1,function(x)sd(x,na.rm = T))
# show number of variables that as selected by glmstep
a <- vi[order(vi$rf1, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf2))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p1 <- ggplot(a, aes(x = rf1, y = Variable)) + geom_col() +
xlab('%IncMSE') + plot.theme1 + ggtitle('rf1')
p1 <- set_panel_size(p1, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))
a <- vi[order(vi$rf2, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf2))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p2 <- ggplot(a, aes(x = rf2, y = Variable)) + geom_col() +
xlab('%IncMSE') + plot.theme1 + ggtitle('rf2')
p2 <- set_panel_size(p2, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))
a <- vi[order(vi$rf1_2_mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf2))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p3 <- ggplot(a, aes(x = rf1_2_mean, y = Variable)) + geom_col() +
geom_errorbar(aes(xmin = rf1_2_mean - rf1_2_sd, xmax = rf1_2_mean + rf1_2_sd), width = 0.4, size =1) +
xlab('%IncMSE') + plot.theme1 + ggtitle('rf1, 2 mean')
p3 <- set_panel_size(p3, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))
# glmstep4 and % variable in model
a <- vi[order(vi$glmstep1.mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$glmstep1.mean))))),]# adapt rows to nonNa
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
# percentages variable in model crossval => value, 1- value
b <- data.frame(Variable = a$Variable)
for(i in a$Variable){
b[b$Variable==i,"in"] <- var_sum[var_sum$Variable==i,'Vars_glmstep1_sel']
b[b$Variable==i,"out"] <- 1-var_sum[var_sum$Variable==i,'Vars_glmstep1_sel']
}
b <- melt(b)
## Using Variable as id variables
b$variable <- factor(b$variable, levels = c("out","in"))
# main plot: var importance
p4 <- ggplot(a, aes(x = glmstep1.mean, y = Variable)) + geom_col() +
xlab('%IncMSE') + plot.theme1 + ggtitle('glmstep1')
p4 <- set_panel_size(p4, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))
# % variable in model as extra plot
p5 <- ggplot(b, aes(y=Variable,x=value,fill=variable))+geom_bar(stat="identity",color="black",width = 0.6) +
scale_fill_manual(values = c("white","black")) + plot.theme1 + ggtitle('glmstep1 % var in model')
p5 <- set_panel_size(p5, width = unit(0.4, "inch"), height = unit(nrow(a)/2, "inch"))
grid.arrange(p1,p2,p3,p4,p5,ncol=5)
set.seed(7)
all(rownames(predDB)==rownames(traits_WA_df))
## [1] TRUE
rf3 <- randomForest(x = predDB, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)
# Variance importance
p <- varImpPlotgg(rf3)
grid.arrange(p)
vi_rf3 <- varImpPlot(rf3, sort = T)
vi_rf3 <- vi_rf3[order(vi_rf3[,1],decreasing = T),]
# monovariate response plots
respM <- respMono(mod = rf3, pred = predDB, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 4)
rm(respM)
# model performance: repeated split sampling
cv <- crossVal(pred = predDB, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf3 <-cv$summary
set.seed(7)
full <- glm(traits_WA_df[,'genome_size10'] ~., data = predDB, family = gaussian)
null <- glm(traits_WA_df[,'genome_size10'] ~1, data = predDB, family = gaussian)
glmStep2 <- stepAIC(full, scope = list(upper = full, lower = null), direction = 'both')
## Start: AIC=2062.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + pH.1.2 +
## Silt + Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange +
## BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## - pH.1.2 1 1.3978e+13 2060.1
## - BIO5.TmaxHottestMonth 1 1.4046e+13 2060.4
## - BIO7.TannualRange 1 1.4123e+13 2060.8
## - MAP 1 1.4156e+13 2060.9
## - Soil.TOC 1 1.4188e+13 2061.1
## <none> 1.3978e+13 2062.1
## - Clay 1 1.4904e+13 2064.5
## - Silt 1 1.5377e+13 2066.7
## - WHC 1 1.5449e+13 2067.1
## - BIO15.precSeasonality 1 1.6634e+13 2072.2
## - Soil.C.N 1 1.7158e+13 2074.4
## - landcover 9 2.3642e+13 2080.8
##
## Step: AIC=2060.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt +
## Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange +
## BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## - BIO5.TmaxHottestMonth 1 1.4049e+13 2058.4
## - BIO7.TannualRange 1 1.4125e+13 2058.8
## - MAP 1 1.4157e+13 2058.9
## - Soil.TOC 1 1.4207e+13 2059.2
## <none> 1.3978e+13 2060.1
## + pH.1.2 1 1.3978e+13 2062.1
## - Clay 1 1.5076e+13 2063.3
## - Silt 1 1.5423e+13 2064.9
## - WHC 1 1.5453e+13 2065.1
## - BIO15.precSeasonality 1 1.6827e+13 2071.0
## - Soil.C.N 1 1.7240e+13 2072.7
## - landcover 9 2.5840e+13 2085.1
##
## Step: AIC=2058.41
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt +
## Clay + MAP + BIO7.TannualRange + BIO15.precSeasonality +
## WHC + landcover
##
## Df Deviance AIC
## - BIO7.TannualRange 1 1.4212e+13 2057.2
## - MAP 1 1.4267e+13 2057.5
## - Soil.TOC 1 1.4434e+13 2058.3
## <none> 1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth 1 1.3978e+13 2060.1
## + pH.1.2 1 1.4046e+13 2060.4
## - Clay 1 1.5228e+13 2062.1
## - Silt 1 1.5451e+13 2063.1
## - WHC 1 1.5453e+13 2063.1
## - Soil.C.N 1 1.7306e+13 2071.0
## - BIO15.precSeasonality 1 1.7555e+13 2072.0
## - landcover 9 2.6610e+13 2085.1
##
## Step: AIC=2057.21
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt +
## Clay + MAP + BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## - MAP 1 1.4323e+13 2055.8
## <none> 1.4212e+13 2057.2
## - Soil.TOC 1 1.4632e+13 2057.2
## + BIO7.TannualRange 1 1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth 1 1.4125e+13 2058.8
## + pH.1.2 1 1.4212e+13 2059.2
## - Clay 1 1.5296e+13 2060.4
## - WHC 1 1.5589e+13 2061.7
## - Silt 1 1.5730e+13 2062.3
## - Soil.C.N 1 1.7531e+13 2069.9
## - BIO15.precSeasonality 1 1.7800e+13 2071.0
## - landcover 9 2.6714e+13 2083.4
##
## Step: AIC=2055.76
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt +
## Clay + BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## <none> 1.4323e+13 2055.8
## - Soil.TOC 1 1.4808e+13 2056.1
## + BIO5.TmaxHottestMonth 1 1.4211e+13 2057.2
## + MAP 1 1.4212e+13 2057.2
## + BIO7.TannualRange 1 1.4267e+13 2057.5
## + pH.1.2 1 1.4319e+13 2057.7
## - Clay 1 1.5306e+13 2058.4
## - Silt 1 1.5750e+13 2060.4
## - WHC 1 1.5833e+13 2060.8
## - Soil.C.N 1 1.7540e+13 2067.9
## - BIO15.precSeasonality 1 1.8118e+13 2070.2
## - landcover 9 2.7288e+13 2082.9
# variables importance
varImp.glmstep2 <- VarImp.glm(mod = glmStep2, perm = 100, plot = T, pt = plot.theme1)
grid.arrange(varImp.glmstep2$varImpPlot)
vi_glmstep2 <- varImp.glmstep2$varImpDat
# Monavariate response plot
respM <- respMono(mod = glmStep2, pred = predDB, w = 2, h = 2, type = "step", scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)
rm(respM)
# repeated split sampling with final model from stepwise
cv <- crossVal(pred = glmStep2$model[,2:ncol(glmStep2$model)], resp = traits_WA_df, var = 'genome_size10', plot = F, type = "glm")
cv_glmstep2_fin <- cv$summary
# with stepwise selection for each set
cv <- crossVal.step(pred = predDB, resp = traits_WA_df, var = 'genome_size10', plot = F, direction = "both")
cv_glmstep2_sel <- cv$summary
# frequency variables appear in model
Vars_glmstep2_sel <- cv$variablesFraction
# same number of variables as in glmstep
set.seed(7)
pred4 <- predDB[,rownames(vi_rf3)[1:(ncol(glmStep2$model)-1)]]
rf4 <- randomForest(x = pred4, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)
# Variance importance
p <- varImpPlotgg(rf4)
grid.arrange(p)
vi_rf4 <- varImpPlot(rf4, sort = T)
# monovariate response plots
respM <- respMono(mod = rf4, pred = pred4, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)
rm(respM)
# model performance: repeated split sampling
cv <- crossVal(pred = pred4, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf4 <-cv$summary
# add to previous table
crossValSum <- rbind(crossValSum,cv_rf3,cv_rf4,cv_glmstep2_sel,cv_glmstep2_fin)
rownames(crossValSum)[5:8] <- c('cv_rf3','cv_rf4','cv_glmstep2_sel','cv_glmstep2_fin')
crossValSum$modelType[5:8] <- c('random forest database Vars all predictors', 'random forest database Vars all predictors most important Vars',
'stepwise with var selection database vars','stepwise final model database vars')
crossValSum
# for stepwise selection in cross validation: % each variable appears in model
Vars_glmstep2_sel <- as.data.frame(Vars_glmstep2_sel)
Vars_glmstep2_sel$Variable <- rownames(Vars_glmstep2_sel)
var_sum <- merge(var_sum, Vars_glmstep2_sel, by = 'Variable', all = T)
var_sum
set.seed(7)
# random forest all variables and glmstep
mods <- list(rf3,glmStep2)
names(mods) <- c('rf3','glmStep2')
weights <- c(crossValSum[c("cv_rf3","cv_glmstep2_fin"),'meanCor'])
names(weights) <- names(mods)
cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble3 <- cv$summary
# random forest with important variables only and glmstep
mods <- list(rf4,glmStep2)
names(mods) <- c('rf4','glmStep2')
weights <- c(crossValSum[c("cv_rf4","cv_glmstep2_fin"),'meanCor'])
names(weights) <- names(mods)
cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble4 <- cv$summary
# combine output from cross validation with different ensembles
crossValEns <- cbind(cv_ensemble1,cv_ensemble2,cv_ensemble3,cv_ensemble4)
crossValEns <- as.data.frame(t(crossValEns))
crossValEns$description <- c(
'all predictors, rf and glmstep',
'all predictors, rf with selected subset and glmstep',
'predictors in databases, rf and glmstep',
'predictors in databases, rf with selected subset and glmstep'
)
crossValEns
# combine variable importance
# for rfs
vi_rf3 <- as.data.frame(vi_rf3)
vi_rf4 <- as.data.frame(vi_rf4)
vi_rf3[,'Variable']<- rownames(vi_rf3)
vi_rf4[,'Variable'] <- rownames(vi_rf4)
vi <- merge(vi_rf3,vi_rf4, by = 'Variable', all = T)
colnames(vi)[c(2,4)] <- c('rf3','rf4')
vi <- vi[,-grep('Node',colnames(vi))] # remove columns with node impurity
# glmstep
vi <- merge(vi,vi_glmstep2, by = 'Variable', all = T)
colnames(vi)[4:5] <- c('glmstep2.mean','glmstep2.sd') # sd from permutations when calculating variable importance
# mean and sd for random forest
vi[,'rf3_4_mean'] <- apply(vi[,c('rf3','rf4')],1,function(x)mean(x,na.rm = T))
vi[,'rf3_4_sd'] <- apply(vi[,c('rf3','rf4')],1,function(x)sd(x,na.rm = T))
# number of variables that was selected by glmstep
a <- vi[order(vi$rf3, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf4))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p1 <- ggplot(a, aes(x = rf3, y = Variable)) + geom_col() +
xlab('%IncMSE') + plot.theme1 + ggtitle('rf3')
p1 <- set_panel_size(p1, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))
a <- vi[order(vi$rf4, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf4))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p2 <- ggplot(a, aes(x = rf4, y = Variable)) + geom_col() +
xlab('%IncMSE') + plot.theme1 + ggtitle('rf4')
p2 <- set_panel_size(p2, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))
a <- vi[order(vi$rf3_4_mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf4))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p3 <- ggplot(a, aes(x = rf3_4_mean, y = Variable)) + geom_col() +
geom_errorbar(aes(xmin = rf3_4_mean - rf3_4_sd, xmax = rf3_4_mean + rf3_4_sd), width = 0.4, size =1) +
xlab('%IncMSE') + plot.theme1 + ggtitle('rf3, 4 mean')
p3 <- set_panel_size(p3, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))
# glmstep and % variable in model
a <- vi[order(vi$glmstep2.mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$glmstep2.mean))))),]# adapt rows to nonNa
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
# percentages variable in model crossval
b <- data.frame(Variable = a$Variable)
for(i in a$Variable){
b[b$Variable==i,"in"] <- var_sum[var_sum$Variable==i,'Vars_glmstep2_sel']
b[b$Variable==i,"out"] <- 1-var_sum[var_sum$Variable==i,'Vars_glmstep2_sel']
}
b <- melt(b)
## Using Variable as id variables
b$variable <- factor(b$variable, levels = c("out","in"))
# main plot: var importance
p4 <- ggplot(a, aes(x = glmstep2.mean, y = Variable)) + geom_col() +
xlab('%IncMSE') + plot.theme1 + ggtitle('glmstep2')
p4 <- set_panel_size(p4, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))
# % variable in model as extra plot
p5 <- ggplot(b, aes(y=Variable,x=value,fill=variable))+geom_bar(stat="identity",color="black",width = 0.6) +
scale_fill_manual(values = c("white","black")) + plot.theme1 + ggtitle('glmstep2 % var in model')
p5 <- set_panel_size(p5, width = unit(0.4, "inch"), height = unit(nrow(a)/2, "inch"))
grid.arrange(p1,p2,p3,p4,p5,ncol=5)
# import predictor rasters
load("predictorGrids.RData")
# plot predictors
plot(gridStack, col=rainbow(100,start=.0,end=.8))
## Predictions
# recode land cover in predictor table with numeric identifier as in raster
predDB_re <- predDB
predDB_re$landcover <- as.character(predDB_re$landcover)
predDB_re$landcover <- gsub("Closed shrublands","Closed Shrublands",predDB_re$landcover) # correct misspelling
# extract numeric IDs and landcover from database (order is different compared to covleg which was used to assign
# description to numeric ID!!!)
a <- as.data.frame(levels(gridStack$landcover))
for (i in unique(predDB_re$landcover)){
predDB_re[predDB_re$landcover==i,'landcover'] <- a[a$Name==i,'ID' ]
}
predDB_re$landcover <- as.factor(predDB_re$landcover)
# model with recoded predictors
# glmstep
set.seed(7)
full <- glm(traits_WA_df[,'genome_size10'] ~., data = predDB_re, family = gaussian)
null <- glm(traits_WA_df[,'genome_size10'] ~1, data = predDB_re, family = gaussian)
glmStep2_re <- stepAIC(full, scope = list(upper = full, lower = null), direction = 'both')
## Start: AIC=2062.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + pH.1.2 +
## Silt + Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange +
## BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## - pH.1.2 1 1.3978e+13 2060.1
## - BIO5.TmaxHottestMonth 1 1.4046e+13 2060.4
## - BIO7.TannualRange 1 1.4123e+13 2060.8
## - MAP 1 1.4156e+13 2060.9
## - Soil.TOC 1 1.4188e+13 2061.1
## <none> 1.3978e+13 2062.1
## - Clay 1 1.4904e+13 2064.5
## - Silt 1 1.5377e+13 2066.7
## - WHC 1 1.5449e+13 2067.1
## - BIO15.precSeasonality 1 1.6634e+13 2072.2
## - Soil.C.N 1 1.7158e+13 2074.4
## - landcover 9 2.3642e+13 2080.8
##
## Step: AIC=2060.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt +
## Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange +
## BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## - BIO5.TmaxHottestMonth 1 1.4049e+13 2058.4
## - BIO7.TannualRange 1 1.4125e+13 2058.8
## - MAP 1 1.4157e+13 2058.9
## - Soil.TOC 1 1.4207e+13 2059.2
## <none> 1.3978e+13 2060.1
## + pH.1.2 1 1.3978e+13 2062.1
## - Clay 1 1.5076e+13 2063.3
## - Silt 1 1.5423e+13 2064.9
## - WHC 1 1.5453e+13 2065.1
## - BIO15.precSeasonality 1 1.6827e+13 2071.0
## - Soil.C.N 1 1.7240e+13 2072.7
## - landcover 9 2.5840e+13 2085.1
##
## Step: AIC=2058.41
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt +
## Clay + MAP + BIO7.TannualRange + BIO15.precSeasonality +
## WHC + landcover
##
## Df Deviance AIC
## - BIO7.TannualRange 1 1.4212e+13 2057.2
## - MAP 1 1.4267e+13 2057.5
## - Soil.TOC 1 1.4434e+13 2058.3
## <none> 1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth 1 1.3978e+13 2060.1
## + pH.1.2 1 1.4046e+13 2060.4
## - Clay 1 1.5228e+13 2062.1
## - Silt 1 1.5451e+13 2063.1
## - WHC 1 1.5453e+13 2063.1
## - Soil.C.N 1 1.7306e+13 2071.0
## - BIO15.precSeasonality 1 1.7555e+13 2072.0
## - landcover 9 2.6610e+13 2085.1
##
## Step: AIC=2057.21
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt +
## Clay + MAP + BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## - MAP 1 1.4323e+13 2055.8
## <none> 1.4212e+13 2057.2
## - Soil.TOC 1 1.4632e+13 2057.2
## + BIO7.TannualRange 1 1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth 1 1.4125e+13 2058.8
## + pH.1.2 1 1.4212e+13 2059.2
## - Clay 1 1.5296e+13 2060.4
## - WHC 1 1.5589e+13 2061.7
## - Silt 1 1.5730e+13 2062.3
## - Soil.C.N 1 1.7531e+13 2069.9
## - BIO15.precSeasonality 1 1.7800e+13 2071.0
## - landcover 9 2.6714e+13 2083.4
##
## Step: AIC=2055.76
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt +
## Clay + BIO15.precSeasonality + WHC + landcover
##
## Df Deviance AIC
## <none> 1.4323e+13 2055.8
## - Soil.TOC 1 1.4808e+13 2056.1
## + BIO5.TmaxHottestMonth 1 1.4211e+13 2057.2
## + MAP 1 1.4212e+13 2057.2
## + BIO7.TannualRange 1 1.4267e+13 2057.5
## + pH.1.2 1 1.4319e+13 2057.7
## - Clay 1 1.5306e+13 2058.4
## - Silt 1 1.5750e+13 2060.4
## - WHC 1 1.5833e+13 2060.8
## - Soil.C.N 1 1.7540e+13 2067.9
## - BIO15.precSeasonality 1 1.8118e+13 2070.2
## - landcover 9 2.7288e+13 2082.9
gs_glmstep2 <- predict(gridStack,glmStep2_re)
# plot
# convert to stars
gs_glmstep2_st <- st_as_stars(gs_glmstep2)
# same scale for all predictions
p2 <- ggplot() + geom_stars(data = gs_glmstep2_st, aes(x = x, y = y, fill = layer),
na.action = na.omit) +
labs(title = "stepwise glm", x = "Longitude", y = "Latitude") +
scale_fill_viridis(option = "magma", name = "Genome size", limits = c(4000000,8500000), direction = -1) +
theme_minimal()
p2 <- set_panel_size(p2, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))
# rf with most important variables
set.seed(7)
pred4_re <- predDB_re[,rownames(vi_rf3)[1:(ncol(glmStep2_re$model)-1)]]
rf4_re <- randomForest(x = pred4_re, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)
gs_rf4 <- predict(gridStack,rf4_re)
# convert to stars
gs_rf4_st <- st_as_stars(gs_rf4)
# same scale for all predictions
p3 <- ggplot() + geom_stars(data = gs_rf4_st, aes(x = x, y = y, fill = layer),
na.action = na.omit) +
labs(title = "random forest most important variables", x = "Longitude", y = "Latitude") +
scale_fill_viridis(option = "magma", name = "Genome size", limits = c(4000000,8500000), direction = -1) +
theme_minimal()
p3 <- set_panel_size(p3, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))
# ensemble predictions of random forest and glmstep
weights <- c(crossValSum[c("cv_rf4","cv_glmstep2_fin"),'meanCor'])
# stack rasters from which ensemble prediction (weighted mean) should be calculated
predStack <- stack(gs_rf4,gs_glmstep2)
gs_ens_rf4_glmstep2 <- weighted.mean(predStack,weights)
# plot
# convert to stars
gs_ens_rf4_glmstep2_st <- st_as_stars(gs_ens_rf4_glmstep2)
# same scale for all predictions
p4 <- ggplot() + geom_stars(data = gs_ens_rf4_glmstep2_st, aes(x = x, y = y, fill = layer),
na.action = na.omit) +
labs(title = "ensemble random forets - stepwise glm", x = "Longitude", y = "Latitude") +
scale_fill_viridis(option = "magma", name = "Genome size", limits = c(4000000,8500000), direction = -1) +
theme_minimal()
p4 <- set_panel_size(p4, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))
# sd between models in ensemble
gs_ens_rf4_glmstep2_sd <- calc(predStack, fun = sd)
# plot
# convert to stars
gs_ens_rf4_glmstep2_sd_st <- st_as_stars(gs_ens_rf4_glmstep2_sd)
# all values
p5 <- ggplot() + geom_stars(data = gs_ens_rf4_glmstep2_sd_st, aes(x = x, y = y, fill = layer),
na.action = na.omit) +
labs(title = "sd prediction random forest - stepwise glm", x = "Longitude", y = "Latitude") +
scale_fill_viridis(option = "magma", name = "Genome size", direction = -1) +
theme_minimal()
p5 <- set_panel_size(p5, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))
# predictions
grid.arrange(p2,p3,p4,ncol=2)
#sd predictions
grid.arrange(p5)