classication of traits

as described in https://github.com/jdonhauser/ampliconTraits using usearch v11.0.667 e.g.

usearch -sinaps ASVs.fasta -db genome_size10.fasta -attr genome_size10 -tabbedout genomeclassification.txt -strand plus

calculation of sequence identity with the top hit

e.g for genome size

usearch -usearch_global dna-sequences.fasta -db genome_size10.fasta -strand plus -id 0.5 \
  -maxaccepts 8 -maxrejects 128 -top_hit_only \
  -userout genome_size10_topHit.txt -userfields query+target+id 

libraries

library(vegan)
library(grid)
library(reshape2)
library(egg)
library(RColorBrewer)
library(viridis)
library(ggforce)
library(sp)
library(missMDA)
library(cluster)
library(usdm)
library(MASS)
library(corrplot)
library(ecospat) 
library(randomForest)
library(fitdistrplus)
library(Hmisc)
library(rgdal)
library(raster)
library(dismo) 
library(maptools)
library(terra)
library(viridis)
library(MicEnvMod)
library(stars)


sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] stars_0.6-4          sf_1.0-12            abind_1.4-5         
##  [4] MicEnvMod_0.0.0.9000 maptools_1.1-4       dismo_1.3-9         
##  [7] raster_3.5-29        rgdal_1.6-4          Hmisc_4.7-1         
## [10] Formula_1.2-4        fitdistrplus_1.1-8   survival_3.4-0      
## [13] randomForest_4.7-1.1 ecospat_3.5.1        corrplot_0.92       
## [16] MASS_7.3-57          usdm_2.1-6           terra_1.7-23        
## [19] cluster_2.1.4        missMDA_1.18         sp_2.0-0            
## [22] ggforce_0.4.1        viridis_0.6.4        viridisLite_0.4.2   
## [25] RColorBrewer_1.1-3   egg_0.4.5            ggplot2_3.4.4       
## [28] gridExtra_2.3        reshape2_1.4.4       vegan_2.6-2         
## [31] lattice_0.20-45      permute_0.9-7       
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.4.1        plyr_1.8.7             splines_4.1.3         
##   [4] digest_0.6.29          foreach_1.5.2          htmltools_0.5.5       
##   [7] earth_5.3.2            fansi_1.0.3            magrittr_2.0.3        
##  [10] checkmate_2.1.0        doParallel_1.0.17      ks_1.14.0             
##  [13] jpeg_0.1-9             colorspace_2.1-0       ggrepel_0.9.1         
##  [16] xfun_0.32              dplyr_1.0.9            jsonlite_1.8.0        
##  [19] iterators_1.0.14       ape_5.6-2              glue_1.6.2            
##  [22] polyclip_1.10-4        PresenceAbsence_1.1.11 gtable_0.3.4          
##  [25] emmeans_1.8.7          scales_1.2.1           mvtnorm_1.1-3         
##  [28] DBI_1.1.3              Rcpp_1.0.10            plotrix_3.8-2         
##  [31] xtable_1.8-4           htmlTable_2.4.1        units_0.8-0           
##  [34] flashClust_1.01-2      foreign_0.8-82         proxy_0.4-27          
##  [37] mclust_6.0.0           DT_0.28                htmlwidgets_1.6.2     
##  [40] nabor_0.5.0            mice_3.15.0            pkgconfig_2.0.3       
##  [43] reshape_0.8.9          farver_2.1.1           nnet_7.3-17           
##  [46] multcompView_0.1-9     sass_0.4.2             deldir_1.0-6          
##  [49] utf8_1.2.2             tidyselect_1.1.2       rlang_1.1.1           
##  [52] munsell_0.5.0          TeachingDemos_2.12     tools_4.1.3           
##  [55] cachem_1.0.6           xgboost_1.6.0.1        cli_3.3.0             
##  [58] generics_0.1.3         ade4_1.7-19            broom_1.0.1           
##  [61] evaluate_0.16          stringr_1.4.1          fastmap_1.1.0         
##  [64] yaml_2.3.5             maxnet_0.1.4           knitr_1.40            
##  [67] purrr_0.3.4            nlme_3.1-159           pracma_2.4.2          
##  [70] leaps_3.1              biomod2_4.2-4          compiler_4.1.3        
##  [73] rstudioapi_0.14        png_0.1-7              e1071_1.7-11          
##  [76] tibble_3.2.1           tweenr_2.0.2           bslib_0.4.0           
##  [79] stringi_1.7.6          plotmo_3.6.2           poibin_1.5            
##  [82] Matrix_1.4-1           classInt_0.4-7         gbm_2.1.8.1           
##  [85] vctrs_0.6.1            pillar_1.9.0           lifecycle_1.0.4       
##  [88] jquerylib_0.1.4        estimability_1.4.1     data.table_1.14.2     
##  [91] R6_2.5.1               latticeExtra_0.6-30    KernSmooth_2.23-20    
##  [94] codetools_0.2-18       gtools_3.9.3           assertthat_0.2.1      
##  [97] withr_2.5.2            mgcv_1.8-40            parallel_4.1.3        
## [100] rpart_4.1.16           tidyr_1.2.0            coda_0.19-4           
## [103] class_7.3-20           rmarkdown_2.16         mda_0.5-3             
## [106] pROC_1.18.0            scatterplot3d_0.3-44   base64enc_0.1-3       
## [109] FactoMineR_2.8         interp_1.1-3

setup

plot.theme1 <- theme(panel.grid.major = element_blank(),
                     panel.grid.minor = element_blank(),
                     panel.background = element_rect(fill = "white",
                                                     colour = "black",
                                                     size = 0.5, linetype = "solid"),
                     panel.border= element_rect(fill=NA,size = 0.5, linetype = 'solid',colour = "black"),
                     axis.text.x = element_text(size=13),axis.text.y = element_text(size=13),legend.text = element_text(size=13),
                     axis.title = element_text(size=14),
                     legend.title = element_text(color = "black", size = 14),
                     strip.text.x = element_text(size=14),
                     strip.background = element_rect(colour="black", fill="white")
)



# colors for landcover
covLeg <- read.csv("./igbp_legend.csv")
# create color vector with the colors indicated in the legend table
colLC <- c()
for (i in 1:nrow(covLeg)){
  colLC[i] <- rgb(covLeg[i,'r']/360,covLeg[i,'g']/360,covLeg[i,'b']/360)
}
names(colLC) <- covLeg$Name

### Function for aggregating asvs *********************************************************************** ####
#countab: counttable,taxa are rows
#taxo=taxonomy table
#col2matchcount vector of rownames or column in counttab to which order of taxonomy table should be matched
#col2matchtax:vector of rownames or column in taxo to match
#Taxlevel: taxonomic level on which should be aggregated
#Samp: sample data
#fac: factor in Sample of which the mean should be calculated
#Summarize: should rare taxa be summarized and represented as others
#sumlevel: abundance threshold below which taxa are summarkued as others
# long: transform to long format, default TRUE

AbuBarTable <- function(countab,taxo,col2matchcount,col2matchtax,Taxlevel,Samp,fac,Summarize=F,sumlevel,long=T){
  #replace NA with unclassified
  taxo <- as.data.frame(apply(taxo,2,function(x){
    sapply(x,function(y){ifelse(is.na(y),"unclassified",y)})
  }))
  
  
  tax <- taxo[match(col2matchcount,col2matchtax),]
  
  taxabu <- aggregate(countab,list(tax[,Taxlevel]),sum)
  rownames(taxabu)<- taxabu[,1]
  taxabu <- taxabu[,-1]
  
  taxabu.rel <- apply(taxabu,2,function(x)x/sum(x))
  taxabu.rel <- as.data.frame(t(taxabu.rel))
  taxabu.rel <- taxabu.rel[rownames(Samp),]
  
  for(i in 1:ncol(Samp)){
    Samp[,i] <- as.character(Samp[,i])
  }
  
  taxabu.rel.mean <- aggregate(taxabu.rel,list(Samp[,fac]),mean)
  colnames(taxabu.rel.mean)[1] <- fac
  rownames(taxabu.rel.mean) <- taxabu.rel.mean[,1]
  
  a=c()
  for (i in colnames(taxabu.rel.mean)){
    a[i] <- is.numeric(taxabu.rel.mean[,i])
  }
  
  taxabu.rel.mean  <- taxabu.rel.mean[,a]
  
  
  if(Summarize==T){
    num=taxabu.rel.mean
    num.l=as.list(as.data.frame(t(num)))
    
    num.l=lapply(num.l,function(x){
      names(x)=colnames(num)
      Others=sum(x[which(x<sumlevel)])
      x=x[-which(x<sumlevel)]
      names(Others)="Others"
      x=c(x,Others)
    })
    
    
    taxabu.rel.mean <- as.data.frame(do.call(rbind, lapply(num.l, "[",unique(as.vector(unlist(sapply(num.l,names)))))))
   
    colnames(taxabu.rel.mean) <- unique(as.vector(unlist(sapply(num.l,names))))
    
    
    taxabu.rel.mean <- apply(taxabu.rel.mean,2,function(x){
      sapply(x,function(y){ifelse(is.na(y),0,y)})
    })
    
  }
  
  
  a <- as.data.frame(Samp[!duplicated(Samp[,fac]),])
  
  if(all(rownames(taxabu.rel.mean)==a[,fac])){
    taxabu.rel.mean <- cbind(taxabu.rel.mean,a)
  }else{
    taxabu.rel.mean <- taxabu.rel.mean[match(a[,fac],rownames(taxabu.rel.mean)),]
    taxabu.rel.mean <- cbind(taxabu.rel.mean,a)
  }
  
  
  require(reshape2)
  if (long){
    taxabu.rel.mean.long <- melt(taxabu.rel.mean)
    a=unique(as.character(taxabu.rel.mean.long$variable))
    if("Others"%in%taxabu.rel.mean.long$variable){
      taxabu.rel.mean.long$variable <- factor(taxabu.rel.mean.long$variable,levels=c(a[-(which(a=="Others"))],"Others"))
    }
    
    taxabu.rel.mean.long[,fac] <- factor(taxabu.rel.mean.long[,fac],levels=unique(taxabu.rel.mean.long[,fac]))
    
    return(taxabu.rel.mean.long)
  }else{
    return(taxabu.rel.mean)
  }
  
}

Import files

load("./traitAnnotationProperties.RData" )

Properties of trait annotations across the dataset

Calculate community weighted trait means for continuous traits for bootstrap > 70 and sequence identity with tophit > 80

# based on mean of interval 
# d1_lo5 has only one value for this dataset, omit

# continuous traits
cont <- c("d1_lo10","d1_lo20","d1_lo30","d1_up10","d1_up20","d1_up30","d1_up5","d2_lo10","d2_lo20","d2_lo30",
          "d2_lo5","d2_up10","d2_up20","d2_up30","d2_up5","doubling_h10","doubling_h20","doubling_h30","doubling_h40","doubling_h50",
          "doubling_h5","genome_size10","genome_size20","genome_size5","optimum_ph10","optimum_ph20",
          "optimum_ph5","optimum_tmp10","optimum_tmp20","rRNA16S_genes_exact_int","rRNA16S_genes10",
          "rRNA16S_genes5")


traits_WA <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 70 or identity is < 80 or NA
  a$Value <- ifelse(a$Bootstrap < 70 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
  
  # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  traits_WA[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
traits_WA2 <- traits_WA

for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df <- do.call(cbind, traits_WA2)
colnames(traits_WA_df) <- names(traits_WA2)

Correlation matrices for different intervals for the same trait

# d1lo
ecospat.cor.plot(traits_WA_df[,grep('d1_lo',colnames(traits_WA_df))])

# d1_up
ecospat.cor.plot(traits_WA_df[,grep('d1_up',colnames(traits_WA_df))][,c(4,1,2,3)])

# d2_lo
ecospat.cor.plot(traits_WA_df[,grep('d2_lo',colnames(traits_WA_df))][,c(4,1,2,3)])

# d2_up
ecospat.cor.plot(traits_WA_df[,grep('d2_up',colnames(traits_WA_df))][,c(4,1,2,3)])

#doubling_h
ecospat.cor.plot(traits_WA_df[,grep('doubling_h',colnames(traits_WA_df))][,c(6,1,2,3,4,5)])

#genome_size
ecospat.cor.plot(traits_WA_df[,grep('genome_size',colnames(traits_WA_df))][,c(3,1,2)])

#optimum_ph
ecospat.cor.plot(traits_WA_df[,grep('optimum_ph',colnames(traits_WA_df))][,c(3,1,2)])

#optimum_tmp
ecospat.cor.plot(traits_WA_df[,grep('optimum_tmp',colnames(traits_WA_df))])

#rRNA16S_genes
ecospat.cor.plot(traits_WA_df[,grep('rRNA16S_genes',colnames(traits_WA_df))][,c(3,1,2)])

Correlation matrices of abundance weighted average for different bootstrap cutoffs

with and without cutoff for sequence identity ### bootstrap > 70 considered classified, no seqID cutoff

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 70
  a$Value <- ifelse(a$Bootstrap < 70, 'unclassified', a$Value)
  
  # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  

  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_70 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_70) <- names(traits_WA2)

bootstrap > 60 considered classified, seqID > 80

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 60 or identity is < 80 or NA
  a$Value <- ifelse(a$Bootstrap < 60 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
  
    # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_60_80 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_60_80) <- names(traits_WA2)

bootstrap > 60 considered classified, no seqID cutoff

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 60
  a$Value <- ifelse(a$Bootstrap < 60, 'unclassified', a$Value)
  
   # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_60 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_60) <- names(traits_WA2)

bootstrap > 50 considered classified, seqID > 80

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 50 or identity is < 80 or NA
  a$Value <- ifelse(a$Bootstrap < 50 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
  
  # extract lower and upper boundary of interval and calculate mean
    # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_50_80 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_50_80) <- names(traits_WA2)

bootstrap > 50 considered classified, no seqID cutoff

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 50
  a$Value <- ifelse(a$Bootstrap < 50, 'unclassified', a$Value)
  
   # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_50 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_50) <- names(traits_WA2)

Correlation matrices for different bootstrap cutoffs

### add cutoff to column names and merge tables
colnames(traits_WA_df) <- paste0(colnames(traits_WA_df),'_7080')
colnames(traits_WA_df_70) <- paste0(colnames(traits_WA_df_70),'_70')

colnames(traits_WA_df_60_80) <- paste0(colnames(traits_WA_df_60_80),'_6080')
colnames(traits_WA_df_60) <- paste0(colnames(traits_WA_df_60),'_60')

colnames(traits_WA_df_50_80) <- paste0(colnames(traits_WA_df_50_80),'_5080')
colnames(traits_WA_df_50) <- paste0(colnames(traits_WA_df_50),'_50')

# merge tables 

traits_WA_df_all <- cbind(traits_WA_df,traits_WA_df_70,traits_WA_df_60_80,traits_WA_df_60,traits_WA_df_50_80,
                          traits_WA_df_50)


# for one of the intervals where stable results were obtained

# d1lo20 
ecospat.cor.plot(traits_WA_df_all[,grep('d1_lo20',colnames(traits_WA_df_all))])

# d1lo20 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d1_lo20.*80$',colnames(traits_WA_df_all))])

# d1up30
ecospat.cor.plot(traits_WA_df_all[,grep('d1_up30',colnames(traits_WA_df_all))])

# d1up30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d1_up30.*80$',colnames(traits_WA_df_all))])

# d2lo30
ecospat.cor.plot(traits_WA_df_all[,grep('d2_lo30',colnames(traits_WA_df_all))])

# d2lo30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d2_lo30.*80$',colnames(traits_WA_df_all))])

# d2up30
ecospat.cor.plot(traits_WA_df_all[,grep('d2_up30',colnames(traits_WA_df_all))])

# d2up30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d1_up30.*80$',colnames(traits_WA_df_all))])

# doubling_h30
ecospat.cor.plot(traits_WA_df_all[,grep('doubling_h30',colnames(traits_WA_df_all))])

# doubling_h30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('doubling_h30.*80$',colnames(traits_WA_df_all))])

# genome_size10
ecospat.cor.plot(traits_WA_df_all[,grep('genome_size10',colnames(traits_WA_df_all))])

# genome_size10 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('genome_size10.*80$',colnames(traits_WA_df_all))])

# optimum_ph20
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_ph20',colnames(traits_WA_df_all))])

# optimum_ph20 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_ph20.*80$',colnames(traits_WA_df_all))])

# optimum_tmp20
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_tmp20',colnames(traits_WA_df_all))])

# optimum_tmp20 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_tmp20.*80$',colnames(traits_WA_df_all))])

# rRNA16S_genes exact
ecospat.cor.plot(traits_WA_df_all[,grep('rRNA16S_genes_exact_int',colnames(traits_WA_df_all))])

# rRNA16S_genes exact - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('rRNA16S_genes_exact_int.*80$',colnames(traits_WA_df_all))])

Fraction of unclassified samples

# aggregate abundances of trait levels
sam <- sam[,c(1:2,4)] # subset to non-numeric

traits_abu <- list()
for (i in names(traits)){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 70 or identity is < 80 or NA
  a$Value <- ifelse(a$Bootstrap < 70 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "Value", sam, "Sample.ID")
  traits_abu[[i]] <- abu
}
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
# add name of trait
for (i in names(traits_abu)){
  traits_abu[[i]]$trait <- i
}

traits_abu_df <- do.call(rbind, traits_abu)
# subset to unclassified
traits_abu_df <- traits_abu_df[traits_abu_df$variable=='unclassified',]

# subset with continuous traits
traits_abu_df_cont <- traits_abu_df[grep('[0-9]$|_int',traits_abu_df$trait),]
# add column with interval and trait without number
traits_abu_df_cont$trait2 <- gsub('[0-9]*$','',traits_abu_df_cont$trait)
traits_abu_df_cont$intervals <- gsub('.*_','',traits_abu_df_cont$trait)
traits_abu_df_cont$intervals <- gsub('[a-z]*','',traits_abu_df_cont$intervals)
traits_abu_df_cont[traits_abu_df_cont$trait=="rRNA16S_genes_exact_int",'intervals'] <- 'exact'
traits_abu_df_cont[traits_abu_df_cont$trait2=="rRNA16S_genes_exact_int",'trait2'] <- 'rRNA16S_genes'

# order intervals
traits_abu_df_cont$intervals <- factor(traits_abu_df_cont$intervals, levels = c("5", "10", "20", "30", "40", "50", "exact"))

# boxplot for continuous variables as a function of interval
ggplot(traits_abu_df_cont, aes(x = intervals, y = value)) + geom_boxplot(outlier.size = 0.3) +
  facet_grid(~trait2, scales = 'free', space = 'free') + plot.theme1 +
  geom_jitter(size = 0.3, alpha = 0.3, color = 'blue') +
  labs(y = '% unclassified', x = 'number of intervals') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

# subset with categorigal traits
traits_abu_df_cat <- traits_abu_df[-(grep('[0-9]$|exact',traits_abu_df$trait)),]

# boxplot for categorical variables
ggplot(traits_abu_df_cat, aes(x = trait, y = value)) + geom_boxplot(outlier.size = 0.3) +
  plot.theme1 +
  geom_jitter(size = 0.3, alpha = 0.3, color = 'blue') +
  labs(y = '% unclassified', x = 'Trait') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

bootstrap values as a function of number of intervals

traits_df <- do.call(rbind,traits)
# subset with continuous traits
traits_df_cont <- traits_df[grep('[0-9]$|_int',traits_df$Trait),]
# add column with interval and trait without number
traits_df_cont$trait2 <- gsub('[0-9]*$','',traits_df_cont$Trait) 
traits_df_cont$intervals <- gsub('.*_','',traits_df_cont$Trait)
traits_df_cont$intervals <- gsub('[a-z]*','',traits_df_cont$intervals)
traits_df_cont[traits_df_cont$Trait=="rRNA16S_genes_exact_int",'intervals'] <- 'exact'
traits_df_cont[traits_df_cont$trait2=="rRNA16S_genes_exact_int",'trait2'] <- 'rRNA16S_genes'

# order intervals
traits_df_cont$intervals <- factor(traits_df_cont$intervals, levels = c("5", "10", "20", "30", "40", "50", "exact"))

# boxplot for continuous variables as a function of interval
ggplot(traits_df_cont, aes(x = intervals, y = Bootstrap)) + geom_boxplot(outlier.size = 0.3) +
  facet_grid(~trait2, scales = 'free', space = 'free') + plot.theme1 +
  # geom_jitter(size = 0.3, alpha = 0.1, color = 'blue') +
  labs(y = 'Bootstrap value', x = 'number of intervals') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

# for categorical traits
traits_df_cat <- traits_df[-(grep('[0-9]$|exact',traits_df$Trait)),]

ggplot(traits_df_cat, aes(x = Trait, y = Bootstrap)) + geom_boxplot(outlier.size = 0.3) +
  plot.theme1 +
  # geom_jitter(size = 0.3, alpha = 0.3, color = 'blue') +
  labs(y = '% unclassified', x = 'Trait') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

sequence identity with tophit properties

distribution across dataset

#retain only one interval for each continuous trait (seq ID with tophit should be the same for different intervals)
traits_df2 <- traits_df[traits_df$Trait%in%c("cell_shape","d1_lo10", "d1_up10","d2_lo10","d2_up10", "doubling_h10", "genome_size10",
                                             "gram_stain", "metabolism", "motility", "optimum_ph10","optimum_tmp10",
                                             "range_salinity", "range_tmp", "rRNA16S_genes_exact", "sporulation"),]

ggplot(traits_df2, aes(x = Trait, y = topHit_ID)) + geom_boxplot(outlier.size = 0.3) +
  plot.theme1 +
  # geom_jitter(size = 0.3, alpha = 0.1, color = 'blue') +
  labs(y = '% sequence identity top hit', x = 'number of intervals') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

# for each sample and trait: average of ASVs that occur in this sample (not weighted by abundance)
meanseqID <- data.frame()
for (i in unique(traits_df2$Trait)){
  a <- traits_df2[traits_df2$Trait==i,]
  for(j in colnames(asv)){
    asvList <- rownames(asv[asv[,j]!=0,])
    meanseqID[j,i] <- mean(a[a$ASV%in%asvList,'topHit_ID'],na.rm = T)
  }
}
rm(asvList,a)
meanseqID <- melt(meanseqID)
## No id variables; using all as measure variables
ggplot(meanseqID, aes(x=variable,y=value)) + geom_boxplot(outlier.size = 0.3) +
  plot.theme1 +
  geom_jitter(size = 0.3, alpha = 0.1, color = 'blue') +
  labs(y = 'mean % sequence identity \n top hit', x = 'number of intervals') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

### bootstrap values as function of sequence identity for each trait

ggplot(traits_df, aes(x = topHit_ID, y = Bootstrap)) + geom_point(alpha = 0.2, size = 0.8) +
  geom_smooth(formula = y ~ s(x)) +
  plot.theme1 +
  facet_wrap(~Trait)
## `geom_smooth()` using method = 'gam'

# violin plots for different intervals of sequence identity (in scatterplots it is hard to see patterns)

# 10bins of equal length => group by interval
traits_df$topHit_ID_int <- cut_interval(traits_df$topHit_ID, 10)

# reorder levels of trait
traits_df$Trait <- factor(traits_df$Trait, levels = c("cell_shape", "d1_lo5", "d1_lo10","d1_lo20","d1_lo30",
                                                      "d1_up5","d1_up10","d1_up20", "d1_up30",
                                                      "d2_lo5", "d2_lo10","d2_lo20", "d2_lo30",
                                                      "d2_up5",  "d2_up10", "d2_up20", "d2_up30",
                                                      "doubling_h5", "doubling_h10", "doubling_h20","doubling_h30","doubling_h40", "doubling_h50",
                                                      "genome_size5", "genome_size10" , "genome_size20", "gram_stain", "metabolism","motility" ,
                                                      "optimum_ph5","optimum_ph10", "optimum_ph20","optimum_tmp10", "optimum_tmp20",
                                                      "range_salinity", "range_tmp",
                                                      "rRNA16S_genes5","rRNA16S_genes10", "rRNA16S_genes_exact_int", "rRNA16S_genes_exact", "sporulation"))



ggplot(traits_df, aes(x = topHit_ID, y = Bootstrap, group = topHit_ID_int)) + geom_violin(scale = "width", fill = "lightblue") +
  plot.theme1 +
  facet_wrap(~Trait)

Modelling CWM of genome size

Prepare predictor variables

final predictor tables provided (after imputing NAs, aggregating means of replicates and selection of non-colinear predictors); code for imputing NAs and selecting non-colinear variables shown for illustration

# code used for imputing missing values (NOT RUN HERE); samNum is table of numerical predictor variables
# estimate number of components
nb <- estim_ncpPCA(samNum, ncp.min=0, ncp.max=10, scale = TRUE)
# estimate values
samNumEst <- imputePCA(samNum, ncp=nb$ncp)
# extract table with values including estimates for NA
samNumEst <- samNumEst$completeObs
# select non-colinear predictors; (NOT RUN HERE); samNumEst.mean is table of numerical predictor variables
vifstep(samNumEst.mean) 

properties of predictor variables

# import data for prredictor and response variables
load("modelGenomeSize.RData")

vif(pred[,-15]) # all < 10
# make landcover factor
pred$landcover <- factor(pred$landcover, levels = rev(unique(pred$landcover)))
predDB$landcover <- factor(predDB$landcover, levels = rev(unique(predDB$landcover)))

# plot predictor variables
# continuous variables in long format to plot as boxplot
pred.long.cont <- melt(pred[,-15])
## No id variables; using all as measure variables
p1 <- ggplot(pred.long.cont, aes(x = variable, y = value)) + geom_boxplot() +
  facet_wrap(~variable, scales = 'free', ncol = 8) + plot.theme1 +
  geom_jitter( alpha = 0.3, color = 'blue') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
p1 <- set_panel_size(p1, width = unit(0.5, "inch"), height = unit(2.7, "inch"))

# frequency for landcover
p2 <- ggplot(pred, aes(x=landcover, fill = landcover)) +
  geom_histogram(stat = "count") +
  stat_count(aes(y=..count.., label=..count..), geom="text", vjust=-.5) +
  scale_fill_manual(values = colLC) +
  plot.theme1 +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

p2 <- set_panel_size(p2, width = unit(3.5, "inch"), height = unit(2.8, "inch"))
grid.arrange(p1,p2, ncol=2)

# range of predictors
apply(pred,2,function(x){min(x)})
##      Soil.temperature        Water.activity                Litter 
##              " 3.200"           "0.4656000"           "   0.0000" 
##             Litter.TC              Soil.C.N                pH.1.2 
##            "23.22000"            "10.00800"               "2.620" 
##                  Silt                  Clay                   MAP 
##            " 0.75000"            " 0.85000"          "  45.34242" 
## BIO5.TmaxHottestMonth     BIO7.TannualRange BIO15.precSeasonality 
##           " 2.786956"            "18.08999"            "10.88828" 
##                   WHC              SOM.COMB             landcover 
##           "-2.000000"            " 0.34400"              "Barren" 
##            Litter.C.N 
##           " 28.70625"
apply(pred,2,function(x){max(x)})
##      Soil.temperature        Water.activity                Litter 
##              "45.144"           "1.0138000"           "1348.3700" 
##             Litter.TC              Soil.C.N                pH.1.2 
##            "50.35000"            "89.99400"               "8.116" 
##                  Silt                  Clay                   MAP 
##            "57.40000"            "40.25000"          "1635.92255" 
## BIO5.TmaxHottestMonth     BIO7.TannualRange BIO15.precSeasonality 
##           "35.900002"            "38.81161"            "86.91499" 
##                   WHC              SOM.COMB             landcover 
##           "23.785325"            "59.11500"      "Woody Savannas" 
##            Litter.C.N 
##           "133.90957"

Model genome_size

Assess distribution of response variable for use with glm

#find best distribution to model indices
#vector with distributions to test
dist=c("norm", "pois", "exp", "gamma", "nbinom", "geom", "unif", "logis")

ModLogl <- list()
ModAIC <- list()
ModBIC <- list()

for (i in dist){
  t <- fitdist(traits_WA_df[,'genome_size10'], i,  method="mme")
  # Log likelihood
  ModLogl[[i]] <- t$loglik
  # Akaike information criterion
  ModAIC[[i]] <- t$aic
  # Bic or SBC (Schwarz Bayesian criterion)
  ModBIC[[i]] <- t$bic
  rm(t)
  
} 


# plots
t <- list()
for (i in dist){
  t[[i]] <- fitdist(traits_WA_df$genome_size10, i,  method="mme")
}

par(mfrow = c(2, 2))
plot.legend <- c(names(t)[grep("[0-9]",ModAIC)])
denscomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend, main = 'genome_size10')
qqcomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend)
cdfcomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend)
ppcomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend)

#create data.frames from lists
ModLogl <- as.data.frame(do.call(cbind,ModLogl))
ModAIC <- as.data.frame(do.call(cbind,ModAIC))
ModBIC <- as.data.frame(do.call(cbind,ModBIC))

ModDist <- rbind(ModLogl,ModAIC,ModBIC)
rownames(ModDist) <- c('Logl','AIC','BIC')
rm(ModLogl,ModAIC,ModBIC,t)

#  normal gives best fit

Models with all variables

random forest

set.seed(7)
all(rownames(pred)==rownames(traits_WA_df))
## [1] TRUE
rf1 <- randomForest(x = pred, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)

# Variance importance
p <- varImpPlotgg(rf1)

grid.arrange(p)

vi_rf1 <- varImpPlot(rf1, sort = T)

vi_rf1 <- vi_rf1[order(vi_rf1[,1],decreasing = T),]

# monovariate response plots 
respM <- respMono(mod = rf1, pred = pred, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 4)

rm(respM)


# model performance: repeated split sampling 
cv <- crossVal(pred = pred, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf1 <-cv$summary

glm stepwise with AIC as stopping criterion

set.seed(7)
full <- glm(traits_WA_df[,'genome_size10'] ~., data = pred, family = gaussian)
null <- glm(traits_WA_df[,'genome_size10'] ~1, data = pred, family = gaussian)
glmStep1 <- stepAIC(full, scope = list(upper = full, lower = null), direction = 'both')
## Start:  AIC=2065.45
## traits_WA_df[, "genome_size10"] ~ Soil.temperature + Water.activity + 
##     Litter + Litter.TC + Soil.C.N + pH.1.2 + Silt + Clay + MAP + 
##     BIO5.TmaxHottestMonth + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + SOM.COMB + landcover + Litter.C.N
## 
##                         Df   Deviance    AIC
## - Soil.temperature       1 1.2721e+13 2063.5
## - MAP                    1 1.2722e+13 2063.5
## - pH.1.2                 1 1.2726e+13 2063.5
## - Litter.C.N             1 1.2726e+13 2063.5
## - BIO5.TmaxHottestMonth  1 1.2735e+13 2063.5
## - BIO7.TannualRange      1 1.2764e+13 2063.7
## - SOM.COMB               1 1.2766e+13 2063.7
## - Water.activity         1 1.2778e+13 2063.8
## - Litter.TC              1 1.2868e+13 2064.3
## - Clay                   1 1.3009e+13 2065.0
## <none>                     1.2721e+13 2065.4
## - Litter                 1 1.3720e+13 2068.8
## - Silt                   1 1.3767e+13 2069.0
## - Soil.C.N               1 1.3795e+13 2069.1
## - WHC                    1 1.4283e+13 2071.6
## - BIO15.precSeasonality  1 1.4737e+13 2073.8
## - landcover              9 2.0384e+13 2080.5
## 
## Step:  AIC=2063.46
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + pH.1.2 + Silt + Clay + MAP + BIO5.TmaxHottestMonth + 
##     BIO7.TannualRange + BIO15.precSeasonality + WHC + SOM.COMB + 
##     landcover + Litter.C.N
## 
##                         Df   Deviance    AIC
## - MAP                    1 1.2723e+13 2061.5
## - pH.1.2                 1 1.2726e+13 2061.5
## - Litter.C.N             1 1.2726e+13 2061.5
## - BIO5.TmaxHottestMonth  1 1.2742e+13 2061.6
## - SOM.COMB               1 1.2772e+13 2061.7
## - BIO7.TannualRange      1 1.2774e+13 2061.8
## - Water.activity         1 1.2783e+13 2061.8
## - Litter.TC              1 1.2868e+13 2062.3
## - Clay                   1 1.3012e+13 2063.0
## <none>                     1.2721e+13 2063.5
## + Soil.temperature       1 1.2721e+13 2065.4
## - Litter                 1 1.3721e+13 2066.8
## - Silt                   1 1.3770e+13 2067.0
## - Soil.C.N               1 1.3947e+13 2067.9
## - WHC                    1 1.4320e+13 2069.7
## - BIO15.precSeasonality  1 1.4785e+13 2072.0
## - landcover              9 2.0460e+13 2078.7
## 
## Step:  AIC=2061.47
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + pH.1.2 + Silt + Clay + BIO5.TmaxHottestMonth + 
##     BIO7.TannualRange + BIO15.precSeasonality + WHC + SOM.COMB + 
##     landcover + Litter.C.N
## 
##                         Df   Deviance    AIC
## - pH.1.2                 1 1.2727e+13 2059.5
## - Litter.C.N             1 1.2728e+13 2059.5
## - BIO5.TmaxHottestMonth  1 1.2742e+13 2059.6
## - SOM.COMB               1 1.2774e+13 2059.8
## - Water.activity         1 1.2783e+13 2059.8
## - BIO7.TannualRange      1 1.2791e+13 2059.8
## - Litter.TC              1 1.2870e+13 2060.3
## - Clay                   1 1.3050e+13 2061.2
## <none>                     1.2723e+13 2061.5
## + MAP                    1 1.2721e+13 2063.5
## + Soil.temperature       1 1.2722e+13 2063.5
## - Silt                   1 1.3800e+13 2065.2
## - Litter                 1 1.3808e+13 2065.2
## - Soil.C.N               1 1.4100e+13 2066.7
## - WHC                    1 1.4335e+13 2067.8
## - BIO15.precSeasonality  1 1.4798e+13 2070.0
## - landcover              9 2.0929e+13 2078.3
## 
## Step:  AIC=2059.49
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + Silt + Clay + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + SOM.COMB + landcover + Litter.C.N
## 
##                         Df   Deviance    AIC
## - Litter.C.N             1 1.2734e+13 2057.5
## - BIO5.TmaxHottestMonth  1 1.2746e+13 2057.6
## - SOM.COMB               1 1.2775e+13 2057.8
## - Water.activity         1 1.2784e+13 2057.8
## - BIO7.TannualRange      1 1.2799e+13 2057.9
## - Litter.TC              1 1.2890e+13 2058.4
## - Clay                   1 1.3088e+13 2059.4
## <none>                     1.2727e+13 2059.5
## + pH.1.2                 1 1.2723e+13 2061.5
## + MAP                    1 1.2726e+13 2061.5
## + Soil.temperature       1 1.2727e+13 2061.5
## - Litter                 1 1.3808e+13 2063.2
## - Silt                   1 1.3873e+13 2063.5
## - Soil.C.N               1 1.4239e+13 2065.3
## - WHC                    1 1.4338e+13 2065.8
## - BIO15.precSeasonality  1 1.4867e+13 2068.4
## - landcover              9 2.4203e+13 2086.5
## 
## Step:  AIC=2057.52
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + Silt + Clay + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + SOM.COMB + landcover
## 
##                         Df   Deviance    AIC
## - BIO5.TmaxHottestMonth  1 1.2751e+13 2055.6
## - SOM.COMB               1 1.2776e+13 2055.8
## - Water.activity         1 1.2795e+13 2055.9
## - BIO7.TannualRange      1 1.2806e+13 2055.9
## - Litter.TC              1 1.2891e+13 2056.4
## <none>                     1.2734e+13 2057.5
## - Clay                   1 1.3123e+13 2057.6
## + Litter.C.N             1 1.2727e+13 2059.5
## + pH.1.2                 1 1.2728e+13 2059.5
## + MAP                    1 1.2732e+13 2059.5
## + Soil.temperature       1 1.2733e+13 2059.5
## - Litter                 1 1.3809e+13 2061.2
## - Silt                   1 1.3875e+13 2061.5
## - Soil.C.N               1 1.4254e+13 2063.4
## - WHC                    1 1.4338e+13 2063.8
## - BIO15.precSeasonality  1 1.4868e+13 2066.4
## - landcover              9 2.4205e+13 2084.5
## 
## Step:  AIC=2055.62
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + Silt + Clay + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + SOM.COMB + landcover
## 
##                         Df   Deviance    AIC
## - SOM.COMB               1 1.2785e+13 2053.8
## - Water.activity         1 1.2796e+13 2053.9
## - BIO7.TannualRange      1 1.2812e+13 2053.9
## - Litter.TC              1 1.2907e+13 2054.5
## <none>                     1.2751e+13 2055.6
## - Clay                   1 1.3125e+13 2055.7
## + BIO5.TmaxHottestMonth  1 1.2734e+13 2057.5
## + Soil.temperature       1 1.2743e+13 2057.6
## + pH.1.2                 1 1.2746e+13 2057.6
## + Litter.C.N             1 1.2746e+13 2057.6
## + MAP                    1 1.2750e+13 2057.6
## - Litter                 1 1.3809e+13 2059.2
## - Silt                   1 1.3920e+13 2059.8
## - WHC                    1 1.4453e+13 2062.4
## - Soil.C.N               1 1.4680e+13 2063.5
## - BIO15.precSeasonality  1 1.5690e+13 2068.1
## - landcover              9 2.4235e+13 2082.6
## 
## Step:  AIC=2053.81
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + Silt + Clay + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + landcover
## 
##                         Df   Deviance    AIC
## - Water.activity         1 1.2845e+13 2052.1
## - BIO7.TannualRange      1 1.2847e+13 2052.2
## - Litter.TC              1 1.2953e+13 2052.7
## - Clay                   1 1.3126e+13 2053.7
## <none>                     1.2785e+13 2053.8
## + SOM.COMB               1 1.2751e+13 2055.6
## + BIO5.TmaxHottestMonth  1 1.2776e+13 2055.8
## + Litter.C.N             1 1.2784e+13 2055.8
## + MAP                    1 1.2785e+13 2055.8
## + pH.1.2                 1 1.2785e+13 2055.8
## + Soil.temperature       1 1.2785e+13 2055.8
## - Litter                 1 1.3891e+13 2057.6
## - Silt                   1 1.4019e+13 2058.3
## - WHC                    1 1.4546e+13 2060.8
## - Soil.C.N               1 1.4701e+13 2061.6
## - BIO15.precSeasonality  1 1.5704e+13 2066.2
## - landcover              9 2.4837e+13 2082.3
## 
## Step:  AIC=2052.13
## traits_WA_df[, "genome_size10"] ~ Litter + Litter.TC + Soil.C.N + 
##     Silt + Clay + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO7.TannualRange      1 1.2880e+13 2050.3
## - Litter.TC              1 1.3020e+13 2051.1
## - Clay                   1 1.3131e+13 2051.7
## <none>                     1.2845e+13 2052.1
## + Water.activity         1 1.2785e+13 2053.8
## + SOM.COMB               1 1.2796e+13 2053.9
## + Soil.temperature       1 1.2824e+13 2054.0
## + pH.1.2                 1 1.2837e+13 2054.1
## + BIO5.TmaxHottestMonth  1 1.2838e+13 2054.1
## + Litter.C.N             1 1.2839e+13 2054.1
## + MAP                    1 1.2844e+13 2054.1
## - Silt                   1 1.4048e+13 2056.4
## - Litter                 1 1.4367e+13 2058.0
## - WHC                    1 1.4561e+13 2058.9
## - Soil.C.N               1 1.4704e+13 2059.6
## - BIO15.precSeasonality  1 1.6042e+13 2065.7
## - landcover              9 2.4951e+13 2080.6
## 
## Step:  AIC=2050.32
## traits_WA_df[, "genome_size10"] ~ Litter + Litter.TC + Soil.C.N + 
##     Silt + Clay + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - Litter.TC              1 1.3025e+13 2049.1
## - Clay                   1 1.3149e+13 2049.8
## <none>                     1.2880e+13 2050.3
## + SOM.COMB               1 1.2834e+13 2052.1
## + BIO7.TannualRange      1 1.2845e+13 2052.1
## + Water.activity         1 1.2847e+13 2052.2
## + Soil.temperature       1 1.2852e+13 2052.2
## + BIO5.TmaxHottestMonth  1 1.2873e+13 2052.3
## + Litter.C.N             1 1.2875e+13 2052.3
## + MAP                    1 1.2877e+13 2052.3
## + pH.1.2                 1 1.2878e+13 2052.3
## - Silt                   1 1.4176e+13 2055.0
## - Litter                 1 1.4524e+13 2056.7
## - WHC                    1 1.4561e+13 2056.9
## - Soil.C.N               1 1.4830e+13 2058.2
## - BIO15.precSeasonality  1 1.6044e+13 2063.7
## - landcover              9 2.4956e+13 2078.6
## 
## Step:  AIC=2049.11
## traits_WA_df[, "genome_size10"] ~ Litter + Soil.C.N + Silt + 
##     Clay + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - Clay                   1 1.3266e+13 2048.4
## <none>                     1.3025e+13 2049.1
## + Litter.TC              1 1.2880e+13 2050.3
## + SOM.COMB               1 1.2965e+13 2050.8
## + Water.activity         1 1.2974e+13 2050.8
## + Soil.temperature       1 1.3004e+13 2051.0
## + BIO5.TmaxHottestMonth  1 1.3015e+13 2051.1
## + BIO7.TannualRange      1 1.3020e+13 2051.1
## + Litter.C.N             1 1.3023e+13 2051.1
## + pH.1.2                 1 1.3025e+13 2051.1
## + MAP                    1 1.3025e+13 2051.1
## - Silt                   1 1.4314e+13 2053.7
## - WHC                    1 1.4569e+13 2054.9
## - Litter                 1 1.4808e+13 2056.1
## - Soil.C.N               1 1.5052e+13 2057.2
## - BIO15.precSeasonality  1 1.6044e+13 2061.7
## - landcover              9 2.5041e+13 2076.9
## 
## Step:  AIC=2048.39
## traits_WA_df[, "genome_size10"] ~ Litter + Soil.C.N + Silt + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## <none>                     1.3266e+13 2048.4
## + Clay                   1 1.3025e+13 2049.1
## + Litter.TC              1 1.3149e+13 2049.8
## + MAP                    1 1.3238e+13 2050.2
## + pH.1.2                 1 1.3246e+13 2050.3
## + Water.activity         1 1.3259e+13 2050.4
## + Litter.C.N             1 1.3261e+13 2050.4
## + Soil.temperature       1 1.3261e+13 2050.4
## + BIO7.TannualRange      1 1.3264e+13 2050.4
## + SOM.COMB               1 1.3264e+13 2050.4
## + BIO5.TmaxHottestMonth  1 1.3265e+13 2050.4
## - WHC                    1 1.4628e+13 2053.2
## - Silt                   1 1.4916e+13 2054.6
## - Soil.C.N               1 1.5130e+13 2055.6
## - Litter                 1 1.5469e+13 2057.2
## - BIO15.precSeasonality  1 1.6114e+13 2060.0
## - landcover              9 2.5566e+13 2076.3
# variables importance
varImp.glmstep1 <- VarImp.glm(mod = glmStep1, perm = 100, plot = T, pt = plot.theme1)
grid.arrange(varImp.glmstep1$varImpPlot)

vi_glmstep1<- varImp.glmstep1$varImpDat

# Monavariate response plot
respM <- respMono(mod = glmStep1, pred = pred, w = 2, h = 2, type = "step", scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)

rm(respM)

# repeated split sampling with final model from stepwise
cv <- crossVal(pred = glmStep1$model[,2:ncol(glmStep1$model)], resp = traits_WA_df, var = 'genome_size10', plot = F, type = "glm")
cv_glmstep1_fin <- cv$summary

# with stepwise selection for each run
cv <- crossVal.step(pred = pred, resp = traits_WA_df, var = 'genome_size10', plot = F, direction = "both")
cv_glmstep1_sel <- cv$summary
# frequency variables appear in model
Vars_glmstep1_sel <- cv$variablesFraction

random forest with most important variables only

# to reduce overfitting, use same number of variables as in glmstep
set.seed(7)

pred2 <- pred[,rownames(vi_rf1)[1:(ncol(glmStep1$model)-1)]]
rf2 <- randomForest(x = pred2, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)

# Variance importance
p <- varImpPlotgg(rf2)

grid.arrange(p)

vi_rf2 <- varImpPlot(rf2, sort = T)

# monovariate response plots 
respM <- respMono(mod = rf2, pred = pred2, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)

rm(respM)

# model performance: repeated split sampling 
cv <- crossVal(pred = pred2, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf2 <-cv$summary

summary of crossvalidation results

# predictive accuracy
crossValSum <- as.data.frame(rbind(cv_rf1,cv_glmstep1_sel,cv_glmstep1_fin,cv_rf2))
crossValSum$modelType <- c('random forest all predictors', 'stepwise with var selection full','stepwise final model from full dataset',
                           'random forest most importants vars')

crossValSum
# for stepwise selection in cross validation: % each variable appears in model
Vars_glmstep1_sel <- as.data.frame(Vars_glmstep1_sel)
Vars_glmstep1_sel$Variable <- rownames(Vars_glmstep1_sel)
var_sum <- Vars_glmstep1_sel
var_sum

ensemble prediction

set.seed(7)
# random forest with all variables and glmstep
mods <- list(rf1,glmStep1)
names(mods) <- c('rf1','glmStep1')

weights <- c(crossValSum[c("cv_rf1","cv_glmstep1_fin"),'meanCor'])
names(weights) <- names(mods)

cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble1 <- cv$summary

# random forest with important variables only and glmstep
mods <- list(rf2,glmStep1)
names(mods) <- c('rf2','glmStep1')

weights <- c(crossValSum[c("cv_rf2","cv_glmstep1_fin"),'meanCor'])
names(weights) <- names(mods)

cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble2 <- cv$summary

# combine output from cross validation with different ensembles
crossValEns <- cbind(cv_ensemble1,cv_ensemble2)
crossValEns <- as.data.frame(t(crossValEns))
crossValEns$description <- c(
  'rf with all varaibles and glmstep',
  'rf with most important variables and glmstep')
crossValEns

Plot mean variable importance for different model types and variable sets

# combine variable importance
# for rfs
vi_rf1 <- as.data.frame(vi_rf1)
vi_rf2 <- as.data.frame(vi_rf2)

vi_rf1[,'Variable']<- rownames(vi_rf1)
vi_rf2[,'Variable'] <- rownames(vi_rf2)

vi <- merge(vi_rf1,vi_rf2, by = 'Variable', all = T)
colnames(vi)[c(2,4)] <- c('rf1','rf2')

vi <- vi[,-grep('Node',colnames(vi))] # remove columns with node impurity

# glmstep
vi <- merge(vi,vi_glmstep1, by = 'Variable', all = T)
colnames(vi)[4:5] <- c('glmstep1.mean','glmstep1.sd') # sd from permutations when calculating variable importance

# mean and sd for random forest sets4 and 6
vi[,'rf1_2_mean'] <- apply(vi[,c('rf1','rf2')],1,function(x)mean(x,na.rm = T))
vi[,'rf1_2_sd'] <- apply(vi[,c('rf1','rf2')],1,function(x)sd(x,na.rm = T))

Plots variable importance for most important variables

# show number of variables that as selected by glmstep 
a <- vi[order(vi$rf1, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf2))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p1 <- ggplot(a, aes(x = rf1, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf1')
p1 <- set_panel_size(p1, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

a <- vi[order(vi$rf2, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf2))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p2 <- ggplot(a, aes(x = rf2, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf2')
p2 <- set_panel_size(p2, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

a <- vi[order(vi$rf1_2_mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf2))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p3 <- ggplot(a, aes(x = rf1_2_mean, y = Variable)) + geom_col() +
  geom_errorbar(aes(xmin = rf1_2_mean - rf1_2_sd, xmax = rf1_2_mean + rf1_2_sd), width = 0.4, size =1) +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf1, 2 mean')
p3 <- set_panel_size(p3, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

# glmstep4 and % variable in model 
a <- vi[order(vi$glmstep1.mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$glmstep1.mean))))),]# adapt rows to nonNa
a$Variable <- factor(a$Variable, levels = rev(a$Variable))

# percentages variable in model crossval  => value, 1- value
b <- data.frame(Variable = a$Variable)
for(i in a$Variable){
  b[b$Variable==i,"in"] <- var_sum[var_sum$Variable==i,'Vars_glmstep1_sel']
  b[b$Variable==i,"out"] <- 1-var_sum[var_sum$Variable==i,'Vars_glmstep1_sel']
}
b <- melt(b)
## Using Variable as id variables
b$variable <- factor(b$variable, levels = c("out","in"))

# main plot: var importance 
p4 <- ggplot(a, aes(x = glmstep1.mean, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('glmstep1')
p4 <- set_panel_size(p4, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

# % variable in model as extra plot
p5 <- ggplot(b, aes(y=Variable,x=value,fill=variable))+geom_bar(stat="identity",color="black",width = 0.6) +
  scale_fill_manual(values = c("white","black")) + plot.theme1 + ggtitle('glmstep1 % var in model')
p5 <- set_panel_size(p5, width = unit(0.4, "inch"), height = unit(nrow(a)/2, "inch"))

grid.arrange(p1,p2,p3,p4,p5,ncol=5)

Models with variables in databases

random forest

set.seed(7)
all(rownames(predDB)==rownames(traits_WA_df))
## [1] TRUE
rf3 <- randomForest(x = predDB, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)

# Variance importance
p <- varImpPlotgg(rf3)

grid.arrange(p)

vi_rf3 <- varImpPlot(rf3, sort = T)

vi_rf3 <- vi_rf3[order(vi_rf3[,1],decreasing = T),]

# monovariate response plots 
respM <- respMono(mod = rf3, pred = predDB, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 4)

rm(respM)


# model performance: repeated split sampling 
cv <- crossVal(pred = predDB, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf3 <-cv$summary

glm stepwise with AIC as stopping criterion

set.seed(7)
full <- glm(traits_WA_df[,'genome_size10'] ~., data = predDB, family = gaussian)
null <- glm(traits_WA_df[,'genome_size10'] ~1, data = predDB, family = gaussian)
glmStep2 <- stepAIC(full, scope = list(upper = full, lower = null), direction = 'both')
## Start:  AIC=2062.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + pH.1.2 + 
##     Silt + Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - pH.1.2                 1 1.3978e+13 2060.1
## - BIO5.TmaxHottestMonth  1 1.4046e+13 2060.4
## - BIO7.TannualRange      1 1.4123e+13 2060.8
## - MAP                    1 1.4156e+13 2060.9
## - Soil.TOC               1 1.4188e+13 2061.1
## <none>                     1.3978e+13 2062.1
## - Clay                   1 1.4904e+13 2064.5
## - Silt                   1 1.5377e+13 2066.7
## - WHC                    1 1.5449e+13 2067.1
## - BIO15.precSeasonality  1 1.6634e+13 2072.2
## - Soil.C.N               1 1.7158e+13 2074.4
## - landcover              9 2.3642e+13 2080.8
## 
## Step:  AIC=2060.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO5.TmaxHottestMonth  1 1.4049e+13 2058.4
## - BIO7.TannualRange      1 1.4125e+13 2058.8
## - MAP                    1 1.4157e+13 2058.9
## - Soil.TOC               1 1.4207e+13 2059.2
## <none>                     1.3978e+13 2060.1
## + pH.1.2                 1 1.3978e+13 2062.1
## - Clay                   1 1.5076e+13 2063.3
## - Silt                   1 1.5423e+13 2064.9
## - WHC                    1 1.5453e+13 2065.1
## - BIO15.precSeasonality  1 1.6827e+13 2071.0
## - Soil.C.N               1 1.7240e+13 2072.7
## - landcover              9 2.5840e+13 2085.1
## 
## Step:  AIC=2058.41
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO7.TannualRange      1 1.4212e+13 2057.2
## - MAP                    1 1.4267e+13 2057.5
## - Soil.TOC               1 1.4434e+13 2058.3
## <none>                     1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth  1 1.3978e+13 2060.1
## + pH.1.2                 1 1.4046e+13 2060.4
## - Clay                   1 1.5228e+13 2062.1
## - Silt                   1 1.5451e+13 2063.1
## - WHC                    1 1.5453e+13 2063.1
## - Soil.C.N               1 1.7306e+13 2071.0
## - BIO15.precSeasonality  1 1.7555e+13 2072.0
## - landcover              9 2.6610e+13 2085.1
## 
## Step:  AIC=2057.21
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - MAP                    1 1.4323e+13 2055.8
## <none>                     1.4212e+13 2057.2
## - Soil.TOC               1 1.4632e+13 2057.2
## + BIO7.TannualRange      1 1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth  1 1.4125e+13 2058.8
## + pH.1.2                 1 1.4212e+13 2059.2
## - Clay                   1 1.5296e+13 2060.4
## - WHC                    1 1.5589e+13 2061.7
## - Silt                   1 1.5730e+13 2062.3
## - Soil.C.N               1 1.7531e+13 2069.9
## - BIO15.precSeasonality  1 1.7800e+13 2071.0
## - landcover              9 2.6714e+13 2083.4
## 
## Step:  AIC=2055.76
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## <none>                     1.4323e+13 2055.8
## - Soil.TOC               1 1.4808e+13 2056.1
## + BIO5.TmaxHottestMonth  1 1.4211e+13 2057.2
## + MAP                    1 1.4212e+13 2057.2
## + BIO7.TannualRange      1 1.4267e+13 2057.5
## + pH.1.2                 1 1.4319e+13 2057.7
## - Clay                   1 1.5306e+13 2058.4
## - Silt                   1 1.5750e+13 2060.4
## - WHC                    1 1.5833e+13 2060.8
## - Soil.C.N               1 1.7540e+13 2067.9
## - BIO15.precSeasonality  1 1.8118e+13 2070.2
## - landcover              9 2.7288e+13 2082.9
# variables importance
varImp.glmstep2 <- VarImp.glm(mod = glmStep2, perm = 100, plot = T, pt = plot.theme1)
grid.arrange(varImp.glmstep2$varImpPlot)

vi_glmstep2 <- varImp.glmstep2$varImpDat

# Monavariate response plot
respM <- respMono(mod = glmStep2, pred = predDB, w = 2, h = 2, type = "step", scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)

rm(respM)

# repeated split sampling with final model from stepwise
cv <- crossVal(pred = glmStep2$model[,2:ncol(glmStep2$model)], resp = traits_WA_df, var = 'genome_size10', plot = F, type = "glm")
cv_glmstep2_fin <- cv$summary

# with stepwise selection for each set
cv <- crossVal.step(pred = predDB, resp = traits_WA_df, var = 'genome_size10', plot = F, direction = "both")
cv_glmstep2_sel <- cv$summary
# frequency variables appear in model
Vars_glmstep2_sel <- cv$variablesFraction

random forest with most important variables only

# same number of variables as in glmstep
set.seed(7)

pred4 <- predDB[,rownames(vi_rf3)[1:(ncol(glmStep2$model)-1)]]
rf4 <- randomForest(x = pred4, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)

# Variance importance
p <- varImpPlotgg(rf4)

grid.arrange(p)

vi_rf4 <- varImpPlot(rf4, sort = T)

# monovariate response plots 
respM <- respMono(mod = rf4, pred = pred4, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)

rm(respM)


# model performance: repeated split sampling 
cv <- crossVal(pred = pred4, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf4 <-cv$summary

summary of crossvalidation results

# add to previous table
crossValSum <- rbind(crossValSum,cv_rf3,cv_rf4,cv_glmstep2_sel,cv_glmstep2_fin)
rownames(crossValSum)[5:8] <- c('cv_rf3','cv_rf4','cv_glmstep2_sel','cv_glmstep2_fin')
crossValSum$modelType[5:8] <- c('random forest database Vars all predictors', 'random forest database Vars all predictors most important Vars',
                                'stepwise with var selection database vars','stepwise final model database vars')

crossValSum
# for stepwise selection in cross validation: % each variable appears in model
Vars_glmstep2_sel <- as.data.frame(Vars_glmstep2_sel)
Vars_glmstep2_sel$Variable <- rownames(Vars_glmstep2_sel)
var_sum <- merge(var_sum, Vars_glmstep2_sel, by = 'Variable', all = T)
var_sum

ensemble prediction

set.seed(7)
# random forest all variables and glmstep
mods <- list(rf3,glmStep2)
names(mods) <- c('rf3','glmStep2')

weights <- c(crossValSum[c("cv_rf3","cv_glmstep2_fin"),'meanCor'])
names(weights) <- names(mods)

cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble3 <- cv$summary

# random forest with important variables only and glmstep
mods <- list(rf4,glmStep2)
names(mods) <- c('rf4','glmStep2')

weights <- c(crossValSum[c("cv_rf4","cv_glmstep2_fin"),'meanCor'])
names(weights) <- names(mods)

cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble4 <- cv$summary

# combine output from cross validation with different ensembles
crossValEns <- cbind(cv_ensemble1,cv_ensemble2,cv_ensemble3,cv_ensemble4)
crossValEns <- as.data.frame(t(crossValEns))
crossValEns$description <- c(
  'all predictors, rf and glmstep',
  'all predictors, rf with selected subset and glmstep',
  'predictors in databases, rf and glmstep',
  'predictors in databases, rf with selected subset and glmstep'
  )
crossValEns

Plot variable importance for different model types and variable sets

# combine variable importance
# for rfs
vi_rf3 <- as.data.frame(vi_rf3)
vi_rf4 <- as.data.frame(vi_rf4)

vi_rf3[,'Variable']<- rownames(vi_rf3)
vi_rf4[,'Variable'] <- rownames(vi_rf4)

vi <- merge(vi_rf3,vi_rf4, by = 'Variable', all = T)
colnames(vi)[c(2,4)] <- c('rf3','rf4')

vi <- vi[,-grep('Node',colnames(vi))] # remove columns with node impurity

# glmstep
vi <- merge(vi,vi_glmstep2, by = 'Variable', all = T)
colnames(vi)[4:5] <- c('glmstep2.mean','glmstep2.sd') # sd from permutations when calculating variable importance

# mean and sd for random forest 
vi[,'rf3_4_mean'] <- apply(vi[,c('rf3','rf4')],1,function(x)mean(x,na.rm = T))
vi[,'rf3_4_sd'] <- apply(vi[,c('rf3','rf4')],1,function(x)sd(x,na.rm = T))

Plot variable importance for most important variables

# number of variables that was selected by glmstep 
a <- vi[order(vi$rf3, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf4))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p1 <- ggplot(a, aes(x = rf3, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf3')
p1 <- set_panel_size(p1, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

a <- vi[order(vi$rf4, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf4))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p2 <- ggplot(a, aes(x = rf4, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf4')
p2 <- set_panel_size(p2, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

a <- vi[order(vi$rf3_4_mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf4))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p3 <- ggplot(a, aes(x = rf3_4_mean, y = Variable)) + geom_col() +
  geom_errorbar(aes(xmin = rf3_4_mean - rf3_4_sd, xmax = rf3_4_mean + rf3_4_sd), width = 0.4, size =1) +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf3, 4 mean')
p3 <- set_panel_size(p3, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

# glmstep and % variable in model 
a <- vi[order(vi$glmstep2.mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$glmstep2.mean))))),]# adapt rows to nonNa
a$Variable <- factor(a$Variable, levels = rev(a$Variable))

# percentages variable in model crossval
b <- data.frame(Variable = a$Variable)
for(i in a$Variable){
  b[b$Variable==i,"in"] <- var_sum[var_sum$Variable==i,'Vars_glmstep2_sel']
  b[b$Variable==i,"out"] <- 1-var_sum[var_sum$Variable==i,'Vars_glmstep2_sel']
}
b <- melt(b)
## Using Variable as id variables
b$variable <- factor(b$variable, levels = c("out","in"))

# main plot: var importance 
p4 <- ggplot(a, aes(x = glmstep2.mean, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('glmstep2')
p4 <- set_panel_size(p4, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

# % variable in model as extra plot
p5 <- ggplot(b, aes(y=Variable,x=value,fill=variable))+geom_bar(stat="identity",color="black",width = 0.6) +
  scale_fill_manual(values = c("white","black")) + plot.theme1 + ggtitle('glmstep2 % var in model')
p5 <- set_panel_size(p5, width = unit(0.4, "inch"), height = unit(nrow(a)/2, "inch"))

grid.arrange(p1,p2,p3,p4,p5,ncol=5)

Predict CWM of genome size for Spain

# import predictor rasters
load("predictorGrids.RData")

# plot predictors

plot(gridStack, col=rainbow(100,start=.0,end=.8))

## Predictions

# recode land cover in predictor table with numeric identifier as in raster 
predDB_re <- predDB
predDB_re$landcover <- as.character(predDB_re$landcover)
predDB_re$landcover <- gsub("Closed shrublands","Closed Shrublands",predDB_re$landcover) # correct misspelling
# extract  numeric IDs and landcover from database (order is different compared to covleg which was used to assign
# description to numeric ID!!!)
a <- as.data.frame(levels(gridStack$landcover))

for (i in unique(predDB_re$landcover)){
  predDB_re[predDB_re$landcover==i,'landcover'] <- a[a$Name==i,'ID' ]
}
predDB_re$landcover <- as.factor(predDB_re$landcover)

# model with recoded predictors
# glmstep 
set.seed(7)
full <- glm(traits_WA_df[,'genome_size10'] ~., data = predDB_re, family = gaussian)
null <- glm(traits_WA_df[,'genome_size10'] ~1, data = predDB_re, family = gaussian)
glmStep2_re <- stepAIC(full, scope = list(upper = full, lower = null), direction = 'both')
## Start:  AIC=2062.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + pH.1.2 + 
##     Silt + Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - pH.1.2                 1 1.3978e+13 2060.1
## - BIO5.TmaxHottestMonth  1 1.4046e+13 2060.4
## - BIO7.TannualRange      1 1.4123e+13 2060.8
## - MAP                    1 1.4156e+13 2060.9
## - Soil.TOC               1 1.4188e+13 2061.1
## <none>                     1.3978e+13 2062.1
## - Clay                   1 1.4904e+13 2064.5
## - Silt                   1 1.5377e+13 2066.7
## - WHC                    1 1.5449e+13 2067.1
## - BIO15.precSeasonality  1 1.6634e+13 2072.2
## - Soil.C.N               1 1.7158e+13 2074.4
## - landcover              9 2.3642e+13 2080.8
## 
## Step:  AIC=2060.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO5.TmaxHottestMonth  1 1.4049e+13 2058.4
## - BIO7.TannualRange      1 1.4125e+13 2058.8
## - MAP                    1 1.4157e+13 2058.9
## - Soil.TOC               1 1.4207e+13 2059.2
## <none>                     1.3978e+13 2060.1
## + pH.1.2                 1 1.3978e+13 2062.1
## - Clay                   1 1.5076e+13 2063.3
## - Silt                   1 1.5423e+13 2064.9
## - WHC                    1 1.5453e+13 2065.1
## - BIO15.precSeasonality  1 1.6827e+13 2071.0
## - Soil.C.N               1 1.7240e+13 2072.7
## - landcover              9 2.5840e+13 2085.1
## 
## Step:  AIC=2058.41
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO7.TannualRange      1 1.4212e+13 2057.2
## - MAP                    1 1.4267e+13 2057.5
## - Soil.TOC               1 1.4434e+13 2058.3
## <none>                     1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth  1 1.3978e+13 2060.1
## + pH.1.2                 1 1.4046e+13 2060.4
## - Clay                   1 1.5228e+13 2062.1
## - Silt                   1 1.5451e+13 2063.1
## - WHC                    1 1.5453e+13 2063.1
## - Soil.C.N               1 1.7306e+13 2071.0
## - BIO15.precSeasonality  1 1.7555e+13 2072.0
## - landcover              9 2.6610e+13 2085.1
## 
## Step:  AIC=2057.21
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - MAP                    1 1.4323e+13 2055.8
## <none>                     1.4212e+13 2057.2
## - Soil.TOC               1 1.4632e+13 2057.2
## + BIO7.TannualRange      1 1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth  1 1.4125e+13 2058.8
## + pH.1.2                 1 1.4212e+13 2059.2
## - Clay                   1 1.5296e+13 2060.4
## - WHC                    1 1.5589e+13 2061.7
## - Silt                   1 1.5730e+13 2062.3
## - Soil.C.N               1 1.7531e+13 2069.9
## - BIO15.precSeasonality  1 1.7800e+13 2071.0
## - landcover              9 2.6714e+13 2083.4
## 
## Step:  AIC=2055.76
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## <none>                     1.4323e+13 2055.8
## - Soil.TOC               1 1.4808e+13 2056.1
## + BIO5.TmaxHottestMonth  1 1.4211e+13 2057.2
## + MAP                    1 1.4212e+13 2057.2
## + BIO7.TannualRange      1 1.4267e+13 2057.5
## + pH.1.2                 1 1.4319e+13 2057.7
## - Clay                   1 1.5306e+13 2058.4
## - Silt                   1 1.5750e+13 2060.4
## - WHC                    1 1.5833e+13 2060.8
## - Soil.C.N               1 1.7540e+13 2067.9
## - BIO15.precSeasonality  1 1.8118e+13 2070.2
## - landcover              9 2.7288e+13 2082.9
gs_glmstep2 <- predict(gridStack,glmStep2_re)

# plot
# convert to stars
gs_glmstep2_st <- st_as_stars(gs_glmstep2) 
# same scale for all predictions
p2 <- ggplot() + geom_stars(data = gs_glmstep2_st, aes(x = x, y = y, fill = layer),
                            na.action = na.omit) +
  labs(title = "stepwise glm", x = "Longitude", y = "Latitude") +
  scale_fill_viridis(option = "magma", name = "Genome size", limits  = c(4000000,8500000), direction = -1) +
  theme_minimal()
p2 <- set_panel_size(p2, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))

# rf with most important variables
set.seed(7)
pred4_re <- predDB_re[,rownames(vi_rf3)[1:(ncol(glmStep2_re$model)-1)]]
rf4_re <- randomForest(x = pred4_re, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)
gs_rf4 <- predict(gridStack,rf4_re)

# convert to stars
gs_rf4_st <- st_as_stars(gs_rf4) 
# same scale for all predictions
p3 <- ggplot() + geom_stars(data = gs_rf4_st, aes(x = x, y = y, fill = layer),
                            na.action = na.omit) +
  labs(title = "random forest most important variables", x = "Longitude", y = "Latitude") +
  scale_fill_viridis(option = "magma", name = "Genome size", limits  = c(4000000,8500000), direction = -1) +
  theme_minimal()
p3 <- set_panel_size(p3, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))



# ensemble predictions of random forest and glmstep 
weights <- c(crossValSum[c("cv_rf4","cv_glmstep2_fin"),'meanCor'])
# stack rasters from which ensemble prediction (weighted mean) should be calculated
predStack <- stack(gs_rf4,gs_glmstep2)
gs_ens_rf4_glmstep2 <- weighted.mean(predStack,weights)

# plot
# convert to stars
gs_ens_rf4_glmstep2_st <- st_as_stars(gs_ens_rf4_glmstep2) 
# same scale for all predictions
p4 <- ggplot() + geom_stars(data = gs_ens_rf4_glmstep2_st, aes(x = x, y = y, fill = layer),
                            na.action = na.omit) +
  labs(title = "ensemble random forets - stepwise glm", x = "Longitude", y = "Latitude") +
  scale_fill_viridis(option = "magma", name = "Genome size", limits  = c(4000000,8500000), direction = -1) +
  theme_minimal()
p4 <- set_panel_size(p4, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))


# sd between models in ensemble
gs_ens_rf4_glmstep2_sd <- calc(predStack, fun = sd)
# plot
# convert to stars
gs_ens_rf4_glmstep2_sd_st <- st_as_stars(gs_ens_rf4_glmstep2_sd) 
# all values
p5 <- ggplot() + geom_stars(data = gs_ens_rf4_glmstep2_sd_st, aes(x = x, y = y, fill = layer),
                            na.action = na.omit) +
  labs(title = "sd prediction random forest - stepwise glm", x = "Longitude", y = "Latitude") +
  scale_fill_viridis(option = "magma", name = "Genome size", direction = -1) +
  theme_minimal()
p5 <- set_panel_size(p5, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))


# predictions
grid.arrange(p2,p3,p4,ncol=2)

#sd predictions
grid.arrange(p5)