classication of traits

as described in https://github.com/jdonhauser/ampliconTraits using usearch v11.0.667 e.g.

usearch -sinaps ASVs.fasta -db genome_size10.fasta -attr genome_size10 -tabbedout genomeclassification.txt -strand plus

calculation of sequence identity with the top hit

e.g for genome size

usearch -usearch_global dna-sequences.fasta -db genome_size10.fasta -strand plus -id 0.5 \
  -maxaccepts 8 -maxrejects 128 -top_hit_only \
  -userout genome_size10_topHit.txt -userfields query+target+id

libraries

library(vegan)
library(grid)
library(reshape2)
library(egg)
library(RColorBrewer)
library(viridis)
library(ggforce)
library(sp)
library(missMDA)
library(cluster)
library(usdm)
library(MASS)
library(corrplot)
library(ecospat) 
library(randomForest)
library(fitdistrplus)
library(Hmisc)
library(rgdal)
library(raster)
library(dismo) 
library(maptools)
library(terra)
library(viridis)
library(MicEnvMod)
library(stars)


sessionInfo()

## R version 4.1.3 (2022-03-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] stars_0.6-4          sf_1.0-12            abind_1.4-5         
##  [4] MicEnvMod_0.0.0.9000 maptools_1.1-4       dismo_1.3-9         
##  [7] raster_3.5-29        rgdal_1.6-4          Hmisc_4.7-1         
## [10] Formula_1.2-4        fitdistrplus_1.1-8   survival_3.4-0      
## [13] randomForest_4.7-1.1 ecospat_3.5.1        corrplot_0.92       
## [16] MASS_7.3-57          usdm_2.1-6           terra_1.7-23        
## [19] cluster_2.1.4        missMDA_1.18         sp_2.0-0            
## [22] ggforce_0.4.1        viridis_0.6.4        viridisLite_0.4.2   
## [25] RColorBrewer_1.1-3   egg_0.4.5            ggplot2_3.4.4       
## [28] gridExtra_2.3        reshape2_1.4.4       vegan_2.6-2         
## [31] lattice_0.20-45      permute_0.9-7       
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.4.1        plyr_1.8.7             splines_4.1.3         
##   [4] digest_0.6.29          foreach_1.5.2          htmltools_0.5.5       
##   [7] earth_5.3.2            fansi_1.0.3            magrittr_2.0.3        
##  [10] checkmate_2.1.0        doParallel_1.0.17      ks_1.14.0             
##  [13] jpeg_0.1-9             colorspace_2.1-0       ggrepel_0.9.1         
##  [16] xfun_0.32              dplyr_1.0.9            jsonlite_1.8.0        
##  [19] iterators_1.0.14       ape_5.6-2              glue_1.6.2            
##  [22] polyclip_1.10-4        PresenceAbsence_1.1.11 gtable_0.3.4          
##  [25] emmeans_1.8.7          scales_1.2.1           mvtnorm_1.1-3         
##  [28] DBI_1.1.3              Rcpp_1.0.10            plotrix_3.8-2         
##  [31] xtable_1.8-4           htmlTable_2.4.1        units_0.8-0           
##  [34] flashClust_1.01-2      foreign_0.8-82         proxy_0.4-27          
##  [37] mclust_6.0.0           DT_0.28                htmlwidgets_1.6.2     
##  [40] nabor_0.5.0            mice_3.15.0            pkgconfig_2.0.3       
##  [43] reshape_0.8.9          farver_2.1.1           nnet_7.3-17           
##  [46] multcompView_0.1-9     sass_0.4.2             deldir_1.0-6          
##  [49] utf8_1.2.2             tidyselect_1.1.2       rlang_1.1.1           
##  [52] munsell_0.5.0          TeachingDemos_2.12     tools_4.1.3           
##  [55] cachem_1.0.6           xgboost_1.6.0.1        cli_3.3.0             
##  [58] generics_0.1.3         ade4_1.7-19            broom_1.0.1           
##  [61] evaluate_0.16          stringr_1.4.1          fastmap_1.1.0         
##  [64] yaml_2.3.5             maxnet_0.1.4           knitr_1.40            
##  [67] purrr_0.3.4            nlme_3.1-159           pracma_2.4.2          
##  [70] leaps_3.1              biomod2_4.2-4          compiler_4.1.3        
##  [73] rstudioapi_0.14        png_0.1-7              e1071_1.7-11          
##  [76] tibble_3.2.1           tweenr_2.0.2           bslib_0.4.0           
##  [79] stringi_1.7.6          plotmo_3.6.2           poibin_1.5            
##  [82] Matrix_1.4-1           classInt_0.4-7         gbm_2.1.8.1           
##  [85] vctrs_0.6.1            pillar_1.9.0           lifecycle_1.0.4       
##  [88] jquerylib_0.1.4        estimability_1.4.1     data.table_1.14.2     
##  [91] R6_2.5.1               latticeExtra_0.6-30    KernSmooth_2.23-20    
##  [94] codetools_0.2-18       gtools_3.9.3           assertthat_0.2.1      
##  [97] withr_2.5.2            mgcv_1.8-40            parallel_4.1.3        
## [100] rpart_4.1.16           tidyr_1.2.0            coda_0.19-4           
## [103] class_7.3-20           rmarkdown_2.16         mda_0.5-3             
## [106] pROC_1.18.0            scatterplot3d_0.3-44   base64enc_0.1-3       
## [109] FactoMineR_2.8         interp_1.1-3

setup

plot.theme1 <- theme(panel.grid.major = element_blank(),
                     panel.grid.minor = element_blank(),
                     panel.background = element_rect(fill = "white",
                                                     colour = "black",
                                                     size = 0.5, linetype = "solid"),
                     panel.border= element_rect(fill=NA,size = 0.5, linetype = 'solid',colour = "black"),
                     axis.text.x = element_text(size=13),axis.text.y = element_text(size=13),legend.text = element_text(size=13),
                     axis.title = element_text(size=14),
                     legend.title = element_text(color = "black", size = 14),
                     strip.text.x = element_text(size=14),
                     strip.background = element_rect(colour="black", fill="white")
)



# colors for landcover
covLeg <- read.csv("./igbp_legend.csv")
# create color vector with the colors indicated in the legend table
colLC <- c()
for (i in 1:nrow(covLeg)){
  colLC[i] <- rgb(covLeg[i,'r']/360,covLeg[i,'g']/360,covLeg[i,'b']/360)
}
names(colLC) <- covLeg$Name

### Function for aggregating asvs *********************************************************************** ####
#countab: counttable,taxa are rows
#taxo=taxonomy table
#col2matchcount vector of rownames or column in counttab to which order of taxonomy table should be matched
#col2matchtax:vector of rownames or column in taxo to match
#Taxlevel: taxonomic level on which should be aggregated
#Samp: sample data
#fac: factor in Sample of which the mean should be calculated
#Summarize: should rare taxa be summarized and represented as others
#sumlevel: abundance threshold below which taxa are summarkued as others
# long: transform to long format, default TRUE

AbuBarTable <- function(countab,taxo,col2matchcount,col2matchtax,Taxlevel,Samp,fac,Summarize=F,sumlevel,long=T){
  #replace NA with unclassified
  taxo <- as.data.frame(apply(taxo,2,function(x){
    sapply(x,function(y){ifelse(is.na(y),"unclassified",y)})
  }))
  
  
  tax <- taxo[match(col2matchcount,col2matchtax),]
  
  taxabu <- aggregate(countab,list(tax[,Taxlevel]),sum)
  rownames(taxabu)<- taxabu[,1]
  taxabu <- taxabu[,-1]
  
  taxabu.rel <- apply(taxabu,2,function(x)x/sum(x))
  taxabu.rel <- as.data.frame(t(taxabu.rel))
  taxabu.rel <- taxabu.rel[rownames(Samp),]
  
  for(i in 1:ncol(Samp)){
    Samp[,i] <- as.character(Samp[,i])
  }
  
  taxabu.rel.mean <- aggregate(taxabu.rel,list(Samp[,fac]),mean)
  colnames(taxabu.rel.mean)[1] <- fac
  rownames(taxabu.rel.mean) <- taxabu.rel.mean[,1]
  
  a=c()
  for (i in colnames(taxabu.rel.mean)){
    a[i] <- is.numeric(taxabu.rel.mean[,i])
  }
  
  taxabu.rel.mean  <- taxabu.rel.mean[,a]
  
  
  if(Summarize==T){
    num=taxabu.rel.mean
    num.l=as.list(as.data.frame(t(num)))
    
    num.l=lapply(num.l,function(x){
      names(x)=colnames(num)
      Others=sum(x[which(x<sumlevel)])
      x=x[-which(x<sumlevel)]
      names(Others)="Others"
      x=c(x,Others)
    })
    
    
    taxabu.rel.mean <- as.data.frame(do.call(rbind, lapply(num.l, "[",unique(as.vector(unlist(sapply(num.l,names)))))))
   
    colnames(taxabu.rel.mean) <- unique(as.vector(unlist(sapply(num.l,names))))
    
    
    taxabu.rel.mean <- apply(taxabu.rel.mean,2,function(x){
      sapply(x,function(y){ifelse(is.na(y),0,y)})
    })
    
  }
  
  
  a <- as.data.frame(Samp[!duplicated(Samp[,fac]),])
  
  if(all(rownames(taxabu.rel.mean)==a[,fac])){
    taxabu.rel.mean <- cbind(taxabu.rel.mean,a)
  }else{
    taxabu.rel.mean <- taxabu.rel.mean[match(a[,fac],rownames(taxabu.rel.mean)),]
    taxabu.rel.mean <- cbind(taxabu.rel.mean,a)
  }
  
  
  require(reshape2)
  if (long){
    taxabu.rel.mean.long <- melt(taxabu.rel.mean)
    a=unique(as.character(taxabu.rel.mean.long$variable))
    if("Others"%in%taxabu.rel.mean.long$variable){
      taxabu.rel.mean.long$variable <- factor(taxabu.rel.mean.long$variable,levels=c(a[-(which(a=="Others"))],"Others"))
    }
    
    taxabu.rel.mean.long[,fac] <- factor(taxabu.rel.mean.long[,fac],levels=unique(taxabu.rel.mean.long[,fac]))
    
    return(taxabu.rel.mean.long)
  }else{
    return(taxabu.rel.mean)
  }
  
}

Import files

load("./traitAnnotationProperties.RData" )

Properties of trait annotations across the dataset

Calculate community weighted trait means for continuous traits for bootstrap > 70 and sequence identity with tophit > 80

# based on mean of interval 
# d1_lo5 has only one value for this dataset, omit

# continuous traits
cont <- c("d1_lo10","d1_lo20","d1_lo30","d1_up10","d1_up20","d1_up30","d1_up5","d2_lo10","d2_lo20","d2_lo30",
          "d2_lo5","d2_up10","d2_up20","d2_up30","d2_up5","doubling_h10","doubling_h20","doubling_h30","doubling_h40","doubling_h50",
          "doubling_h5","genome_size10","genome_size20","genome_size5","optimum_ph10","optimum_ph20",
          "optimum_ph5","optimum_tmp10","optimum_tmp20","rRNA16S_genes_exact_int","rRNA16S_genes10",
          "rRNA16S_genes5")


traits_WA <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 70 or identity is < 80 or NA
  a$Value <- ifelse(a$Bootstrap < 70 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
  
  # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  traits_WA[[i]] <- abu
}

## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables

# keep only last column( weighted average)
traits_WA2 <- traits_WA

for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df <- do.call(cbind, traits_WA2)
colnames(traits_WA_df) <- names(traits_WA2)

Correlation matrices for different intervals for the same trait

# d1lo
ecospat.cor.plot(traits_WA_df[,grep('d1_lo',colnames(traits_WA_df))])

# d1_up
ecospat.cor.plot(traits_WA_df[,grep('d1_up',colnames(traits_WA_df))][,c(4,1,2,3)])

# d2_lo
ecospat.cor.plot(traits_WA_df[,grep('d2_lo',colnames(traits_WA_df))][,c(4,1,2,3)])

# d2_up
ecospat.cor.plot(traits_WA_df[,grep('d2_up',colnames(traits_WA_df))][,c(4,1,2,3)])

#doubling_h
ecospat.cor.plot(traits_WA_df[,grep('doubling_h',colnames(traits_WA_df))][,c(6,1,2,3,4,5)])

#genome_size
ecospat.cor.plot(traits_WA_df[,grep('genome_size',colnames(traits_WA_df))][,c(3,1,2)])

#optimum_ph
ecospat.cor.plot(traits_WA_df[,grep('optimum_ph',colnames(traits_WA_df))][,c(3,1,2)])

#optimum_tmp
ecospat.cor.plot(traits_WA_df[,grep('optimum_tmp',colnames(traits_WA_df))])

#rRNA16S_genes
ecospat.cor.plot(traits_WA_df[,grep('rRNA16S_genes',colnames(traits_WA_df))][,c(3,1,2)])

Correlation matrices of abundance weighted average for different bootstrap cutoffs

with and without cutoff for sequence identity ### bootstrap > 70 considered classified, no seqID cutoff

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 70
  a$Value <- ifelse(a$Bootstrap < 70, 'unclassified', a$Value)
  
  # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  

  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}

## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables

# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_70 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_70) <- names(traits_WA2)

bootstrap > 60 considered classified, seqID > 80

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 60 or identity is < 80 or NA
  a$Value <- ifelse(a$Bootstrap < 60 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
  
    # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}

## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables

# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_60_80 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_60_80) <- names(traits_WA2)

bootstrap > 60 considered classified, no seqID cutoff

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 60
  a$Value <- ifelse(a$Bootstrap < 60, 'unclassified', a$Value)
  
   # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}

## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables

# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_60 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_60) <- names(traits_WA2)

bootstrap > 50 considered classified, seqID > 80

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 50 or identity is < 80 or NA
  a$Value <- ifelse(a$Bootstrap < 50 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
  
  # extract lower and upper boundary of interval and calculate mean
    # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}

## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables

# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_50_80 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_50_80) <- names(traits_WA2)

bootstrap > 50 considered classified, no seqID cutoff

traits_WA2 <- list()
for (i in cont){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 50
  a$Value <- ifelse(a$Bootstrap < 50, 'unclassified', a$Value)
  
   # extract lower and upper boundary of interval and calculate mean
  for (j in unique(a$Value)){
    if (j=='unclassified'){
      a[a$Value==j,'lo'] <- NA
      a[a$Value==j,'up'] <- NA
    }else{
      
      a[a$Value==j,'lo'] <- gsub('-.*','', j)
      a[a$Value==j,'up'] <- gsub('.*-','', j)
      
    }
  }
  a$lo <- as.numeric(a$lo)
  a$up <- as.numeric(a$up)
  
  # calculate mean
  a$mean <- (a$lo+a$up)/2
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "mean", sam, "Sample.ID")
  
  # calculated weighted average
  for (j in unique(abu$Sample.ID)){
    b <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','variable']))
    # abundance relative to sum of classified
    c <- as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])) /
      sum(as.numeric(as.character(abu[abu$Sample.ID==j & abu$variable!='unclassified','value'])))
    abu[abu$Sample.ID==j, paste0(i,'_weightedMean')] <- sum(b*c)
    rm(b,c)
  }
  
  
  
  # dereplicate, so each sample occurs only once
  abu <- abu[!duplicated(abu$Sample.ID),]
  
  traits_WA2[[i]] <- abu
}

## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables
## Using Sample.ID, Gradient, Site, GxS as id variables

# keep only last column( weighted average)
for (i in names(traits_WA2)){
  traits_WA2[[i]] <- as.data.frame(traits_WA2[[i]][,ncol(traits_WA2[[i]])])
}

traits_WA_df_50 <- do.call(cbind, traits_WA2)
colnames(traits_WA_df_50) <- names(traits_WA2)

Correlation matrices for different bootstrap cutoffs

### add cutoff to column names and merge tables
colnames(traits_WA_df) <- paste0(colnames(traits_WA_df),'_7080')
colnames(traits_WA_df_70) <- paste0(colnames(traits_WA_df_70),'_70')

colnames(traits_WA_df_60_80) <- paste0(colnames(traits_WA_df_60_80),'_6080')
colnames(traits_WA_df_60) <- paste0(colnames(traits_WA_df_60),'_60')

colnames(traits_WA_df_50_80) <- paste0(colnames(traits_WA_df_50_80),'_5080')
colnames(traits_WA_df_50) <- paste0(colnames(traits_WA_df_50),'_50')

# merge tables 

traits_WA_df_all <- cbind(traits_WA_df,traits_WA_df_70,traits_WA_df_60_80,traits_WA_df_60,traits_WA_df_50_80,
                          traits_WA_df_50)


# for one of the intervals where stable results were obtained

# d1lo20 
ecospat.cor.plot(traits_WA_df_all[,grep('d1_lo20',colnames(traits_WA_df_all))])

# d1lo20 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d1_lo20.*80$',colnames(traits_WA_df_all))])

# d1up30
ecospat.cor.plot(traits_WA_df_all[,grep('d1_up30',colnames(traits_WA_df_all))])

# d1up30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d1_up30.*80$',colnames(traits_WA_df_all))])

# d2lo30
ecospat.cor.plot(traits_WA_df_all[,grep('d2_lo30',colnames(traits_WA_df_all))])

# d2lo30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d2_lo30.*80$',colnames(traits_WA_df_all))])

# d2up30
ecospat.cor.plot(traits_WA_df_all[,grep('d2_up30',colnames(traits_WA_df_all))])

# d2up30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('d1_up30.*80$',colnames(traits_WA_df_all))])

# doubling_h30
ecospat.cor.plot(traits_WA_df_all[,grep('doubling_h30',colnames(traits_WA_df_all))])

# doubling_h30 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('doubling_h30.*80$',colnames(traits_WA_df_all))])

# genome_size10
ecospat.cor.plot(traits_WA_df_all[,grep('genome_size10',colnames(traits_WA_df_all))])

# genome_size10 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('genome_size10.*80$',colnames(traits_WA_df_all))])

# optimum_ph20
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_ph20',colnames(traits_WA_df_all))])

# optimum_ph20 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_ph20.*80$',colnames(traits_WA_df_all))])

# optimum_tmp20
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_tmp20',colnames(traits_WA_df_all))])

# optimum_tmp20 - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('optimum_tmp20.*80$',colnames(traits_WA_df_all))])

# rRNA16S_genes exact
ecospat.cor.plot(traits_WA_df_all[,grep('rRNA16S_genes_exact_int',colnames(traits_WA_df_all))])

# rRNA16S_genes exact - only with sequence identity cutoff
ecospat.cor.plot(traits_WA_df_all[,grep('rRNA16S_genes_exact_int.*80$',colnames(traits_WA_df_all))])

Fraction of unclassified samples

# aggregate abundances of trait levels
sam <- sam[,c(1:2,4)] # subset to non-numeric

traits_abu <- list()
for (i in names(traits)){
  a <- traits[[i]]
  rownames(a) <- a[,1]
  # replace values with unclassified if bootstrap is < 70 or identity is < 80 or NA
  a$Value <- ifelse(a$Bootstrap < 70 | a$topHit_ID < 80 | is.na(a$topHit_ID), 'unclassified', a$Value)
  
  abu <- AbuBarTable(asv, a, rownames(asv), rownames(a), "Value", sam, "Sample.ID")
  traits_abu[[i]] <- abu
}

## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables
## Using Sample.ID, Gradient, GxS as id variables

# add name of trait
for (i in names(traits_abu)){
  traits_abu[[i]]$trait <- i
}

traits_abu_df <- do.call(rbind, traits_abu)
# subset to unclassified
traits_abu_df <- traits_abu_df[traits_abu_df$variable=='unclassified',]

# subset with continuous traits
traits_abu_df_cont <- traits_abu_df[grep('[0-9]$|_int',traits_abu_df$trait),]
# add column with interval and trait without number
traits_abu_df_cont$trait2 <- gsub('[0-9]*$','',traits_abu_df_cont$trait)
traits_abu_df_cont$intervals <- gsub('.*_','',traits_abu_df_cont$trait)
traits_abu_df_cont$intervals <- gsub('[a-z]*','',traits_abu_df_cont$intervals)
traits_abu_df_cont[traits_abu_df_cont$trait=="rRNA16S_genes_exact_int",'intervals'] <- 'exact'
traits_abu_df_cont[traits_abu_df_cont$trait2=="rRNA16S_genes_exact_int",'trait2'] <- 'rRNA16S_genes'

# order intervals
traits_abu_df_cont$intervals <- factor(traits_abu_df_cont$intervals, levels = c("5", "10", "20", "30", "40", "50", "exact"))

# boxplot for continuous variables as a function of interval
ggplot(traits_abu_df_cont, aes(x = intervals, y = value)) + geom_boxplot(outlier.size = 0.3) +
  facet_grid(~trait2, scales = 'free', space = 'free') + plot.theme1 +
  geom_jitter(size = 0.3, alpha = 0.3, color = 'blue') +
  labs(y = '% unclassified', x = 'number of intervals') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

# subset with categorigal traits
traits_abu_df_cat <- traits_abu_df[-(grep('[0-9]$|exact',traits_abu_df$trait)),]

# boxplot for categorical variables
ggplot(traits_abu_df_cat, aes(x = trait, y = value)) + geom_boxplot(outlier.size = 0.3) +
  plot.theme1 +
  geom_jitter(size = 0.3, alpha = 0.3, color = 'blue') +
  labs(y = '% unclassified', x = 'Trait') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

bootstrap values as a function of number of intervals

traits_df <- do.call(rbind,traits)
# subset with continuous traits
traits_df_cont <- traits_df[grep('[0-9]$|_int',traits_df$Trait),]
# add column with interval and trait without number
traits_df_cont$trait2 <- gsub('[0-9]*$','',traits_df_cont$Trait) 
traits_df_cont$intervals <- gsub('.*_','',traits_df_cont$Trait)
traits_df_cont$intervals <- gsub('[a-z]*','',traits_df_cont$intervals)
traits_df_cont[traits_df_cont$Trait=="rRNA16S_genes_exact_int",'intervals'] <- 'exact'
traits_df_cont[traits_df_cont$trait2=="rRNA16S_genes_exact_int",'trait2'] <- 'rRNA16S_genes'

# order intervals
traits_df_cont$intervals <- factor(traits_df_cont$intervals, levels = c("5", "10", "20", "30", "40", "50", "exact"))

# boxplot for continuous variables as a function of interval
ggplot(traits_df_cont, aes(x = intervals, y = Bootstrap)) + geom_boxplot(outlier.size = 0.3) +
  facet_grid(~trait2, scales = 'free', space = 'free') + plot.theme1 +
  # geom_jitter(size = 0.3, alpha = 0.1, color = 'blue') +
  labs(y = 'Bootstrap value', x = 'number of intervals') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

# for categorical traits
traits_df_cat <- traits_df[-(grep('[0-9]$|exact',traits_df$Trait)),]

ggplot(traits_df_cat, aes(x = Trait, y = Bootstrap)) + geom_boxplot(outlier.size = 0.3) +
  plot.theme1 +
  # geom_jitter(size = 0.3, alpha = 0.3, color = 'blue') +
  labs(y = '% unclassified', x = 'Trait') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

sequence identity with tophit properties

distribution across dataset

#retain only one interval for each continuous trait (seq ID with tophit should be the same for different intervals)
traits_df2 <- traits_df[traits_df$Trait%in%c("cell_shape","d1_lo10", "d1_up10","d2_lo10","d2_up10", "doubling_h10", "genome_size10",
                                             "gram_stain", "metabolism", "motility", "optimum_ph10","optimum_tmp10",
                                             "range_salinity", "range_tmp", "rRNA16S_genes_exact", "sporulation"),]

ggplot(traits_df2, aes(x = Trait, y = topHit_ID)) + geom_boxplot(outlier.size = 0.3) +
  plot.theme1 +
  # geom_jitter(size = 0.3, alpha = 0.1, color = 'blue') +
  labs(y = '% sequence identity top hit', x = 'number of intervals') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

# for each sample and trait: average of ASVs that occur in this sample (not weighted by abundance)
meanseqID <- data.frame()
for (i in unique(traits_df2$Trait)){
  a <- traits_df2[traits_df2$Trait==i,]
  for(j in colnames(asv)){
    asvList <- rownames(asv[asv[,j]!=0,])
    meanseqID[j,i] <- mean(a[a$ASV%in%asvList,'topHit_ID'],na.rm = T)
  }
}
rm(asvList,a)
meanseqID <- melt(meanseqID)

## No id variables; using all as measure variables

ggplot(meanseqID, aes(x=variable,y=value)) + geom_boxplot(outlier.size = 0.3) +
  plot.theme1 +
  geom_jitter(size = 0.3, alpha = 0.1, color = 'blue') +
  labs(y = 'mean % sequence identity \n top hit', x = 'number of intervals') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

### bootstrap values as function of sequence identity for each trait

ggplot(traits_df, aes(x = topHit_ID, y = Bootstrap)) + geom_point(alpha = 0.2, size = 0.8) +
  geom_smooth(formula = y ~ s(x)) +
  plot.theme1 +
  facet_wrap(~Trait)

## `geom_smooth()` using method = 'gam'

# violin plots for different intervals of sequence identity (in scatterplots it is hard to see patterns)

# 10bins of equal length => group by interval
traits_df$topHit_ID_int <- cut_interval(traits_df$topHit_ID, 10)

# reorder levels of trait
traits_df$Trait <- factor(traits_df$Trait, levels = c("cell_shape", "d1_lo5", "d1_lo10","d1_lo20","d1_lo30",
                                                      "d1_up5","d1_up10","d1_up20", "d1_up30",
                                                      "d2_lo5", "d2_lo10","d2_lo20", "d2_lo30",
                                                      "d2_up5",  "d2_up10", "d2_up20", "d2_up30",
                                                      "doubling_h5", "doubling_h10", "doubling_h20","doubling_h30","doubling_h40", "doubling_h50",
                                                      "genome_size5", "genome_size10" , "genome_size20", "gram_stain", "metabolism","motility" ,
                                                      "optimum_ph5","optimum_ph10", "optimum_ph20","optimum_tmp10", "optimum_tmp20",
                                                      "range_salinity", "range_tmp",
                                                      "rRNA16S_genes5","rRNA16S_genes10", "rRNA16S_genes_exact_int", "rRNA16S_genes_exact", "sporulation"))



ggplot(traits_df, aes(x = topHit_ID, y = Bootstrap, group = topHit_ID_int)) + geom_violin(scale = "width", fill = "lightblue") +
  plot.theme1 +
  facet_wrap(~Trait)

Modelling CWM of genome size

Prepare predictor variables

final predictor tables provided (after imputing NAs, aggregating means of replicates and selection of non-colinear predictors); code for imputing NAs and selecting non-colinear variables shown for illustration

# code used for imputing missing values (NOT RUN HERE); samNum is table of numerical predictor variables
# estimate number of components
nb <- estim_ncpPCA(samNum, ncp.min=0, ncp.max=10, scale = TRUE)
# estimate values
samNumEst <- imputePCA(samNum, ncp=nb$ncp)
# extract table with values including estimates for NA
samNumEst <- samNumEst$completeObs

# select non-colinear predictors; (NOT RUN HERE); samNumEst.mean is table of numerical predictor variables
vifstep(samNumEst.mean)

properties of predictor variables

# import data for prredictor and response variables
load("modelGenomeSize.RData")

vif(pred[,-15]) # all < 10

# make landcover factor
pred$landcover <- factor(pred$landcover, levels = rev(unique(pred$landcover)))
predDB$landcover <- factor(predDB$landcover, levels = rev(unique(predDB$landcover)))

# plot predictor variables
# continuous variables in long format to plot as boxplot
pred.long.cont <- melt(pred[,-15])

## No id variables; using all as measure variables

p1 <- ggplot(pred.long.cont, aes(x = variable, y = value)) + geom_boxplot() +
  facet_wrap(~variable, scales = 'free', ncol = 8) + plot.theme1 +
  geom_jitter( alpha = 0.3, color = 'blue') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
p1 <- set_panel_size(p1, width = unit(0.5, "inch"), height = unit(2.7, "inch"))

# frequency for landcover
p2 <- ggplot(pred, aes(x=landcover, fill = landcover)) +
  geom_histogram(stat = "count") +
  stat_count(aes(y=..count.., label=..count..), geom="text", vjust=-.5) +
  scale_fill_manual(values = colLC) +
  plot.theme1 +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

p2 <- set_panel_size(p2, width = unit(3.5, "inch"), height = unit(2.8, "inch"))
grid.arrange(p1,p2, ncol=2)

# range of predictors
apply(pred,2,function(x){min(x)})

##      Soil.temperature        Water.activity                Litter 
##              " 3.200"           "0.4656000"           "   0.0000" 
##             Litter.TC              Soil.C.N                pH.1.2 
##            "23.22000"            "10.00800"               "2.620" 
##                  Silt                  Clay                   MAP 
##            " 0.75000"            " 0.85000"          "  45.34242" 
## BIO5.TmaxHottestMonth     BIO7.TannualRange BIO15.precSeasonality 
##           " 2.786956"            "18.08999"            "10.88828" 
##                   WHC              SOM.COMB             landcover 
##           "-2.000000"            " 0.34400"              "Barren" 
##            Litter.C.N 
##           " 28.70625"

apply(pred,2,function(x){max(x)})

##      Soil.temperature        Water.activity                Litter 
##              "45.144"           "1.0138000"           "1348.3700" 
##             Litter.TC              Soil.C.N                pH.1.2 
##            "50.35000"            "89.99400"               "8.116" 
##                  Silt                  Clay                   MAP 
##            "57.40000"            "40.25000"          "1635.92255" 
## BIO5.TmaxHottestMonth     BIO7.TannualRange BIO15.precSeasonality 
##           "35.900002"            "38.81161"            "86.91499" 
##                   WHC              SOM.COMB             landcover 
##           "23.785325"            "59.11500"      "Woody Savannas" 
##            Litter.C.N 
##           "133.90957"

Model genome_size

Assess distribution of response variable for use with glm

#find best distribution to model indices
#vector with distributions to test
dist=c("norm", "pois", "exp", "gamma", "nbinom", "geom", "unif", "logis")

ModLogl <- list()
ModAIC <- list()
ModBIC <- list()

for (i in dist){
  t <- fitdist(traits_WA_df[,'genome_size10'], i,  method="mme")
  # Log likelihood
  ModLogl[[i]] <- t$loglik
  # Akaike information criterion
  ModAIC[[i]] <- t$aic
  # Bic or SBC (Schwarz Bayesian criterion)
  ModBIC[[i]] <- t$bic
  rm(t)
  
} 


# plots
t <- list()
for (i in dist){
  t[[i]] <- fitdist(traits_WA_df$genome_size10, i,  method="mme")
}

par(mfrow = c(2, 2))
plot.legend <- c(names(t)[grep("[0-9]",ModAIC)])
denscomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend, main = 'genome_size10')
qqcomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend)
cdfcomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend)
ppcomp(t[grep("[0-9]",ModAIC)], legendtext = plot.legend)

#create data.frames from lists
ModLogl <- as.data.frame(do.call(cbind,ModLogl))
ModAIC <- as.data.frame(do.call(cbind,ModAIC))
ModBIC <- as.data.frame(do.call(cbind,ModBIC))

ModDist <- rbind(ModLogl,ModAIC,ModBIC)
rownames(ModDist) <- c('Logl','AIC','BIC')
rm(ModLogl,ModAIC,ModBIC,t)

#  normal gives best fit

Models with all variables

random forest

set.seed(7)
all(rownames(pred)==rownames(traits_WA_df))

## [1] TRUE

rf1 <- randomForest(x = pred, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)

# Variance importance
p <- varImpPlotgg(rf1)

grid.arrange(p)

vi_rf1 <- varImpPlot(rf1, sort = T)

vi_rf1 <- vi_rf1[order(vi_rf1[,1],decreasing = T),]

# monovariate response plots 
respM <- respMono(mod = rf1, pred = pred, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 4)

rm(respM)


# model performance: repeated split sampling 
cv <- crossVal(pred = pred, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf1 <-cv$summary

glm stepwise with AIC as stopping criterion

set.seed(7)
full <- glm(traits_WA_df[,'genome_size10'] ~., data = pred, family = gaussian)
null <- glm(traits_WA_df[,'genome_size10'] ~1, data = pred, family = gaussian)
glmStep1 <- stepAIC(full, scope = list(upper = full, lower = null), direction = 'both')

## Start:  AIC=2065.45
## traits_WA_df[, "genome_size10"] ~ Soil.temperature + Water.activity + 
##     Litter + Litter.TC + Soil.C.N + pH.1.2 + Silt + Clay + MAP + 
##     BIO5.TmaxHottestMonth + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + SOM.COMB + landcover + Litter.C.N
## 
##                         Df   Deviance    AIC
## - Soil.temperature       1 1.2721e+13 2063.5
## - MAP                    1 1.2722e+13 2063.5
## - pH.1.2                 1 1.2726e+13 2063.5
## - Litter.C.N             1 1.2726e+13 2063.5
## - BIO5.TmaxHottestMonth  1 1.2735e+13 2063.5
## - BIO7.TannualRange      1 1.2764e+13 2063.7
## - SOM.COMB               1 1.2766e+13 2063.7
## - Water.activity         1 1.2778e+13 2063.8
## - Litter.TC              1 1.2868e+13 2064.3
## - Clay                   1 1.3009e+13 2065.0
## <none>                     1.2721e+13 2065.4
## - Litter                 1 1.3720e+13 2068.8
## - Silt                   1 1.3767e+13 2069.0
## - Soil.C.N               1 1.3795e+13 2069.1
## - WHC                    1 1.4283e+13 2071.6
## - BIO15.precSeasonality  1 1.4737e+13 2073.8
## - landcover              9 2.0384e+13 2080.5
## 
## Step:  AIC=2063.46
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + pH.1.2 + Silt + Clay + MAP + BIO5.TmaxHottestMonth + 
##     BIO7.TannualRange + BIO15.precSeasonality + WHC + SOM.COMB + 
##     landcover + Litter.C.N
## 
##                         Df   Deviance    AIC
## - MAP                    1 1.2723e+13 2061.5
## - pH.1.2                 1 1.2726e+13 2061.5
## - Litter.C.N             1 1.2726e+13 2061.5
## - BIO5.TmaxHottestMonth  1 1.2742e+13 2061.6
## - SOM.COMB               1 1.2772e+13 2061.7
## - BIO7.TannualRange      1 1.2774e+13 2061.8
## - Water.activity         1 1.2783e+13 2061.8
## - Litter.TC              1 1.2868e+13 2062.3
## - Clay                   1 1.3012e+13 2063.0
## <none>                     1.2721e+13 2063.5
## + Soil.temperature       1 1.2721e+13 2065.4
## - Litter                 1 1.3721e+13 2066.8
## - Silt                   1 1.3770e+13 2067.0
## - Soil.C.N               1 1.3947e+13 2067.9
## - WHC                    1 1.4320e+13 2069.7
## - BIO15.precSeasonality  1 1.4785e+13 2072.0
## - landcover              9 2.0460e+13 2078.7
## 
## Step:  AIC=2061.47
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + pH.1.2 + Silt + Clay + BIO5.TmaxHottestMonth + 
##     BIO7.TannualRange + BIO15.precSeasonality + WHC + SOM.COMB + 
##     landcover + Litter.C.N
## 
##                         Df   Deviance    AIC
## - pH.1.2                 1 1.2727e+13 2059.5
## - Litter.C.N             1 1.2728e+13 2059.5
## - BIO5.TmaxHottestMonth  1 1.2742e+13 2059.6
## - SOM.COMB               1 1.2774e+13 2059.8
## - Water.activity         1 1.2783e+13 2059.8
## - BIO7.TannualRange      1 1.2791e+13 2059.8
## - Litter.TC              1 1.2870e+13 2060.3
## - Clay                   1 1.3050e+13 2061.2
## <none>                     1.2723e+13 2061.5
## + MAP                    1 1.2721e+13 2063.5
## + Soil.temperature       1 1.2722e+13 2063.5
## - Silt                   1 1.3800e+13 2065.2
## - Litter                 1 1.3808e+13 2065.2
## - Soil.C.N               1 1.4100e+13 2066.7
## - WHC                    1 1.4335e+13 2067.8
## - BIO15.precSeasonality  1 1.4798e+13 2070.0
## - landcover              9 2.0929e+13 2078.3
## 
## Step:  AIC=2059.49
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + Silt + Clay + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + SOM.COMB + landcover + Litter.C.N
## 
##                         Df   Deviance    AIC
## - Litter.C.N             1 1.2734e+13 2057.5
## - BIO5.TmaxHottestMonth  1 1.2746e+13 2057.6
## - SOM.COMB               1 1.2775e+13 2057.8
## - Water.activity         1 1.2784e+13 2057.8
## - BIO7.TannualRange      1 1.2799e+13 2057.9
## - Litter.TC              1 1.2890e+13 2058.4
## - Clay                   1 1.3088e+13 2059.4
## <none>                     1.2727e+13 2059.5
## + pH.1.2                 1 1.2723e+13 2061.5
## + MAP                    1 1.2726e+13 2061.5
## + Soil.temperature       1 1.2727e+13 2061.5
## - Litter                 1 1.3808e+13 2063.2
## - Silt                   1 1.3873e+13 2063.5
## - Soil.C.N               1 1.4239e+13 2065.3
## - WHC                    1 1.4338e+13 2065.8
## - BIO15.precSeasonality  1 1.4867e+13 2068.4
## - landcover              9 2.4203e+13 2086.5
## 
## Step:  AIC=2057.52
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + Silt + Clay + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + SOM.COMB + landcover
## 
##                         Df   Deviance    AIC
## - BIO5.TmaxHottestMonth  1 1.2751e+13 2055.6
## - SOM.COMB               1 1.2776e+13 2055.8
## - Water.activity         1 1.2795e+13 2055.9
## - BIO7.TannualRange      1 1.2806e+13 2055.9
## - Litter.TC              1 1.2891e+13 2056.4
## <none>                     1.2734e+13 2057.5
## - Clay                   1 1.3123e+13 2057.6
## + Litter.C.N             1 1.2727e+13 2059.5
## + pH.1.2                 1 1.2728e+13 2059.5
## + MAP                    1 1.2732e+13 2059.5
## + Soil.temperature       1 1.2733e+13 2059.5
## - Litter                 1 1.3809e+13 2061.2
## - Silt                   1 1.3875e+13 2061.5
## - Soil.C.N               1 1.4254e+13 2063.4
## - WHC                    1 1.4338e+13 2063.8
## - BIO15.precSeasonality  1 1.4868e+13 2066.4
## - landcover              9 2.4205e+13 2084.5
## 
## Step:  AIC=2055.62
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + Silt + Clay + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + SOM.COMB + landcover
## 
##                         Df   Deviance    AIC
## - SOM.COMB               1 1.2785e+13 2053.8
## - Water.activity         1 1.2796e+13 2053.9
## - BIO7.TannualRange      1 1.2812e+13 2053.9
## - Litter.TC              1 1.2907e+13 2054.5
## <none>                     1.2751e+13 2055.6
## - Clay                   1 1.3125e+13 2055.7
## + BIO5.TmaxHottestMonth  1 1.2734e+13 2057.5
## + Soil.temperature       1 1.2743e+13 2057.6
## + pH.1.2                 1 1.2746e+13 2057.6
## + Litter.C.N             1 1.2746e+13 2057.6
## + MAP                    1 1.2750e+13 2057.6
## - Litter                 1 1.3809e+13 2059.2
## - Silt                   1 1.3920e+13 2059.8
## - WHC                    1 1.4453e+13 2062.4
## - Soil.C.N               1 1.4680e+13 2063.5
## - BIO15.precSeasonality  1 1.5690e+13 2068.1
## - landcover              9 2.4235e+13 2082.6
## 
## Step:  AIC=2053.81
## traits_WA_df[, "genome_size10"] ~ Water.activity + Litter + Litter.TC + 
##     Soil.C.N + Silt + Clay + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + landcover
## 
##                         Df   Deviance    AIC
## - Water.activity         1 1.2845e+13 2052.1
## - BIO7.TannualRange      1 1.2847e+13 2052.2
## - Litter.TC              1 1.2953e+13 2052.7
## - Clay                   1 1.3126e+13 2053.7
## <none>                     1.2785e+13 2053.8
## + SOM.COMB               1 1.2751e+13 2055.6
## + BIO5.TmaxHottestMonth  1 1.2776e+13 2055.8
## + Litter.C.N             1 1.2784e+13 2055.8
## + MAP                    1 1.2785e+13 2055.8
## + pH.1.2                 1 1.2785e+13 2055.8
## + Soil.temperature       1 1.2785e+13 2055.8
## - Litter                 1 1.3891e+13 2057.6
## - Silt                   1 1.4019e+13 2058.3
## - WHC                    1 1.4546e+13 2060.8
## - Soil.C.N               1 1.4701e+13 2061.6
## - BIO15.precSeasonality  1 1.5704e+13 2066.2
## - landcover              9 2.4837e+13 2082.3
## 
## Step:  AIC=2052.13
## traits_WA_df[, "genome_size10"] ~ Litter + Litter.TC + Soil.C.N + 
##     Silt + Clay + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO7.TannualRange      1 1.2880e+13 2050.3
## - Litter.TC              1 1.3020e+13 2051.1
## - Clay                   1 1.3131e+13 2051.7
## <none>                     1.2845e+13 2052.1
## + Water.activity         1 1.2785e+13 2053.8
## + SOM.COMB               1 1.2796e+13 2053.9
## + Soil.temperature       1 1.2824e+13 2054.0
## + pH.1.2                 1 1.2837e+13 2054.1
## + BIO5.TmaxHottestMonth  1 1.2838e+13 2054.1
## + Litter.C.N             1 1.2839e+13 2054.1
## + MAP                    1 1.2844e+13 2054.1
## - Silt                   1 1.4048e+13 2056.4
## - Litter                 1 1.4367e+13 2058.0
## - WHC                    1 1.4561e+13 2058.9
## - Soil.C.N               1 1.4704e+13 2059.6
## - BIO15.precSeasonality  1 1.6042e+13 2065.7
## - landcover              9 2.4951e+13 2080.6
## 
## Step:  AIC=2050.32
## traits_WA_df[, "genome_size10"] ~ Litter + Litter.TC + Soil.C.N + 
##     Silt + Clay + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - Litter.TC              1 1.3025e+13 2049.1
## - Clay                   1 1.3149e+13 2049.8
## <none>                     1.2880e+13 2050.3
## + SOM.COMB               1 1.2834e+13 2052.1
## + BIO7.TannualRange      1 1.2845e+13 2052.1
## + Water.activity         1 1.2847e+13 2052.2
## + Soil.temperature       1 1.2852e+13 2052.2
## + BIO5.TmaxHottestMonth  1 1.2873e+13 2052.3
## + Litter.C.N             1 1.2875e+13 2052.3
## + MAP                    1 1.2877e+13 2052.3
## + pH.1.2                 1 1.2878e+13 2052.3
## - Silt                   1 1.4176e+13 2055.0
## - Litter                 1 1.4524e+13 2056.7
## - WHC                    1 1.4561e+13 2056.9
## - Soil.C.N               1 1.4830e+13 2058.2
## - BIO15.precSeasonality  1 1.6044e+13 2063.7
## - landcover              9 2.4956e+13 2078.6
## 
## Step:  AIC=2049.11
## traits_WA_df[, "genome_size10"] ~ Litter + Soil.C.N + Silt + 
##     Clay + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - Clay                   1 1.3266e+13 2048.4
## <none>                     1.3025e+13 2049.1
## + Litter.TC              1 1.2880e+13 2050.3
## + SOM.COMB               1 1.2965e+13 2050.8
## + Water.activity         1 1.2974e+13 2050.8
## + Soil.temperature       1 1.3004e+13 2051.0
## + BIO5.TmaxHottestMonth  1 1.3015e+13 2051.1
## + BIO7.TannualRange      1 1.3020e+13 2051.1
## + Litter.C.N             1 1.3023e+13 2051.1
## + pH.1.2                 1 1.3025e+13 2051.1
## + MAP                    1 1.3025e+13 2051.1
## - Silt                   1 1.4314e+13 2053.7
## - WHC                    1 1.4569e+13 2054.9
## - Litter                 1 1.4808e+13 2056.1
## - Soil.C.N               1 1.5052e+13 2057.2
## - BIO15.precSeasonality  1 1.6044e+13 2061.7
## - landcover              9 2.5041e+13 2076.9
## 
## Step:  AIC=2048.39
## traits_WA_df[, "genome_size10"] ~ Litter + Soil.C.N + Silt + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## <none>                     1.3266e+13 2048.4
## + Clay                   1 1.3025e+13 2049.1
## + Litter.TC              1 1.3149e+13 2049.8
## + MAP                    1 1.3238e+13 2050.2
## + pH.1.2                 1 1.3246e+13 2050.3
## + Water.activity         1 1.3259e+13 2050.4
## + Litter.C.N             1 1.3261e+13 2050.4
## + Soil.temperature       1 1.3261e+13 2050.4
## + BIO7.TannualRange      1 1.3264e+13 2050.4
## + SOM.COMB               1 1.3264e+13 2050.4
## + BIO5.TmaxHottestMonth  1 1.3265e+13 2050.4
## - WHC                    1 1.4628e+13 2053.2
## - Silt                   1 1.4916e+13 2054.6
## - Soil.C.N               1 1.5130e+13 2055.6
## - Litter                 1 1.5469e+13 2057.2
## - BIO15.precSeasonality  1 1.6114e+13 2060.0
## - landcover              9 2.5566e+13 2076.3

# variables importance
varImp.glmstep1 <- VarImp.glm(mod = glmStep1, perm = 100, plot = T, pt = plot.theme1)
grid.arrange(varImp.glmstep1$varImpPlot)

vi_glmstep1<- varImp.glmstep1$varImpDat

# Monavariate response plot
respM <- respMono(mod = glmStep1, pred = pred, w = 2, h = 2, type = "step", scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)

rm(respM)

# repeated split sampling with final model from stepwise
cv <- crossVal(pred = glmStep1$model[,2:ncol(glmStep1$model)], resp = traits_WA_df, var = 'genome_size10', plot = F, type = "glm")
cv_glmstep1_fin <- cv$summary

# with stepwise selection for each run
cv <- crossVal.step(pred = pred, resp = traits_WA_df, var = 'genome_size10', plot = F, direction = "both")
cv_glmstep1_sel <- cv$summary
# frequency variables appear in model
Vars_glmstep1_sel <- cv$variablesFraction

random forest with most important variables only

# to reduce overfitting, use same number of variables as in glmstep
set.seed(7)

pred2 <- pred[,rownames(vi_rf1)[1:(ncol(glmStep1$model)-1)]]
rf2 <- randomForest(x = pred2, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)

# Variance importance
p <- varImpPlotgg(rf2)

grid.arrange(p)

vi_rf2 <- varImpPlot(rf2, sort = T)

# monovariate response plots 
respM <- respMono(mod = rf2, pred = pred2, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)

rm(respM)

# model performance: repeated split sampling 
cv <- crossVal(pred = pred2, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf2 <-cv$summary

summary of crossvalidation results

# predictive accuracy
crossValSum <- as.data.frame(rbind(cv_rf1,cv_glmstep1_sel,cv_glmstep1_fin,cv_rf2))
crossValSum$modelType <- c('random forest all predictors', 'stepwise with var selection full','stepwise final model from full dataset',
                           'random forest most importants vars')

crossValSum

# for stepwise selection in cross validation: % each variable appears in model
Vars_glmstep1_sel <- as.data.frame(Vars_glmstep1_sel)
Vars_glmstep1_sel$Variable <- rownames(Vars_glmstep1_sel)
var_sum <- Vars_glmstep1_sel
var_sum

ensemble prediction

set.seed(7)
# random forest with all variables and glmstep
mods <- list(rf1,glmStep1)
names(mods) <- c('rf1','glmStep1')

weights <- c(crossValSum[c("cv_rf1","cv_glmstep1_fin"),'meanCor'])
names(weights) <- names(mods)

cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble1 <- cv$summary

# random forest with important variables only and glmstep
mods <- list(rf2,glmStep1)
names(mods) <- c('rf2','glmStep1')

weights <- c(crossValSum[c("cv_rf2","cv_glmstep1_fin"),'meanCor'])
names(weights) <- names(mods)

cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble2 <- cv$summary

# combine output from cross validation with different ensembles
crossValEns <- cbind(cv_ensemble1,cv_ensemble2)
crossValEns <- as.data.frame(t(crossValEns))
crossValEns$description <- c(
  'rf with all varaibles and glmstep',
  'rf with most important variables and glmstep')
crossValEns

Plot mean variable importance for different model types and variable sets

# combine variable importance
# for rfs
vi_rf1 <- as.data.frame(vi_rf1)
vi_rf2 <- as.data.frame(vi_rf2)

vi_rf1[,'Variable']<- rownames(vi_rf1)
vi_rf2[,'Variable'] <- rownames(vi_rf2)

vi <- merge(vi_rf1,vi_rf2, by = 'Variable', all = T)
colnames(vi)[c(2,4)] <- c('rf1','rf2')

vi <- vi[,-grep('Node',colnames(vi))] # remove columns with node impurity

# glmstep
vi <- merge(vi,vi_glmstep1, by = 'Variable', all = T)
colnames(vi)[4:5] <- c('glmstep1.mean','glmstep1.sd') # sd from permutations when calculating variable importance

# mean and sd for random forest sets4 and 6
vi[,'rf1_2_mean'] <- apply(vi[,c('rf1','rf2')],1,function(x)mean(x,na.rm = T))
vi[,'rf1_2_sd'] <- apply(vi[,c('rf1','rf2')],1,function(x)sd(x,na.rm = T))

Plots variable importance for most important variables

# show number of variables that as selected by glmstep 
a <- vi[order(vi$rf1, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf2))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p1 <- ggplot(a, aes(x = rf1, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf1')
p1 <- set_panel_size(p1, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

a <- vi[order(vi$rf2, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf2))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p2 <- ggplot(a, aes(x = rf2, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf2')
p2 <- set_panel_size(p2, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

a <- vi[order(vi$rf1_2_mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf2))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p3 <- ggplot(a, aes(x = rf1_2_mean, y = Variable)) + geom_col() +
  geom_errorbar(aes(xmin = rf1_2_mean - rf1_2_sd, xmax = rf1_2_mean + rf1_2_sd), width = 0.4, size =1) +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf1, 2 mean')
p3 <- set_panel_size(p3, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

# glmstep4 and % variable in model 
a <- vi[order(vi$glmstep1.mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$glmstep1.mean))))),]# adapt rows to nonNa
a$Variable <- factor(a$Variable, levels = rev(a$Variable))

# percentages variable in model crossval  => value, 1- value
b <- data.frame(Variable = a$Variable)
for(i in a$Variable){
  b[b$Variable==i,"in"] <- var_sum[var_sum$Variable==i,'Vars_glmstep1_sel']
  b[b$Variable==i,"out"] <- 1-var_sum[var_sum$Variable==i,'Vars_glmstep1_sel']
}
b <- melt(b)

## Using Variable as id variables

b$variable <- factor(b$variable, levels = c("out","in"))

# main plot: var importance 
p4 <- ggplot(a, aes(x = glmstep1.mean, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('glmstep1')
p4 <- set_panel_size(p4, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

# % variable in model as extra plot
p5 <- ggplot(b, aes(y=Variable,x=value,fill=variable))+geom_bar(stat="identity",color="black",width = 0.6) +
  scale_fill_manual(values = c("white","black")) + plot.theme1 + ggtitle('glmstep1 % var in model')
p5 <- set_panel_size(p5, width = unit(0.4, "inch"), height = unit(nrow(a)/2, "inch"))

grid.arrange(p1,p2,p3,p4,p5,ncol=5)

Models with variables in databases

random forest

set.seed(7)
all(rownames(predDB)==rownames(traits_WA_df))

## [1] TRUE

rf3 <- randomForest(x = predDB, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)

# Variance importance
p <- varImpPlotgg(rf3)

grid.arrange(p)

vi_rf3 <- varImpPlot(rf3, sort = T)

vi_rf3 <- vi_rf3[order(vi_rf3[,1],decreasing = T),]

# monovariate response plots 
respM <- respMono(mod = rf3, pred = predDB, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 4)

rm(respM)


# model performance: repeated split sampling 
cv <- crossVal(pred = predDB, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf3 <-cv$summary

glm stepwise with AIC as stopping criterion

set.seed(7)
full <- glm(traits_WA_df[,'genome_size10'] ~., data = predDB, family = gaussian)
null <- glm(traits_WA_df[,'genome_size10'] ~1, data = predDB, family = gaussian)
glmStep2 <- stepAIC(full, scope = list(upper = full, lower = null), direction = 'both')

## Start:  AIC=2062.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + pH.1.2 + 
##     Silt + Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - pH.1.2                 1 1.3978e+13 2060.1
## - BIO5.TmaxHottestMonth  1 1.4046e+13 2060.4
## - BIO7.TannualRange      1 1.4123e+13 2060.8
## - MAP                    1 1.4156e+13 2060.9
## - Soil.TOC               1 1.4188e+13 2061.1
## <none>                     1.3978e+13 2062.1
## - Clay                   1 1.4904e+13 2064.5
## - Silt                   1 1.5377e+13 2066.7
## - WHC                    1 1.5449e+13 2067.1
## - BIO15.precSeasonality  1 1.6634e+13 2072.2
## - Soil.C.N               1 1.7158e+13 2074.4
## - landcover              9 2.3642e+13 2080.8
## 
## Step:  AIC=2060.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO5.TmaxHottestMonth  1 1.4049e+13 2058.4
## - BIO7.TannualRange      1 1.4125e+13 2058.8
## - MAP                    1 1.4157e+13 2058.9
## - Soil.TOC               1 1.4207e+13 2059.2
## <none>                     1.3978e+13 2060.1
## + pH.1.2                 1 1.3978e+13 2062.1
## - Clay                   1 1.5076e+13 2063.3
## - Silt                   1 1.5423e+13 2064.9
## - WHC                    1 1.5453e+13 2065.1
## - BIO15.precSeasonality  1 1.6827e+13 2071.0
## - Soil.C.N               1 1.7240e+13 2072.7
## - landcover              9 2.5840e+13 2085.1
## 
## Step:  AIC=2058.41
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO7.TannualRange      1 1.4212e+13 2057.2
## - MAP                    1 1.4267e+13 2057.5
## - Soil.TOC               1 1.4434e+13 2058.3
## <none>                     1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth  1 1.3978e+13 2060.1
## + pH.1.2                 1 1.4046e+13 2060.4
## - Clay                   1 1.5228e+13 2062.1
## - Silt                   1 1.5451e+13 2063.1
## - WHC                    1 1.5453e+13 2063.1
## - Soil.C.N               1 1.7306e+13 2071.0
## - BIO15.precSeasonality  1 1.7555e+13 2072.0
## - landcover              9 2.6610e+13 2085.1
## 
## Step:  AIC=2057.21
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - MAP                    1 1.4323e+13 2055.8
## <none>                     1.4212e+13 2057.2
## - Soil.TOC               1 1.4632e+13 2057.2
## + BIO7.TannualRange      1 1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth  1 1.4125e+13 2058.8
## + pH.1.2                 1 1.4212e+13 2059.2
## - Clay                   1 1.5296e+13 2060.4
## - WHC                    1 1.5589e+13 2061.7
## - Silt                   1 1.5730e+13 2062.3
## - Soil.C.N               1 1.7531e+13 2069.9
## - BIO15.precSeasonality  1 1.7800e+13 2071.0
## - landcover              9 2.6714e+13 2083.4
## 
## Step:  AIC=2055.76
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## <none>                     1.4323e+13 2055.8
## - Soil.TOC               1 1.4808e+13 2056.1
## + BIO5.TmaxHottestMonth  1 1.4211e+13 2057.2
## + MAP                    1 1.4212e+13 2057.2
## + BIO7.TannualRange      1 1.4267e+13 2057.5
## + pH.1.2                 1 1.4319e+13 2057.7
## - Clay                   1 1.5306e+13 2058.4
## - Silt                   1 1.5750e+13 2060.4
## - WHC                    1 1.5833e+13 2060.8
## - Soil.C.N               1 1.7540e+13 2067.9
## - BIO15.precSeasonality  1 1.8118e+13 2070.2
## - landcover              9 2.7288e+13 2082.9

# variables importance
varImp.glmstep2 <- VarImp.glm(mod = glmStep2, perm = 100, plot = T, pt = plot.theme1)
grid.arrange(varImp.glmstep2$varImpPlot)

vi_glmstep2 <- varImp.glmstep2$varImpDat

# Monavariate response plot
respM <- respMono(mod = glmStep2, pred = predDB, w = 2, h = 2, type = "step", scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)

rm(respM)

# repeated split sampling with final model from stepwise
cv <- crossVal(pred = glmStep2$model[,2:ncol(glmStep2$model)], resp = traits_WA_df, var = 'genome_size10', plot = F, type = "glm")
cv_glmstep2_fin <- cv$summary

# with stepwise selection for each set
cv <- crossVal.step(pred = predDB, resp = traits_WA_df, var = 'genome_size10', plot = F, direction = "both")
cv_glmstep2_sel <- cv$summary
# frequency variables appear in model
Vars_glmstep2_sel <- cv$variablesFraction

random forest with most important variables only

# same number of variables as in glmstep
set.seed(7)

pred4 <- predDB[,rownames(vi_rf3)[1:(ncol(glmStep2$model)-1)]]
rf4 <- randomForest(x = pred4, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)

# Variance importance
p <- varImpPlotgg(rf4)

grid.arrange(p)

vi_rf4 <- varImpPlot(rf4, sort = T)

# monovariate response plots 
respM <- respMono(mod = rf4, pred = pred4, w = 2, h = 2, scale = "fixed", cols = colLC)
grid.arrange(grobs = respM, ncol = 3)

rm(respM)


# model performance: repeated split sampling 
cv <- crossVal(pred = pred4, resp = traits_WA_df, var = 'genome_size10', plot = F)
cv_rf4 <-cv$summary

summary of crossvalidation results

# add to previous table
crossValSum <- rbind(crossValSum,cv_rf3,cv_rf4,cv_glmstep2_sel,cv_glmstep2_fin)
rownames(crossValSum)[5:8] <- c('cv_rf3','cv_rf4','cv_glmstep2_sel','cv_glmstep2_fin')
crossValSum$modelType[5:8] <- c('random forest database Vars all predictors', 'random forest database Vars all predictors most important Vars',
                                'stepwise with var selection database vars','stepwise final model database vars')

crossValSum

# for stepwise selection in cross validation: % each variable appears in model
Vars_glmstep2_sel <- as.data.frame(Vars_glmstep2_sel)
Vars_glmstep2_sel$Variable <- rownames(Vars_glmstep2_sel)
var_sum <- merge(var_sum, Vars_glmstep2_sel, by = 'Variable', all = T)
var_sum

ensemble prediction

set.seed(7)
# random forest all variables and glmstep
mods <- list(rf3,glmStep2)
names(mods) <- c('rf3','glmStep2')

weights <- c(crossValSum[c("cv_rf3","cv_glmstep2_fin"),'meanCor'])
names(weights) <- names(mods)

cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble3 <- cv$summary

# random forest with important variables only and glmstep
mods <- list(rf4,glmStep2)
names(mods) <- c('rf4','glmStep2')

weights <- c(crossValSum[c("cv_rf4","cv_glmstep2_fin"),'meanCor'])
names(weights) <- names(mods)

cv <- crossVal.ensemble(mods = mods, resp = traits_WA_df,var = "genome_size10",plot = F, weights = weights)
cv_ensemble4 <- cv$summary

# combine output from cross validation with different ensembles
crossValEns <- cbind(cv_ensemble1,cv_ensemble2,cv_ensemble3,cv_ensemble4)
crossValEns <- as.data.frame(t(crossValEns))
crossValEns$description <- c(
  'all predictors, rf and glmstep',
  'all predictors, rf with selected subset and glmstep',
  'predictors in databases, rf and glmstep',
  'predictors in databases, rf with selected subset and glmstep'
  )
crossValEns

Plot variable importance for different model types and variable sets

# combine variable importance
# for rfs
vi_rf3 <- as.data.frame(vi_rf3)
vi_rf4 <- as.data.frame(vi_rf4)

vi_rf3[,'Variable']<- rownames(vi_rf3)
vi_rf4[,'Variable'] <- rownames(vi_rf4)

vi <- merge(vi_rf3,vi_rf4, by = 'Variable', all = T)
colnames(vi)[c(2,4)] <- c('rf3','rf4')

vi <- vi[,-grep('Node',colnames(vi))] # remove columns with node impurity

# glmstep
vi <- merge(vi,vi_glmstep2, by = 'Variable', all = T)
colnames(vi)[4:5] <- c('glmstep2.mean','glmstep2.sd') # sd from permutations when calculating variable importance

# mean and sd for random forest 
vi[,'rf3_4_mean'] <- apply(vi[,c('rf3','rf4')],1,function(x)mean(x,na.rm = T))
vi[,'rf3_4_sd'] <- apply(vi[,c('rf3','rf4')],1,function(x)sd(x,na.rm = T))

Plot variable importance for most important variables

# number of variables that was selected by glmstep 
a <- vi[order(vi$rf3, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf4))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p1 <- ggplot(a, aes(x = rf3, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf3')
p1 <- set_panel_size(p1, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

a <- vi[order(vi$rf4, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf4))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p2 <- ggplot(a, aes(x = rf4, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf4')
p2 <- set_panel_size(p2, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

a <- vi[order(vi$rf3_4_mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$rf4))))),]
a$Variable <- factor(a$Variable, levels = rev(a$Variable))
p3 <- ggplot(a, aes(x = rf3_4_mean, y = Variable)) + geom_col() +
  geom_errorbar(aes(xmin = rf3_4_mean - rf3_4_sd, xmax = rf3_4_mean + rf3_4_sd), width = 0.4, size =1) +
  xlab('%IncMSE') + plot.theme1 + ggtitle('rf3, 4 mean')
p3 <- set_panel_size(p3, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

# glmstep and % variable in model 
a <- vi[order(vi$glmstep2.mean, decreasing = T),]
a <- a[1:(length(which(!(is.na(a$glmstep2.mean))))),]# adapt rows to nonNa
a$Variable <- factor(a$Variable, levels = rev(a$Variable))

# percentages variable in model crossval
b <- data.frame(Variable = a$Variable)
for(i in a$Variable){
  b[b$Variable==i,"in"] <- var_sum[var_sum$Variable==i,'Vars_glmstep2_sel']
  b[b$Variable==i,"out"] <- 1-var_sum[var_sum$Variable==i,'Vars_glmstep2_sel']
}
b <- melt(b)

## Using Variable as id variables

b$variable <- factor(b$variable, levels = c("out","in"))

# main plot: var importance 
p4 <- ggplot(a, aes(x = glmstep2.mean, y = Variable)) + geom_col() +
  xlab('%IncMSE') + plot.theme1 + ggtitle('glmstep2')
p4 <- set_panel_size(p4, width = unit(1.5, "inch"), height = unit(nrow(a)/2, "inch"))

# % variable in model as extra plot
p5 <- ggplot(b, aes(y=Variable,x=value,fill=variable))+geom_bar(stat="identity",color="black",width = 0.6) +
  scale_fill_manual(values = c("white","black")) + plot.theme1 + ggtitle('glmstep2 % var in model')
p5 <- set_panel_size(p5, width = unit(0.4, "inch"), height = unit(nrow(a)/2, "inch"))

grid.arrange(p1,p2,p3,p4,p5,ncol=5)

Predict CWM of genome size for Spain

# import predictor rasters
load("predictorGrids.RData")

# plot predictors

plot(gridStack, col=rainbow(100,start=.0,end=.8))

## Predictions

# recode land cover in predictor table with numeric identifier as in raster 
predDB_re <- predDB
predDB_re$landcover <- as.character(predDB_re$landcover)
predDB_re$landcover <- gsub("Closed shrublands","Closed Shrublands",predDB_re$landcover) # correct misspelling
# extract  numeric IDs and landcover from database (order is different compared to covleg which was used to assign
# description to numeric ID!!!)
a <- as.data.frame(levels(gridStack$landcover))

for (i in unique(predDB_re$landcover)){
  predDB_re[predDB_re$landcover==i,'landcover'] <- a[a$Name==i,'ID' ]
}
predDB_re$landcover <- as.factor(predDB_re$landcover)

# model with recoded predictors
# glmstep 
set.seed(7)
full <- glm(traits_WA_df[,'genome_size10'] ~., data = predDB_re, family = gaussian)
null <- glm(traits_WA_df[,'genome_size10'] ~1, data = predDB_re, family = gaussian)
glmStep2_re <- stepAIC(full, scope = list(upper = full, lower = null), direction = 'both')

## Start:  AIC=2062.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + pH.1.2 + 
##     Silt + Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - pH.1.2                 1 1.3978e+13 2060.1
## - BIO5.TmaxHottestMonth  1 1.4046e+13 2060.4
## - BIO7.TannualRange      1 1.4123e+13 2060.8
## - MAP                    1 1.4156e+13 2060.9
## - Soil.TOC               1 1.4188e+13 2061.1
## <none>                     1.3978e+13 2062.1
## - Clay                   1 1.4904e+13 2064.5
## - Silt                   1 1.5377e+13 2066.7
## - WHC                    1 1.5449e+13 2067.1
## - BIO15.precSeasonality  1 1.6634e+13 2072.2
## - Soil.C.N               1 1.7158e+13 2074.4
## - landcover              9 2.3642e+13 2080.8
## 
## Step:  AIC=2060.05
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO5.TmaxHottestMonth + BIO7.TannualRange + 
##     BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO5.TmaxHottestMonth  1 1.4049e+13 2058.4
## - BIO7.TannualRange      1 1.4125e+13 2058.8
## - MAP                    1 1.4157e+13 2058.9
## - Soil.TOC               1 1.4207e+13 2059.2
## <none>                     1.3978e+13 2060.1
## + pH.1.2                 1 1.3978e+13 2062.1
## - Clay                   1 1.5076e+13 2063.3
## - Silt                   1 1.5423e+13 2064.9
## - WHC                    1 1.5453e+13 2065.1
## - BIO15.precSeasonality  1 1.6827e+13 2071.0
## - Soil.C.N               1 1.7240e+13 2072.7
## - landcover              9 2.5840e+13 2085.1
## 
## Step:  AIC=2058.41
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO7.TannualRange + BIO15.precSeasonality + 
##     WHC + landcover
## 
##                         Df   Deviance    AIC
## - BIO7.TannualRange      1 1.4212e+13 2057.2
## - MAP                    1 1.4267e+13 2057.5
## - Soil.TOC               1 1.4434e+13 2058.3
## <none>                     1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth  1 1.3978e+13 2060.1
## + pH.1.2                 1 1.4046e+13 2060.4
## - Clay                   1 1.5228e+13 2062.1
## - Silt                   1 1.5451e+13 2063.1
## - WHC                    1 1.5453e+13 2063.1
## - Soil.C.N               1 1.7306e+13 2071.0
## - BIO15.precSeasonality  1 1.7555e+13 2072.0
## - landcover              9 2.6610e+13 2085.1
## 
## Step:  AIC=2057.21
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + MAP + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## - MAP                    1 1.4323e+13 2055.8
## <none>                     1.4212e+13 2057.2
## - Soil.TOC               1 1.4632e+13 2057.2
## + BIO7.TannualRange      1 1.4049e+13 2058.4
## + BIO5.TmaxHottestMonth  1 1.4125e+13 2058.8
## + pH.1.2                 1 1.4212e+13 2059.2
## - Clay                   1 1.5296e+13 2060.4
## - WHC                    1 1.5589e+13 2061.7
## - Silt                   1 1.5730e+13 2062.3
## - Soil.C.N               1 1.7531e+13 2069.9
## - BIO15.precSeasonality  1 1.7800e+13 2071.0
## - landcover              9 2.6714e+13 2083.4
## 
## Step:  AIC=2055.76
## traits_WA_df[, "genome_size10"] ~ Soil.C.N + Soil.TOC + Silt + 
##     Clay + BIO15.precSeasonality + WHC + landcover
## 
##                         Df   Deviance    AIC
## <none>                     1.4323e+13 2055.8
## - Soil.TOC               1 1.4808e+13 2056.1
## + BIO5.TmaxHottestMonth  1 1.4211e+13 2057.2
## + MAP                    1 1.4212e+13 2057.2
## + BIO7.TannualRange      1 1.4267e+13 2057.5
## + pH.1.2                 1 1.4319e+13 2057.7
## - Clay                   1 1.5306e+13 2058.4
## - Silt                   1 1.5750e+13 2060.4
## - WHC                    1 1.5833e+13 2060.8
## - Soil.C.N               1 1.7540e+13 2067.9
## - BIO15.precSeasonality  1 1.8118e+13 2070.2
## - landcover              9 2.7288e+13 2082.9

gs_glmstep2 <- predict(gridStack,glmStep2_re)

# plot
# convert to stars
gs_glmstep2_st <- st_as_stars(gs_glmstep2) 
# same scale for all predictions
p2 <- ggplot() + geom_stars(data = gs_glmstep2_st, aes(x = x, y = y, fill = layer),
                            na.action = na.omit) +
  labs(title = "stepwise glm", x = "Longitude", y = "Latitude") +
  scale_fill_viridis(option = "magma", name = "Genome size", limits  = c(4000000,8500000), direction = -1) +
  theme_minimal()
p2 <- set_panel_size(p2, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))

# rf with most important variables
set.seed(7)
pred4_re <- predDB_re[,rownames(vi_rf3)[1:(ncol(glmStep2_re$model)-1)]]
rf4_re <- randomForest(x = pred4_re, y = traits_WA_df$genome_size10, ntree = 1000, importance = TRUE, nPerm = 100)
gs_rf4 <- predict(gridStack,rf4_re)

# convert to stars
gs_rf4_st <- st_as_stars(gs_rf4) 
# same scale for all predictions
p3 <- ggplot() + geom_stars(data = gs_rf4_st, aes(x = x, y = y, fill = layer),
                            na.action = na.omit) +
  labs(title = "random forest most important variables", x = "Longitude", y = "Latitude") +
  scale_fill_viridis(option = "magma", name = "Genome size", limits  = c(4000000,8500000), direction = -1) +
  theme_minimal()
p3 <- set_panel_size(p3, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))



# ensemble predictions of random forest and glmstep 
weights <- c(crossValSum[c("cv_rf4","cv_glmstep2_fin"),'meanCor'])
# stack rasters from which ensemble prediction (weighted mean) should be calculated
predStack <- stack(gs_rf4,gs_glmstep2)
gs_ens_rf4_glmstep2 <- weighted.mean(predStack,weights)

# plot
# convert to stars
gs_ens_rf4_glmstep2_st <- st_as_stars(gs_ens_rf4_glmstep2) 
# same scale for all predictions
p4 <- ggplot() + geom_stars(data = gs_ens_rf4_glmstep2_st, aes(x = x, y = y, fill = layer),
                            na.action = na.omit) +
  labs(title = "ensemble random forets - stepwise glm", x = "Longitude", y = "Latitude") +
  scale_fill_viridis(option = "magma", name = "Genome size", limits  = c(4000000,8500000), direction = -1) +
  theme_minimal()
p4 <- set_panel_size(p4, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))


# sd between models in ensemble
gs_ens_rf4_glmstep2_sd <- calc(predStack, fun = sd)
# plot
# convert to stars
gs_ens_rf4_glmstep2_sd_st <- st_as_stars(gs_ens_rf4_glmstep2_sd) 
# all values
p5 <- ggplot() + geom_stars(data = gs_ens_rf4_glmstep2_sd_st, aes(x = x, y = y, fill = layer),
                            na.action = na.omit) +
  labs(title = "sd prediction random forest - stepwise glm", x = "Longitude", y = "Latitude") +
  scale_fill_viridis(option = "magma", name = "Genome size", direction = -1) +
  theme_minimal()
p5 <- set_panel_size(p5, width = unit((4.275+9.208333)/3, "inch"), height = unit((43.75833-35.26667)/3, "inch"))


# predictions
grid.arrange(p2,p3,p4,ncol=2)

#sd predictions
grid.arrange(p5)

Modelling soil bacterial traits across environments with the trait sequence database ampliconTraits and the R package MicEnvMod - Applying the workflow to real world data

classication of traits

calculation of sequence identity with the top hit

libraries

setup

Import files

Properties of trait annotations across the dataset

Calculate community weighted trait means for continuous traits for bootstrap > 70 and sequence identity with tophit > 80

Correlation matrices for different intervals for the same trait

Correlation matrices of abundance weighted average for different bootstrap cutoffs

bootstrap > 60 considered classified, seqID > 80

bootstrap > 60 considered classified, no seqID cutoff

bootstrap > 50 considered classified, seqID > 80

bootstrap > 50 considered classified, no seqID cutoff

Correlation matrices for different bootstrap cutoffs

Fraction of unclassified samples

bootstrap values as a function of number of intervals

sequence identity with tophit properties

distribution across dataset

Modelling CWM of genome size

Prepare predictor variables

properties of predictor variables

Model genome_size

Assess distribution of response variable for use with glm

Models with all variables

random forest

glm stepwise with AIC as stopping criterion

random forest with most important variables only

summary of crossvalidation results

ensemble prediction

Plot mean variable importance for different model types and variable sets

Plots variable importance for most important variables

Models with variables in databases

random forest

glm stepwise with AIC as stopping criterion

random forest with most important variables only

summary of crossvalidation results

ensemble prediction

Plot variable importance for different model types and variable sets

Plot variable importance for most important variables

Predict CWM of genome size for Spain