split reference databases in test and training sets at different sequence identity thresholds with usearch

usearch v11.0.667 and seqkit v2.1.0 used

#  sequences  were renamed with S1, S2, S3 etc  to save memory in distance matrix (otherwise 32 bit usearch throws error); after splitting the database the sequences were renamed to their original name (code not shown)

# create distance matrix between sequences maxdist maximum distance between sequences
usearch -calc_distmx genome_size10.fasta -maxdist 0.21 -termdist 0.31 -tabbedout genome_size10_distmx.txt

# create training and test datasets at different % identity
# table for 97% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.025 -maxdist 0.035 -tabbedout genome_size10_97_subsets.txt
# table for 95% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.045 -maxdist 0.055 -tabbedout genome_size10_95_subsets.txt
# table for 90% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.095 -maxdist 0.105 -tabbedout genome_size10_90_subsets.txt
# table for 85% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.145 -maxdist 0.155 -tabbedout genome_size10_85_subsets.txt 
# table for 80% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.195 -maxdist 0.205 -tabbedout genome_size10_80_subsets.txt

# create fasta file with test and training set at each identity, use 1 as training and 2 as test
# create training and test set based on list
# for 97%
# create ID list to subset with using 
awk '$1==1 || $1=="1x" {print $2'  ./genome_size10_97_subsets.txt > trainingIDs_97.txt
awk '$1==2 {print $2' ./genome_size10_97_subsets.txt > testIDs_97.txt
# subset fastas for test and training
seqkit grep -n -f trainingIDs_97.txt genome_size10.fasta > training_97.fasta
seqkit grep -n -f testIDs_97.txt genome_size10.fasta > test_97.fasta

# for 95%
awk '$1==1 || $1=="1x" {print $2' ./genome_size10_95_subsets.txt > trainingIDs_95.txt
awk '$1==2 {print $2' ./genome_size10_95_subsets.txt > testIDs_95.txt
seqkit grep -n -f trainingIDs_95.txt genome_size10.fasta > training_95.fasta
seqkit grep -n -f testIDs_95.txt genome_size10.fasta > test_95.fasta

# for 90%
awk '$1==1 || $1=="1x" {print $2' ./genome_size10_90_subsets.txt > trainingIDs_90.txt
awk '$1==2 {print $2' ./genome_size10_90_subsets.txt > testIDs_90.txt
seqkit grep -n -f trainingIDs_90.txt genome_size10.fasta > training_90.fasta
seqkit grep -n -f testIDs_90.txt genome_size10.fasta > test_90.fasta

# for 85%
awk '$1==1 || $1=="1x" {print $2' ./genome_size10_85_subsets.txt > trainingIDs_85.txt
awk '$1==2 {print $2' ./genome_size10_85_subsets.txt > testIDs_85.txt
seqkit grep -n -f trainingIDs_85.txt genome_size10.fasta > training_85.fasta
seqkit grep -n -f testIDs_85.txt genome_size10.fasta > test_85.fasta

# for 80%
awk '$1==1 || $1=="1x" {print $2' ./genome_size10_80_subsets.txt > trainingIDs_80.txt
awk '$1==2 {print $2' ./genome_size10_80_subsets.txt> testIDs_80.txt
seqkit grep -n -f trainingIDs_80.txt genome_size10.fasta > training_80.fasta
seqkit grep -n -f testIDs_80.txt genome_size10.fasta > test_80.fasta

# Run sinaps
usearch -sinaps ./test_relabeled_97.fasta -db ./training_relabeled_97.fasta -attr $attribute -tabbedout ./genome_size10_97_crossVal.txt  -strand plus # for 97%
usearch -sinaps ./test_relabeled_95.fasta -db ./training_relabeled_95.fasta -attr $attribute -tabbedout ./genome_size10_95_crossVal.txt  -strand plus  # for 95%
usearch -sinaps ./test_relabeled_90.fasta -db ./training_relabeled_90.fasta -attr $attribute -tabbedout ./genome_size10_90_crossVal.txt  -strand plus # for 90%
usearch -sinaps ./test_relabeled_85.fasta -db ./training_relabeled_85.fasta -attr $attribute -tabbedout ./genome_size10_85_crossVal.txt  -strand plus g # for 85%
usearch -sinaps ./test_relabeled_80.fasta -db ./training_relabeled_80.fasta -attr $attribute -tabbedout ./genome_size10_80_crossVal.txt  -strand plus  # for 80%

Assess accuracy for different traits and intervals as a function of sequence identity between query and reference

library(grid)
library(reshape2)
library(egg)
## Loading required package: gridExtra
## Loading required package: ggplot2
library(RColorBrewer)
library(viridis)
## Loading required package: viridisLite

setup

plot.theme1 <- theme(panel.grid.major = element_blank(),
                     panel.grid.minor = element_blank(),
                     panel.background = element_rect(fill = "white",
                                                     colour = "black",
                                                     size = 0.5, linetype = "solid"),
                     panel.border= element_rect(fill=NA,size = 0.5, linetype = 'solid',colour = "black"),
                     axis.text.x = element_text(size=10),axis.text.y = element_text(size=10),legend.text = element_text(size=13),
                     axis.title = element_text(size=11),
                     legend.title = element_text(color = "black", size = 11),
                     strip.text.x = element_text(size=11),
                     strip.background = element_rect(colour="black", fill="white")
)

Import predictions and annotations at different identity thresholds and plot comparison

# vector with continuous and categorical traits
cont <- c("d1_lo5","d1_lo10","d1_lo20","d1_lo30", "d1_up5", "d1_up10","d1_up20","d1_up30","d2_lo5","d2_lo10","d2_lo20","d2_lo30",
          "d2_up5","d2_up10","d2_up20","d2_up30","doubling_h5","doubling_h10","doubling_h20","doubling_h30","doubling_h40", "doubling_h50",
          "genome_size5","genome_size10","genome_size20","optimum_ph5","optimum_ph10","optimum_ph20",
          "optimum_tmp10","optimum_tmp20","rRNA16S_genes5","rRNA16S_genes10","rRNA16S_genes_exact_int"  
          )
cat <- c("cell_shape", "range_salinity",  "range_tmp", "gram_stain" , "sporulation",
         "metabolism","motility")


# import table with replacements to convert limits of intervals back to orignial value (were extended with cut to include border of interval)
Lim <- read.csv('../3_makeTraitSpecificDB/IntervalLimits_Reconversion.csv',
                row.names = 1)

for continuous variables

# use all intervals to determine rank of trait, also the ones that do not occur in test dataset
# for continuous traits
range_all <- list()
id_trait <- list()
for (i in cont){
  basedir <- paste0('../3_makeTraitSpecificDB/',i,'/')
  # loop over differnt identity cutoffs
  id <- list()

  for (j in as.character(c(97,95,90,85,80))){
    id[[j]] <- read.csv(paste0(basedir,grep(paste0(j,'_crossVal.txt'),list.files(basedir),value = T)),sep = '\t', header = F)
    rownames(id[[j]]) <- id[[j]][,1]
    colnames(id[[j]]) <- c('query','prediction','bootstrp','strand')
    
    #extract value of trait in query (true value)
    id[[j]][,'trueValue'] <- unlist(lapply(strsplit(id[[j]][,'query'],"="),function(x)x[2]))
    
    # replace upper and lower border of interval with original value
    # trueValue
    id[[j]][id[[j]]$trueValue==Lim[rownames(Lim)==i,'LI_cut'],'trueValue'] <- Lim[rownames(Lim)==i,'LI_or'] # lowest interval
    id[[j]][id[[j]]$trueValue==Lim[rownames(Lim)==i,'HI_cut'],'trueValue'] <- Lim[rownames(Lim)==i,'HI_or'] # highest interval
    # prediction
    id[[j]][id[[j]]$prediction==Lim[rownames(Lim)==i,'LI_cut'],'prediction'] <- Lim[rownames(Lim)==i,'LI_or'] # lowest interval
    id[[j]][id[[j]]$prediction==Lim[rownames(Lim)==i,'HI_cut'],'prediction'] <- Lim[rownames(Lim)==i,'HI_or'] # highest interval
    
    # calculate difference based on mean of interval
    # mean of each interval for prediction
    lo <- as.numeric(ifelse(substr(id[[j]][,'prediction'],2,2)=='e'|substr(id[[j]][,'prediction'],1,1)=='-',
                            gsub('(-.*?)(-.*)','\\1', id[[j]][,'prediction']),gsub('-.*','', id[[j]][,'prediction'])))
    
    up <- as.numeric(ifelse(substr(id[[j]][,'prediction'],2,2)=='e'|substr(id[[j]][,'prediction'],1,1)=='-',
                            gsub('(.*-.*?-)(.*)','\\2', id[[j]][,'prediction']),gsub('.*-','', id[[j]][,'prediction'])))

    id[[j]][,'prediction_mean'] <- (lo+up)/2
    
    # mean of each interval for true value
    lo <- as.numeric(ifelse(substr(id[[j]][,'trueValue'],2,2)=='e'|substr(id[[j]][,'trueValue'],1,1)=='-',
                            gsub('(-.*?)(-.*)','\\1', id[[j]][,'trueValue']),gsub('-.*','', id[[j]][,'trueValue'])))

    up <- as.numeric(ifelse(substr(id[[j]][,'trueValue'],2,2)=='e'|substr(id[[j]][,'trueValue'],1,1)=='-',
                            gsub('(.*-.*?-)(.*)','\\2', id[[j]][,'trueValue']),gsub('.*-','', id[[j]][,'trueValue'])))
    id[[j]][,'trueValue_mean'] <- (lo+up)/2
    # difference between mean predicted and mean true value
    id[[j]][,'Diff_predVsTrueValue'] <- id[[j]][,'prediction_mean'] - id[[j]][,'trueValue_mean']
    
    # add trait and % seqID
    id[[j]]$ID <- rep(j,nrow(id[[j]]))
    id[[j]]$trait <- rep(i,nrow(id[[j]]))
    
  }
 
  id_trait[[i]] <- id
}


id_trait_df <- list()
for (i in names(id_trait)){
  id_trait_df[[i]] <- do.call(rbind,id_trait[[i]])
}

id_trait_df <- do.call(rbind, id_trait_df)
id_trait_df$trait2 <- gsub('[0-9]*$','',id_trait_df$trait)
id_trait_df$intervals <- gsub('.*_','',id_trait_df$trait)
id_trait_df$intervals <- gsub('[a-z]*','',id_trait_df$intervals)
id_trait_df[id_trait_df$trait=="rRNA16S_genes_exact_int",'intervals'] <- 'exact'

# combine 16S genes interval and exact
id_trait_df[id_trait_df$trait2=="rRNA16S_genes_exact_int",'trait2'] <- "rRNA16S_genes" 
# change levels for intervals 
id_trait_df$intervals <- factor(id_trait_df$intervals, levels = unique(id_trait_df$intervals))

# violin plots accuracy
# all samples scaled area
p <- ggplot(id_trait_df, aes(x = ID, y = Diff_predVsTrueValue, fill = ID)) + geom_violin() +
  facet_grid(trait2~intervals, scales = 'free') + plot.theme1 +
  labs(y = 'Predicted - true value', x = '% sequence identity') +
  scale_fill_manual(values = brewer.pal(5,"Blues"))
p <- set_panel_size(p , width = unit(1.25, "inch"), height = unit(1, "inch"))

grid.arrange(p)

## for categorical variables
acc_all <- list()
for (i in cat){
  basedir <- paste0('../3_makeTraitSpecificDB/',i,'/')
  # loop over differnt identity cutoffs
  id <- list()
  plots <- list()
  acc <- list()
  for (j in as.character(c(97,95,90,85,80))){
    id[[j]] <- read.csv(paste0(basedir,grep(paste0(j,'_crossVal.txt'),list.files(basedir),value = T)),sep = '\t', header = F)
    rownames(id[[j]]) <- id[[j]][,1]
    colnames(id[[j]]) <- c('query','prediction','bootstrp','strand')
    
    #extract value of trait in query (true value)
    id[[j]][,'trueValue'] <- unlist(lapply(strsplit(id[[j]][,'query'],"="),function(x)x[2]))
    
    # column if prediction is correct
    id[[j]][,'pred_eq_true'] <- id[[j]][,'trueValue']==id[[j]][,'prediction']
    
    
    # Calculate number of correct and false predictions
    right <- length(which(id[[j]][,'pred_eq_true']))
    wrong <- length(which(!(id[[j]][,'pred_eq_true'])))
    
    acc[[j]] <- data.frame(j, nrow(id[[j]]), right, wrong, right/nrow(id[[j]]),wrong/nrow(id[[j]]))
    
  }

  acc_df <- do.call(rbind,acc)
  colnames(acc_df) <- c('%ID', 'n_comparisons', 'n_correctPred', 'n_wrongPred', 'n_correctPred_rel', 'n_wrongPred_rel')
  acc_df[,'trait'] <- rep(i,5)
  acc_all[[i]] <- acc_df
}


acc_all_df <- do.call(rbind,acc_all) 

# plot
acc_all_df$`%ID` <- factor(acc_all_df$`%ID`, levels = c('97','95','90','85','80'))


ggplot(acc_all_df, aes(x = `%ID`, y =  n_correctPred_rel )) + geom_bar(stat="identity") +
  geom_text(aes(y = n_correctPred_rel - 0.1, label = n_comparisons), color = 'white') +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  facet_wrap(~trait,scales = 'fixed') +
  ylab('Correct predictions (fraction)') +
  plot.theme1