usearch v11.0.667 and seqkit v2.1.0 used
# sequences were renamed with S1, S2, S3 etc to save memory in distance matrix (otherwise 32 bit usearch throws error); after splitting the database the sequences were renamed to their original name (code not shown)
# create distance matrix between sequences maxdist maximum distance between sequences
usearch -calc_distmx genome_size10.fasta -maxdist 0.21 -termdist 0.31 -tabbedout genome_size10_distmx.txt
# create training and test datasets at different % identity
# table for 97% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.025 -maxdist 0.035 -tabbedout genome_size10_97_subsets.txt
# table for 95% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.045 -maxdist 0.055 -tabbedout genome_size10_95_subsets.txt
# table for 90% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.095 -maxdist 0.105 -tabbedout genome_size10_90_subsets.txt
# table for 85% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.145 -maxdist 0.155 -tabbedout genome_size10_85_subsets.txt
# table for 80% identity
usearch -distmx_split_identity ./genome_size10_distmx.txt -mindist 0.195 -maxdist 0.205 -tabbedout genome_size10_80_subsets.txt
# create fasta file with test and training set at each identity, use 1 as training and 2 as test
# create training and test set based on list
# for 97%
# create ID list to subset with using
awk '$1==1 || $1=="1x" {print $2' ./genome_size10_97_subsets.txt > trainingIDs_97.txt
awk '$1==2 {print $2' ./genome_size10_97_subsets.txt > testIDs_97.txt
# subset fastas for test and training
seqkit grep -n -f trainingIDs_97.txt genome_size10.fasta > training_97.fasta
seqkit grep -n -f testIDs_97.txt genome_size10.fasta > test_97.fasta
# for 95%
awk '$1==1 || $1=="1x" {print $2' ./genome_size10_95_subsets.txt > trainingIDs_95.txt
awk '$1==2 {print $2' ./genome_size10_95_subsets.txt > testIDs_95.txt
seqkit grep -n -f trainingIDs_95.txt genome_size10.fasta > training_95.fasta
seqkit grep -n -f testIDs_95.txt genome_size10.fasta > test_95.fasta
# for 90%
awk '$1==1 || $1=="1x" {print $2' ./genome_size10_90_subsets.txt > trainingIDs_90.txt
awk '$1==2 {print $2' ./genome_size10_90_subsets.txt > testIDs_90.txt
seqkit grep -n -f trainingIDs_90.txt genome_size10.fasta > training_90.fasta
seqkit grep -n -f testIDs_90.txt genome_size10.fasta > test_90.fasta
# for 85%
awk '$1==1 || $1=="1x" {print $2' ./genome_size10_85_subsets.txt > trainingIDs_85.txt
awk '$1==2 {print $2' ./genome_size10_85_subsets.txt > testIDs_85.txt
seqkit grep -n -f trainingIDs_85.txt genome_size10.fasta > training_85.fasta
seqkit grep -n -f testIDs_85.txt genome_size10.fasta > test_85.fasta
# for 80%
awk '$1==1 || $1=="1x" {print $2' ./genome_size10_80_subsets.txt > trainingIDs_80.txt
awk '$1==2 {print $2' ./genome_size10_80_subsets.txt> testIDs_80.txt
seqkit grep -n -f trainingIDs_80.txt genome_size10.fasta > training_80.fasta
seqkit grep -n -f testIDs_80.txt genome_size10.fasta > test_80.fasta
# Run sinaps
usearch -sinaps ./test_relabeled_97.fasta -db ./training_relabeled_97.fasta -attr $attribute -tabbedout ./genome_size10_97_crossVal.txt -strand plus # for 97%
usearch -sinaps ./test_relabeled_95.fasta -db ./training_relabeled_95.fasta -attr $attribute -tabbedout ./genome_size10_95_crossVal.txt -strand plus # for 95%
usearch -sinaps ./test_relabeled_90.fasta -db ./training_relabeled_90.fasta -attr $attribute -tabbedout ./genome_size10_90_crossVal.txt -strand plus # for 90%
usearch -sinaps ./test_relabeled_85.fasta -db ./training_relabeled_85.fasta -attr $attribute -tabbedout ./genome_size10_85_crossVal.txt -strand plus g # for 85%
usearch -sinaps ./test_relabeled_80.fasta -db ./training_relabeled_80.fasta -attr $attribute -tabbedout ./genome_size10_80_crossVal.txt -strand plus # for 80%
Assess accuracy for different traits and intervals as a function of sequence identity between query and reference
library(grid)
library(reshape2)
library(egg)
## Loading required package: gridExtra
## Loading required package: ggplot2
library(RColorBrewer)
library(viridis)
## Loading required package: viridisLite
plot.theme1 <- theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "white",
colour = "black",
size = 0.5, linetype = "solid"),
panel.border= element_rect(fill=NA,size = 0.5, linetype = 'solid',colour = "black"),
axis.text.x = element_text(size=10),axis.text.y = element_text(size=10),legend.text = element_text(size=13),
axis.title = element_text(size=11),
legend.title = element_text(color = "black", size = 11),
strip.text.x = element_text(size=11),
strip.background = element_rect(colour="black", fill="white")
)
# vector with continuous and categorical traits
cont <- c("d1_lo5","d1_lo10","d1_lo20","d1_lo30", "d1_up5", "d1_up10","d1_up20","d1_up30","d2_lo5","d2_lo10","d2_lo20","d2_lo30",
"d2_up5","d2_up10","d2_up20","d2_up30","doubling_h5","doubling_h10","doubling_h20","doubling_h30","doubling_h40", "doubling_h50",
"genome_size5","genome_size10","genome_size20","optimum_ph5","optimum_ph10","optimum_ph20",
"optimum_tmp10","optimum_tmp20","rRNA16S_genes5","rRNA16S_genes10","rRNA16S_genes_exact_int"
)
cat <- c("cell_shape", "range_salinity", "range_tmp", "gram_stain" , "sporulation",
"metabolism","motility")
# import table with replacements to convert limits of intervals back to orignial value (were extended with cut to include border of interval)
Lim <- read.csv('../3_makeTraitSpecificDB/IntervalLimits_Reconversion.csv',
row.names = 1)
# use all intervals to determine rank of trait, also the ones that do not occur in test dataset
# for continuous traits
range_all <- list()
id_trait <- list()
for (i in cont){
basedir <- paste0('../3_makeTraitSpecificDB/',i,'/')
# loop over differnt identity cutoffs
id <- list()
for (j in as.character(c(97,95,90,85,80))){
id[[j]] <- read.csv(paste0(basedir,grep(paste0(j,'_crossVal.txt'),list.files(basedir),value = T)),sep = '\t', header = F)
rownames(id[[j]]) <- id[[j]][,1]
colnames(id[[j]]) <- c('query','prediction','bootstrp','strand')
#extract value of trait in query (true value)
id[[j]][,'trueValue'] <- unlist(lapply(strsplit(id[[j]][,'query'],"="),function(x)x[2]))
# replace upper and lower border of interval with original value
# trueValue
id[[j]][id[[j]]$trueValue==Lim[rownames(Lim)==i,'LI_cut'],'trueValue'] <- Lim[rownames(Lim)==i,'LI_or'] # lowest interval
id[[j]][id[[j]]$trueValue==Lim[rownames(Lim)==i,'HI_cut'],'trueValue'] <- Lim[rownames(Lim)==i,'HI_or'] # highest interval
# prediction
id[[j]][id[[j]]$prediction==Lim[rownames(Lim)==i,'LI_cut'],'prediction'] <- Lim[rownames(Lim)==i,'LI_or'] # lowest interval
id[[j]][id[[j]]$prediction==Lim[rownames(Lim)==i,'HI_cut'],'prediction'] <- Lim[rownames(Lim)==i,'HI_or'] # highest interval
# calculate difference based on mean of interval
# mean of each interval for prediction
lo <- as.numeric(ifelse(substr(id[[j]][,'prediction'],2,2)=='e'|substr(id[[j]][,'prediction'],1,1)=='-',
gsub('(-.*?)(-.*)','\\1', id[[j]][,'prediction']),gsub('-.*','', id[[j]][,'prediction'])))
up <- as.numeric(ifelse(substr(id[[j]][,'prediction'],2,2)=='e'|substr(id[[j]][,'prediction'],1,1)=='-',
gsub('(.*-.*?-)(.*)','\\2', id[[j]][,'prediction']),gsub('.*-','', id[[j]][,'prediction'])))
id[[j]][,'prediction_mean'] <- (lo+up)/2
# mean of each interval for true value
lo <- as.numeric(ifelse(substr(id[[j]][,'trueValue'],2,2)=='e'|substr(id[[j]][,'trueValue'],1,1)=='-',
gsub('(-.*?)(-.*)','\\1', id[[j]][,'trueValue']),gsub('-.*','', id[[j]][,'trueValue'])))
up <- as.numeric(ifelse(substr(id[[j]][,'trueValue'],2,2)=='e'|substr(id[[j]][,'trueValue'],1,1)=='-',
gsub('(.*-.*?-)(.*)','\\2', id[[j]][,'trueValue']),gsub('.*-','', id[[j]][,'trueValue'])))
id[[j]][,'trueValue_mean'] <- (lo+up)/2
# difference between mean predicted and mean true value
id[[j]][,'Diff_predVsTrueValue'] <- id[[j]][,'prediction_mean'] - id[[j]][,'trueValue_mean']
# add trait and % seqID
id[[j]]$ID <- rep(j,nrow(id[[j]]))
id[[j]]$trait <- rep(i,nrow(id[[j]]))
}
id_trait[[i]] <- id
}
id_trait_df <- list()
for (i in names(id_trait)){
id_trait_df[[i]] <- do.call(rbind,id_trait[[i]])
}
id_trait_df <- do.call(rbind, id_trait_df)
id_trait_df$trait2 <- gsub('[0-9]*$','',id_trait_df$trait)
id_trait_df$intervals <- gsub('.*_','',id_trait_df$trait)
id_trait_df$intervals <- gsub('[a-z]*','',id_trait_df$intervals)
id_trait_df[id_trait_df$trait=="rRNA16S_genes_exact_int",'intervals'] <- 'exact'
# combine 16S genes interval and exact
id_trait_df[id_trait_df$trait2=="rRNA16S_genes_exact_int",'trait2'] <- "rRNA16S_genes"
# change levels for intervals
id_trait_df$intervals <- factor(id_trait_df$intervals, levels = unique(id_trait_df$intervals))
# violin plots accuracy
# all samples scaled area
p <- ggplot(id_trait_df, aes(x = ID, y = Diff_predVsTrueValue, fill = ID)) + geom_violin() +
facet_grid(trait2~intervals, scales = 'free') + plot.theme1 +
labs(y = 'Predicted - true value', x = '% sequence identity') +
scale_fill_manual(values = brewer.pal(5,"Blues"))
p <- set_panel_size(p , width = unit(1.25, "inch"), height = unit(1, "inch"))
grid.arrange(p)
## for categorical variables
acc_all <- list()
for (i in cat){
basedir <- paste0('../3_makeTraitSpecificDB/',i,'/')
# loop over differnt identity cutoffs
id <- list()
plots <- list()
acc <- list()
for (j in as.character(c(97,95,90,85,80))){
id[[j]] <- read.csv(paste0(basedir,grep(paste0(j,'_crossVal.txt'),list.files(basedir),value = T)),sep = '\t', header = F)
rownames(id[[j]]) <- id[[j]][,1]
colnames(id[[j]]) <- c('query','prediction','bootstrp','strand')
#extract value of trait in query (true value)
id[[j]][,'trueValue'] <- unlist(lapply(strsplit(id[[j]][,'query'],"="),function(x)x[2]))
# column if prediction is correct
id[[j]][,'pred_eq_true'] <- id[[j]][,'trueValue']==id[[j]][,'prediction']
# Calculate number of correct and false predictions
right <- length(which(id[[j]][,'pred_eq_true']))
wrong <- length(which(!(id[[j]][,'pred_eq_true'])))
acc[[j]] <- data.frame(j, nrow(id[[j]]), right, wrong, right/nrow(id[[j]]),wrong/nrow(id[[j]]))
}
acc_df <- do.call(rbind,acc)
colnames(acc_df) <- c('%ID', 'n_comparisons', 'n_correctPred', 'n_wrongPred', 'n_correctPred_rel', 'n_wrongPred_rel')
acc_df[,'trait'] <- rep(i,5)
acc_all[[i]] <- acc_df
}
acc_all_df <- do.call(rbind,acc_all)
# plot
acc_all_df$`%ID` <- factor(acc_all_df$`%ID`, levels = c('97','95','90','85','80'))
ggplot(acc_all_df, aes(x = `%ID`, y = n_correctPred_rel )) + geom_bar(stat="identity") +
geom_text(aes(y = n_correctPred_rel - 0.1, label = n_comparisons), color = 'white') +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
facet_wrap(~trait,scales = 'fixed') +
ylab('Correct predictions (fraction)') +
plot.theme1