##example commands and scripts ##Scripts -------------------------------------------------- #selc.py #!/usr/bin/python3 # Finding_longest_sequence_from_fasta_file.py from Bio import SeqIO records=list(SeqIO.parse("length_test.fasta", "fasta")) records.sort(key=lambda r: -len(r)) print(records[0].format("fasta")) -------------------------------------------------- #apc_mod.pl #!/usr/bin/env perl # AUTHOR: Joseph Fass # # The Bioinformatics Core at UC Davis Genome Center # http://bioinformatics.ucdavis.edu # Copyright (c) 2015 The Regents of The University of California, Davis Campus. # All rights reserved. use Getopt::Std; my $usage = "\nusage: $0 [options] \n\n". "Have you assembled [a] [p]erfect [c]ircle (e.g. plasmid, bacterial chromosome)? ". "This script tests each sequence in the supplied fasta file for self-overlap. ". "If overlap is found, the 5' copy of the overlapping sequence is trimmed, ". "the ends joined, and the join moved (\"permuted\") to the middle of the output sequence. ". "Join location is appended to the sequence header, to allow confirmation. ". "If a multi-fasta file is provided, each sequence is tested for circularity ". "and output separately. ". "If reads are provided (e.g. PacBio corrected.fastq), ". "BWA MEM is used to align reads to spliced sequence, for validation.\n". "The binaries \'lastdb\', \'lastal\', and (with -r) ". "\'bwa\' and \'samtools\' must be in your PATH.\n". "\n". "-b \tbasename for output files (alphanumeric and underscore characters)\n". "-r \thigh accuracy long reads to use for validation\n". "\n"; our($opt_b,$opt_r); getopts('r:b:') or die $usage; if (!(defined($opt_b)) or !($opt_b=~m/^\w+$/)) {$opt_b="permuted"} $fname = $opt_b; # works better in interpreted quotes my $assemblyfile = shift or die $usage; # pull in (single-line) assembled molecules open ASM, "<$assemblyfile"; while ($line = ) { if ($line =~ m/^>/) { push @contigs, $block if !($block eq ''); # push 2-line contig onto list of contigs $block = $line."\n"; # adds another newline, because it'll be chomped below } else { chomp $block; # remove newline before adding more sequence chomp $line; # remove newline on current (sequence) line $block .= $line."\n"; # block always ends in a single newline } } push @contigs, $block; # push last sequence block onto list close ASM; # loop through each contig, perform LAST self-alignment, trim, output open LOG, ">apc.log"; $n = 0; # keeps track of sequence count while (@contigs) { #print LOG "contig\t"; # debug #print LOG $#contigs + 1; # debug #print LOG "\n"; # debug $block = shift(@contigs); $n++; open SEQ, ">temp.apc.fa"; print SEQ $block; close SEQ; print LOG "formatting LAST db ... "; $command = "lastdb temp.apc temp.apc.fa"; # format lastdb for current contig system($command); print LOG "running LAST self-alignment ... "; $command = "lastal -s 1 -x 300 -f 0 -T 1 temp.apc temp.apc.fa > temp.apc.lastal"; # self align current contig system($command); print LOG "done\n"; # pull in output of current contig's LAST self-alignment open LAST, ") { if (!($line =~ m/^#/)) { push @alignment, $line; } } close LAST; # process alignment lines; should be three, with first being full self-alignment # second and third lines should be near identical end-overlaps if ($#alignment + 1 == 3) { # check that first is full self alignment # print LOG "three alignments in LAST output!\n"; # debug # check that 2nd and 3rd are near identical overlaps @overlap5prime = split("\t", $alignment[1]); @overlap3prime = split("\t", $alignment[2]); # print LOG $overlap5prime[0]."\n"; # debug # print LOG $overlap3prime[0]."\n"; # debug if ($overlap5prime[0] == $overlap3prime[0]) { $start5prime = $overlap5prime[2] + 1; # 0-based! $end5prime = $start5prime + $overlap5prime[3] - 1; $start3prime = $overlap3prime[2] + 1; # 0-based! $end3prime = $start3prime + $overlap3prime[3] - 1; print LOG "Overlap detected between "; print LOG $overlap5prime[1].":".$start5prime."-".$end5prime; print LOG " and "; print LOG $overlap3prime[1].":".$start3prime."-".$end3prime."\n"; # trim one overlap, then permute halfway $block =~ m/^(.*\n)(.*)$/; $header = $1; chomp $header; $sequence = $2; chomp $sequence; $trimmedSeq = substr($sequence, $overlap5prime[3]); # 0-, 1-based trickery $mid = int( length($trimmedSeq) / 2 ); # midpoint, to permute around $newSeq = substr($trimmedSeq, $mid) . substr($trimmedSeq, 0, $mid) . "\n"; open OUT, ">$fname.$n.fa"; chomp $block; print OUT $header."|join_near_$mid\n"; print OUT $newSeq."\n"; close OUT; system("mv -- $assemblyfile overlapped_seq.txt"); if (defined($opt_r)) { # use bwa mem to align reads to permuted sequence $command = "bwa index $fname.$n.fa 2> $fname.$n.aln.log"; print LOG "indexing permuted sequence ... "; system($command); $command = "bwa mem -M $fname.$n.fa $opt_r 2>> $fname.$n.aln.log | gzip > $fname.$n.sam.gz"; print LOG "aligning reads to permuted sequence ... "; system($command); $command = "samtools view -uhS $fname.$n.sam.gz 2>> $fname.$n.aln.log | samtools sort - $fname.$n 2>> $fname.$n.aln.log"; print LOG "sorting BAM file of aligned reads ... "; system($command); $command = "samtools index $fname.$n.bam 2>> $fname.$n.aln.log"; print LOG "indexing BAM file ... "; system($command); print LOG "done\n"; } } else { system("mv temp.apc.lastal apc_aln_$n.txt"); print LOG "irregular LAST results ... check local file apc_aln_$n.txt\n"; } } else { system("mv temp.apc.lastal apc_aln_$n.txt"); print LOG "irregular LAST results ... check local file apc_aln_$n.txt\n"; } } close LOG; # clean up temp files! system("rm temp.apc*"); -------------------------------------------------- #splitgbk.py #!/usr/bin/python3 #Usage: splitgbk.py foo.gbk #Produces individual gbk files from multigbk import os import sys from Bio import SeqIO import sys mgbk_filename = sys.argv[1] for rec in SeqIO.parse(mgbk_filename, "genbank"): SeqIO.write([rec], open(rec.id + ".gbk", "w"), "genbank") -------------------------------------------------- #gkb_to_faa.py #!/usr/bin/python3 #Usage: converter.py foo.gbk #Produces a converted file: foo_converted.faa import os import sys from Bio import GenBank from Bio import SeqIO gbk_filename = sys.argv[1] root_name = os.path.splitext(gbk_filename)[0] faa_filename = root_name + "_protein.faa" spacer= ";" #specify species name as 'organism' variable fh = open(gbk_filename) for gb_record in SeqIO.parse (fh, 'genbank'): organism = gb_record.annotations['organism'] # stores all the CDS entries all_entries = [] with open(gbk_filename, 'r') as GBFile: GBcds = SeqIO.InsdcIO.GenBankCdsFeatureIterator(GBFile) for cds in GBcds: if cds.seq is not None: cds.description = spacer + cds.annotations['gene'] cds.id = organism all_entries.append(cds) # write file SeqIO.write(all_entries, '{}faa'.format(gbk_filename[:-3]), 'fasta') -------------------------------------------------- ##Commands #ONT basecalling with GPU (Guppy v5.0.7) guppy_basecaller --input_path path/to/fast5/data --save_path path/to/fastq/output/ --config dna_r10.3_450bps_sup.cfg -x 'auto' -r ##demux with guppy (Guppy v5.0.7) guppy_barcoder -i path/to/fastq/output/pass/ -s path/to/fastq/output/demux -x 'auto' --barcode_kits SQK-RBK110-96 #or SQK-RBK004# --detect_mid_strand_barcodes --trim_barcodes #cat all fastq files in each barcode subdirectory for dir in path/to/fastq/output/demux/*;do cd "$dir"; files=(*.fastq); cat "${files[@]}" > "${PWD##*/}.fq"; cd ..; done #use nanofilt to filter out reads below q10, reads above 17500bp and below 500bp (NanoFilt v2.7.0) for dir in path/to/fastq/output/demux/*;do cd "$dir"; files=(*.fq); cat "${files[@]}" | NanoFilt -q 10 --maxlength 17500 | NanoFilt -l 500 | gzip > "${PWD##*/}_filt.fq.gz"; cd ..; done #Summary statistics for filtered reads using NanoStat (v1.2.0) for dir in path/to/fastq/output/demux/*; do cd "$dir"; files=(*_filt.fq.gz); NanoStat --fastq "${files[@]}" -n "${PWD##*/}" --outdir "${PWD##*/}_nanostat"; cd ..; done #run flye --meta assembly (Flye v2.8.3-b1695 or v2.9-b1774) #first attempt with Flye v2.8.3-b1695 for dir in path/to/fastq/output/demux/*;do cd "$dir"; files=(*_filt.fq.gz); flye --nano-raw "${files[@]}" --threads 35 --meta -o path/to/fastq/output/demux/assembly/"${PWD##*/}_flye_meta"; done #if an acceptable assembly was not generated, these flye parameters were used with Flye v2.8.3-b1695 flye --nano-raw --threads 8 --meta -o path/to/fastq/output/demux/assembly/"${PWD##*/}_flye_meta ##we've noticed that the number of threads can influence output #if no acceptable assembly, Flye v2.9-b1774 was used with the following parameters flye --nano-hq --threads 35 --meta -o path/to/fastq/output/demux/assembly/"${PWD##*/}_flye_meta #if no acceptable assembly, Flye v2.9-b1774 was used with the following parameters flye --nano-raw --threads 35 --meta -o path/to/fastq/output/demux/assembly/"${PWD##*/}_flye_meta #sort assembly using selc.py which returns longest contig and then run apc_mod script on output contigs which changes the original contig.fa file to overlapped_contig.txt if it detects overlap for dir in path/to/fastq/output/demux/assembly/*; do cd "$dir"; files=(contig.fa); selc.py > contig.fa; apc_mod.pl -b contig "${files[@]}"; done #polish apc output with flye using same version of flye as was used for assembly for dir in path/to/fastq/output/demux/assembly/*; do cd "$dir"; files1=(*_filt.fq.gz); files2=(contig.*); flye --nano-raw "${files1[@]}" --polish-target "${files2[@]}" --threads 20 --iterations 3 -o path/to/fastq/output/demux/assembly_polished/"${PWD##*/}_polished"; done #medaka polish (Medaka v1.4.3), copy filtered reads file into the respective assembly_polished subdirectory mkdir path/to/fastq/output/demux/assembly_polished_medaka/ for dir in path/to/fastq/output/demux/assembly_polished/*; do cd "$dir"; files1=(*_filt.fq.gz); files2=(polished_3.fasta); medaka_consensus -i "${files1[@]}" -d "${files2[@]}" -o path/to/fastq/output/demux/assembly_polished_medaka/"${PWD##*/}_medaka" -t 2 -m r103_sup_g507; done; >>>manually named the polished mitogenome sequences after their filename and indicate circular with topology=circular after the contig name<<< #reorient the mitogenome to start on trnM using BLAST of species-specific or related species trnM nucleotide sequence #Annotation #be sure to be in the directory with the mitogenome fasta #annotate fastmitos mitogenomes source activate mitoz (MitoZ v2.3) for dir in path/to/fastq/output/demux/assembly_polished_medaka/*; do cd "$dir"; files=(*.fasta); docker run -v $PWD:$PWD -w $PWD --rm guanliangmeng/mitoz:2.3 /app/release_MitoZ_v2.3/MitoZ.py annotate --genetic_code 5 --clade Arthropoda --outprefix "${PWD##*/}"_mitoz --thread_number 8 --fastafile "${files[@]}"; done; ##Geneious outputs a tar file of a batch .asn sequin-like file using the Submit to GenBank Tool, you need to untar this which yields a generically named genbank.asn file tar -xvf #then convert .asn to .gbk using asn2gb asn2gb -i genbank.asn -o #split mutligbk into single gbk splitgbk.py #then extract all protein coding gene amino acid sequences and concatenate the file for file in *.gbk; do gbk_to_faa.py "$file"; done; cat *.faa > total_protein.faa #linearize fasta (Seqtk-1.3 (r106)) seqtk seq > #separate out the .faa files for the 10AA protein coding gene phylogenetic analysis scheme, then move these files to new directory grep --no-group-separator -i -h -w -A 1 'ND2' total_protein.faa > nd2_aa.faa grep --no-group-separator -i -h -w -A 1 'COX1' total_protein.faa > cox1_aa.faa grep --no-group-separator -i -h -w -A 1 'COX2' total_protein.faa > cox2_aa.faa grep --no-group-separator -i -h -w -A 1 'ATP6' total_protein.faa > atp6_aa.faa grep --no-group-separator -i -h -w -A 1 'COX3' total_protein.faa > cox3_aa.faa grep --no-group-separator -i -h -w -A 1 'ND3' total_protein.faa > nd3_aa.faa grep --no-group-separator -i -h -w -A 1 'ND5' total_protein.faa > nd5_aa.faa grep --no-group-separator -i -h -w -A 1 'ND4' total_protein.faa > nd4_aa.faa grep --no-group-separator -i -h -w -A 1 'CYTB' total_protein.faa > cytb_aa.faa grep --no-group-separator -i -h -w -A 1 'ND1' total_protein.faa > nd1_aa.faa grep --no-group-separator -i -h -w -A 1 'ND4L' total_protein.faa > nd4l_aa.faa grep --no-group-separator -i -h -w -A 1 'ATP8' total_protein.faa > atp8_aa.faa grep --no-group-separator -i -h -w -A 1 'ND6' total_protein.faa > nd6_aa.faa mkdir 10AA_scheme mv *_aa.faa > 10AA_scheme/ #in 10AA_scheme directory, mafft alignment for protein coding genes (G-INS-i parameters) (MAFFT v7.486) for i in *.faa; do mafft --maxiterate 1000 --globalpair --thread 12 $i >$i.aln; done #GBLOCKS (v0.91b) GBLOCKS was used with each alignment for each protein. The default settings were used except for "5. Allowed Gap Positions", which was set to "With Half". Gblocks output was put in a new directory 'gblocks'. #mitophylogenomics (IQTREE2 v2.1.2), maximum-likelihood tree estimation with 100k ultrafast bootstrap replicates and modelfinder plus options iqtree2 -p 10AA_scheme/gblocks/ -m MFP --prefix final_mitophylogenomic_analysis_100k -B 100000 -T AUTO ##benchmarking (NucDiff v2.0.3), the ref.fa file for benchmarking was the Illumina reference, the assembly.fa file was output from the ONT strategy nucdiff ref.fa assembly.fa --vcf ##calculate coverage (Minimap2 v2.22-r1101, Samtools v1.11, mosdepth v0.3.1) minimap2 -ax map-ont assembly.fa | samtools sort -o sorted.bam samtools index sorted.bam mosdepth sorted.bam