##example commands and scripts

##Scripts
--------------------------------------------------
#selc.py
#!/usr/bin/python3
# Finding_longest_sequence_from_fasta_file.py
from Bio import SeqIO
records=list(SeqIO.parse("length_test.fasta", "fasta"))
records.sort(key=lambda r: -len(r))
print(records[0].format("fasta"))
--------------------------------------------------
#apc_mod.pl
#!/usr/bin/env perl

# AUTHOR: Joseph Fass
# 
# The Bioinformatics Core at UC Davis Genome Center
# http://bioinformatics.ucdavis.edu
# Copyright (c) 2015 The Regents of The University of California, Davis Campus.
# All rights reserved.

use Getopt::Std;

my $usage = "\nusage: $0 [options] <fasta>\n\n".
            "Have you assembled [a] [p]erfect [c]ircle (e.g. plasmid, bacterial chromosome)? ".
            "This script tests each sequence in the supplied fasta file for self-overlap. ".
            "If overlap is found, the 5' copy of the overlapping sequence is trimmed, ".
            "the ends joined, and the join moved (\"permuted\") to the middle of the output sequence. ".
            "Join location is appended to the sequence header, to allow confirmation. ".
            "If a multi-fasta file is provided, each sequence is tested for circularity ".
            "and output separately. ".
            "If reads are provided (e.g. PacBio corrected.fastq), ".
            "BWA MEM is used to align reads to spliced sequence, for validation.\n".
            "The binaries \'lastdb\', \'lastal\', and (with -r) ".
            "\'bwa\' and \'samtools\' must be in your PATH.\n".
            "\n".
            "-b <text>\tbasename for output files (alphanumeric and underscore characters)\n".
            "-r <fastq>\thigh accuracy long reads to use for validation\n".
            "\n";
our($opt_b,$opt_r);
getopts('r:b:') or die $usage;
if (!(defined($opt_b)) or !($opt_b=~m/^\w+$/)) {$opt_b="permuted"} 
$fname = $opt_b;  # works better in interpreted quotes
my $assemblyfile = shift or die $usage;

# pull in (single-line) assembled molecules
open ASM, "<$assemblyfile";
while ($line = <ASM>) {
    if ($line =~ m/^>/) {
	push @contigs, $block if !($block eq '');  # push 2-line contig onto list of contigs
	$block = $line."\n";  # adds another newline, because it'll be chomped below
    }
    else {
	chomp $block;  # remove newline before adding more sequence
	chomp $line;  # remove newline on current (sequence) line
	$block .= $line."\n";  # block always ends in a single newline
    }
     }
push @contigs, $block;  # push last sequence block onto list
close ASM;

# loop through each contig, perform LAST self-alignment, trim, output
open LOG, ">apc.log";
$n = 0;  # keeps track of sequence count
while (@contigs) {
    #print LOG "contig\t";     # debug
    #print LOG $#contigs + 1;  # debug
    #print LOG "\n";           # debug
    $block = shift(@contigs);
    $n++;
    open SEQ, ">temp.apc.fa";
    print SEQ $block;
    close SEQ;
    print LOG "formatting LAST db ... ";
    $command = "lastdb temp.apc temp.apc.fa";  # format lastdb for current contig
    system($command);
    print LOG "running LAST self-alignment ... ";
    $command = "lastal -s 1 -x 300 -f 0 -T 1 temp.apc temp.apc.fa > temp.apc.lastal";  # self align current contig
    system($command);
    print LOG "done\n";
    # pull in output of current contig's LAST self-alignment
    open LAST, "<temp.apc.lastal";
    undef(@alignment);
    while ($line = <LAST>) {
	if (!($line =~ m/^#/)) {
	    push @alignment, $line;
	}
    }
    close LAST;
    # process alignment lines; should be three, with first being full self-alignment
    # second and third lines should be near identical end-overlaps
    if ($#alignment + 1 == 3) {
	# check that first is full self alignment
	# print LOG "three alignments in LAST output!\n";  # debug
	# check that 2nd and 3rd are near identical overlaps
	@overlap5prime = split("\t", $alignment[1]);
	@overlap3prime = split("\t", $alignment[2]);
	# print LOG $overlap5prime[0]."\n";  # debug
	# print LOG $overlap3prime[0]."\n";  # debug
	if ($overlap5prime[0] == $overlap3prime[0]) {
	    $start5prime = $overlap5prime[2] + 1;  # 0-based!
	    $end5prime = $start5prime + $overlap5prime[3] - 1;
	    $start3prime = $overlap3prime[2] + 1;  # 0-based!
	    $end3prime = $start3prime + $overlap3prime[3] - 1;
	    print LOG "Overlap detected between  ";
	    print LOG $overlap5prime[1].":".$start5prime."-".$end5prime;
	    print LOG "  and  ";
	    print LOG $overlap3prime[1].":".$start3prime."-".$end3prime."\n";
	    # trim one overlap, then permute halfway
	    $block =~ m/^(.*\n)(.*)$/;
	    $header = $1;
	    chomp $header;
	    $sequence = $2;
	    chomp $sequence;
	    $trimmedSeq = substr($sequence, $overlap5prime[3]);  # 0-, 1-based trickery
	    $mid = int( length($trimmedSeq) / 2 );  # midpoint, to permute around
	    $newSeq = substr($trimmedSeq, $mid) . substr($trimmedSeq, 0, $mid) . "\n";
	    open OUT, ">$fname.$n.fa";
	    chomp $block;
	    print OUT $header."|join_near_$mid\n";
	    print OUT $newSeq."\n";
	    close OUT;
	    system("mv -- $assemblyfile overlapped_seq.txt"); 
	    if (defined($opt_r)) {
		# use bwa mem to align reads to permuted sequence
		$command = "bwa index $fname.$n.fa 2> $fname.$n.aln.log";
		print LOG "indexing permuted sequence ... ";
		system($command);
		$command = "bwa mem -M $fname.$n.fa $opt_r 2>> $fname.$n.aln.log | gzip > $fname.$n.sam.gz";
		print LOG "aligning reads to permuted sequence ... ";
		system($command);
		$command = "samtools view -uhS $fname.$n.sam.gz 2>> $fname.$n.aln.log | samtools sort - $fname.$n 2>> $fname.$n.aln.log";
		print LOG "sorting BAM file of aligned reads ... ";
		system($command);
		$command = "samtools index $fname.$n.bam 2>> $fname.$n.aln.log";
		print LOG "indexing BAM file ... ";
		system($command);
		print LOG "done\n";
	    }
	}
	else {
	    system("mv temp.apc.lastal apc_aln_$n.txt");
	    print LOG "irregular LAST results ... check local file apc_aln_$n.txt\n";
	}
    }
    else {
	system("mv temp.apc.lastal apc_aln_$n.txt");
	print LOG "irregular LAST results ... check local file apc_aln_$n.txt\n";
    }
}
close LOG;

# clean up temp files!
system("rm temp.apc*");

--------------------------------------------------
#splitgbk.py
#!/usr/bin/python3

#Usage: splitgbk.py foo.gbk

#Produces individual gbk files from multigbk
import os
import sys
from Bio import SeqIO
import sys

mgbk_filename = sys.argv[1]

for rec in SeqIO.parse(mgbk_filename, "genbank"):
   SeqIO.write([rec], open(rec.id + ".gbk", "w"), "genbank")

--------------------------------------------------
#gkb_to_faa.py
#!/usr/bin/python3

#Usage: converter.py foo.gbk

#Produces a converted file: foo_converted.faa
import os
import sys
from Bio import GenBank
from Bio import SeqIO

gbk_filename = sys.argv[1]
root_name = os.path.splitext(gbk_filename)[0]
faa_filename = root_name + "_protein.faa"
spacer= ";"
#specify species name as 'organism' variable
fh = open(gbk_filename)
for gb_record in SeqIO.parse (fh, 'genbank'):
    organism = gb_record.annotations['organism']
# stores all the CDS entries

all_entries = []
with open(gbk_filename, 'r') as GBFile:
    GBcds = SeqIO.InsdcIO.GenBankCdsFeatureIterator(GBFile)

    for cds in GBcds:
        if cds.seq is not None:
            cds.description = spacer + cds.annotations['gene']
            cds.id = organism
            all_entries.append(cds)


# write file
SeqIO.write(all_entries, '{}faa'.format(gbk_filename[:-3]), 'fasta')
--------------------------------------------------
##Commands 

#ONT basecalling with GPU (Guppy v5.0.7)

guppy_basecaller --input_path path/to/fast5/data  --save_path path/to/fastq/output/ --config dna_r10.3_450bps_sup.cfg -x 'auto' -r

##demux with guppy (Guppy v5.0.7)
guppy_barcoder -i path/to/fastq/output/pass/ -s path/to/fastq/output/demux -x 'auto' --barcode_kits SQK-RBK110-96 #or SQK-RBK004# --detect_mid_strand_barcodes --trim_barcodes

#cat all fastq files in each barcode subdirectory
for dir in path/to/fastq/output/demux/*;do cd "$dir"; files=(*.fastq); cat "${files[@]}" > "${PWD##*/}.fq"; cd ..; done

#use nanofilt to filter out reads below q10, reads above 17500bp and below 500bp (NanoFilt v2.7.0)
for dir in path/to/fastq/output/demux/*;do cd "$dir"; files=(*.fq); cat "${files[@]}" | NanoFilt -q 10 --maxlength 17500 | NanoFilt -l 500 | gzip > "${PWD##*/}_filt.fq.gz"; cd ..; done

#Summary statistics for filtered reads using NanoStat (v1.2.0)
for dir in path/to/fastq/output/demux/*; do cd "$dir"; files=(*_filt.fq.gz); NanoStat --fastq "${files[@]}" -n "${PWD##*/}" --outdir "${PWD##*/}_nanostat"; cd ..; done 

#run flye --meta assembly (Flye v2.8.3-b1695 or v2.9-b1774)
#first attempt with Flye v2.8.3-b1695
for dir in path/to/fastq/output/demux/*;do cd "$dir"; files=(*_filt.fq.gz); flye --nano-raw "${files[@]}" --threads 35 --meta -o path/to/fastq/output/demux/assembly/"${PWD##*/}_flye_meta"; done
#if an acceptable assembly was not generated, these flye parameters were used with Flye v2.8.3-b1695
flye --nano-raw <filtered reads> --threads 8 --meta -o path/to/fastq/output/demux/assembly/"${PWD##*/}_flye_meta ##we've noticed that the number of threads can influence output
#if no acceptable assembly, Flye v2.9-b1774 was used with the following parameters
flye --nano-hq <filtered reads> --threads 35 --meta -o path/to/fastq/output/demux/assembly/"${PWD##*/}_flye_meta 
#if no acceptable assembly, Flye v2.9-b1774 was used with the following parameters
flye --nano-raw <filtered reads> --threads 35 --meta -o path/to/fastq/output/demux/assembly/"${PWD##*/}_flye_meta 

#sort assembly using selc.py which returns longest contig and then run apc_mod script on output contigs which changes the original contig.fa file to overlapped_contig.txt if it detects overlap
for dir in path/to/fastq/output/demux/assembly/*; do cd "$dir"; files=(contig.fa); selc.py > contig.fa; apc_mod.pl -b contig "${files[@]}"; done

#polish apc output with flye using same version of flye as was used for assembly
for dir in path/to/fastq/output/demux/assembly/*; do cd "$dir"; files1=(*_filt.fq.gz); files2=(contig.*); flye --nano-raw "${files1[@]}" --polish-target "${files2[@]}" --threads 20 --iterations 3 -o path/to/fastq/output/demux/assembly_polished/"${PWD##*/}_polished"; done

#medaka polish (Medaka v1.4.3), copy filtered reads file into the respective assembly_polished subdirectory
mkdir path/to/fastq/output/demux/assembly_polished_medaka/

for dir in path/to/fastq/output/demux/assembly_polished/*; do cd "$dir"; files1=(*_filt.fq.gz); files2=(polished_3.fasta); medaka_consensus -i "${files1[@]}" -d "${files2[@]}" -o path/to/fastq/output/demux/assembly_polished_medaka/"${PWD##*/}_medaka" -t 2 -m r103_sup_g507; done;

>>>manually named the polished mitogenome sequences after their filename and indicate circular with topology=circular after the contig name<<<

#reorient the mitogenome to start on trnM using BLAST of species-specific or related species trnM nucleotide sequence

#Annotation

#be sure to be in the directory with the mitogenome fasta

#annotate fastmitos mitogenomes

source activate mitoz (MitoZ v2.3)
for dir in path/to/fastq/output/demux/assembly_polished_medaka/*; do cd "$dir"; files=(*.fasta); docker run -v $PWD:$PWD -w $PWD --rm guanliangmeng/mitoz:2.3 /app/release_MitoZ_v2.3/MitoZ.py annotate --genetic_code 5 --clade Arthropoda --outprefix "${PWD##*/}"_mitoz --thread_number 8 --fastafile "${files[@]}"; done;

##Geneious outputs a tar file of a batch .asn sequin-like file using the Submit to GenBank Tool, you need to untar this which yields a generically named genbank.asn file
tar -xvf <tar file from Geneious>

#then convert .asn to .gbk using asn2gb 
asn2gb -i genbank.asn -o <output.gbk>

#split mutligbk into single gbk
splitgbk.py <input_multigbk>

#then extract all protein coding gene amino acid sequences and concatenate the file
for file in *.gbk; do gbk_to_faa.py "$file"; done;

cat *.faa > total_protein.faa

#linearize fasta (Seqtk-1.3 (r106))
seqtk seq <input> > <output>

#separate out the .faa files for the 10AA protein coding gene phylogenetic analysis scheme, then move these files to new directory 
grep --no-group-separator -i -h -w -A 1 'ND2' total_protein.faa > nd2_aa.faa
grep --no-group-separator -i -h -w -A 1 'COX1' total_protein.faa > cox1_aa.faa
grep --no-group-separator -i -h -w -A 1 'COX2' total_protein.faa > cox2_aa.faa
grep --no-group-separator -i -h -w -A 1 'ATP6' total_protein.faa > atp6_aa.faa
grep --no-group-separator -i -h -w -A 1 'COX3' total_protein.faa > cox3_aa.faa
grep --no-group-separator -i -h -w -A 1 'ND3' total_protein.faa > nd3_aa.faa
grep --no-group-separator -i -h -w -A 1 'ND5' total_protein.faa > nd5_aa.faa
grep --no-group-separator -i -h -w -A 1 'ND4' total_protein.faa > nd4_aa.faa
grep --no-group-separator -i -h -w -A 1 'CYTB' total_protein.faa > cytb_aa.faa
grep --no-group-separator -i -h -w -A 1 'ND1' total_protein.faa > nd1_aa.faa
grep --no-group-separator -i -h -w -A 1 'ND4L' total_protein.faa > nd4l_aa.faa
grep --no-group-separator -i -h -w -A 1 'ATP8' total_protein.faa > atp8_aa.faa
grep --no-group-separator -i -h -w -A 1 'ND6' total_protein.faa > nd6_aa.faa

mkdir 10AA_scheme
mv *_aa.faa > 10AA_scheme/

#in 10AA_scheme directory, mafft alignment for protein coding genes (G-INS-i parameters) (MAFFT v7.486)
for i in *.faa; do mafft --maxiterate 1000 --globalpair --thread 12 $i >$i.aln; done

#GBLOCKS (v0.91b)

GBLOCKS was used with each alignment for each protein. The default settings were used except for "5. Allowed Gap Positions", which was set to "With Half". Gblocks output was put in a new directory 'gblocks'.  

#mitophylogenomics (IQTREE2 v2.1.2), maximum-likelihood tree estimation with 100k ultrafast bootstrap replicates and modelfinder plus options

iqtree2 -p 10AA_scheme/gblocks/ -m MFP --prefix final_mitophylogenomic_analysis_100k -B 100000 -T AUTO

##benchmarking (NucDiff v2.0.3), the ref.fa file for benchmarking was the Illumina reference, the assembly.fa file was output from the ONT strategy

nucdiff ref.fa assembly.fa <outdir> <prefix> --vcf

##calculate coverage (Minimap2 v2.22-r1101, Samtools v1.11, mosdepth v0.3.1)

minimap2 -ax map-ont assembly.fa <filtered reads> | samtools sort -o sorted.bam

samtools index sorted.bam

mosdepth <prefix> sorted.bam