Figures for potato paper
1 Introduction
This document contains all of the code needed to generate the figures for the potato paper from the raw data. It can be executed with org-mode to generate the figures automatically. The following org-mode settings are required:
;;; use bash to evaluate shell code
(setq org-babel-sh-command "bash")
;;; export with CSS classes instead of explicit colours
;;; this uses the included CSS for syntax highlighting
;;; instead of your own emacs colours
(setq org-html-htmlize-output-type 'css)
(setq org-html-htmlize-font-prefix "org-")
;;; the same but with bootstrap export, requires `ox-twbs'
(setq org-twbs-htmlize-output-type 'css)
(setq org-twbs-htmlize-font-prefix "org-")
The code in this document will read input data from the data directory, make intermediate files in an output directory, and put the final figures in a figures directory.
#!/bin/bash
mkdir -p output figures
2 Cumulative content
These plots show the cumulative content of the assemblies as successively smaller contigs are added in. The table gives the names of the assemblies and the filenames we use for each. Much of the code in this section uses this table: it is read into the code blocks line by line, with each row split into the name and filename variables (see the sketch after the table).
| name                         | filename             |
|------------------------------|----------------------|
| Supernova + BioNano          | 10x-bn               |
| Supernova                    | 10x                  |
| Discovar                     | discovar-contig      |
| Discovar + MP + DT + BioNano | discovar-mp-dt-bn    |
| Discovar + MP + Dovetail     | discovar-mp-dt       |
| Discovar + MP + DT + PBJelly | discovar-mp-dt-jelly |
| Discovar + MP + BioNano      | discovar-mp-bn       |
| Discovar + MP                | discovar-mp          |
| Falcon + BioNano             | falcon-bn            |
| Falcon + DT + BioNano        | falcon-dt-bn         |
| Falcon + Dovetail            | falcon-dt            |
| Falcon                       | falcon               |
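The assembly table reaches the shell blocks as the $table variable; the IFS=',' in those blocks implies that org-babel supplies it with one row per line and comma-separated fields. A standalone stand-in for two of the assemblies (assuming that is how the original header arguments present the table) looks like this:

#!/bin/bash
# stand-in for the org-babel table variable: one "name,filename" pair per line
table='Supernova + BioNano,10x-bn
Discovar + MP + DT + BioNano,discovar-mp-dt-bn'

IFS=','
while read name filename; do
    echo "name=${name} filename=${filename}"
done <<< "$table"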
2.1 Code
First we calculate the lengths of the contigs/scaffolds in each assembly and sort in descending order of size.
#!/bin/bash
IFS=','
while read name filename; do
    if [[ ! -s output/${filename}.length ]]; then
        echo "Calculating lengths of ${filename}..."
        gzip -cd data/${filename}.fasta.gz \
            | awk '
                /^>/ {
                    if (len) { print len, name }
                    split($0,s," ")
                    name=substr(s[1],2)
                    len=0
                    next
                }
                { len += length($0) }
                END { if (len) { print len, name } }
            ' \
            | sort -nr -k1,1 \
            > output/${filename}.length
    fi
done <<< "$table"
Now we calculate the data for the plot: for each minimum contig length considered, the total length of all contigs of that size or bigger.
#!/bin/bash
IFS=','
while read name filename; do
    if [[ ! -s output/${filename}.minlen-cumulative ]]; then
        echo "Calculating cumulative lengths of ${filename}..."
        awk '
            BEGIN {
                OFS="\t"
                last = 0
                sum = 0
            }
            {
                if ($1 != last && last != 0) { print last, sum }
                last = $1
                sum += $1
            }
            # also print the final (smallest) length class
            END {
                if (last != 0) { print last, sum }
            }
        ' \
            output/${filename}.length \
            > output/${filename}.minlen-cumulative
    fi
done <<< "$table"
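As a quick sanity check of the logic, here is the same awk program run on a hypothetical four-contig assembly; each output row is a minimum length and the total length of all contigs at least that long:

#!/bin/bash
# toy input: contig lengths already sorted in descending order
printf '500 a\n300 b\n300 c\n100 d\n' \
    | awk '
        BEGIN { OFS="\t"; last = 0; sum = 0 }
        {
            if ($1 != last && last != 0) { print last, sum }
            last = $1
            sum += $1
        }
        END { if (last != 0) { print last, sum } }
    '
# expected output:
#   500  500
#   300  1100
#   100  1200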
Now we prepare the table for ggplot by concatenating the minlen-cumulative tables and adding the assembly filename and display name as the third and fourth columns.
#!/bin/bash
IFS=','
while read name filename; do
    awk -v OFS=',' -v fname="$filename" -v name="$name" \
        '{ print $1,$2,fname,name }' \
        output/${filename}.minlen-cumulative
done <<< "$table" \
    > output/all-assemblies.minlen-cumulative
Now we can plot these lines using ggplot:
#!/usr/bin/env Rscript library("ggplot2") library("grid") library("scales") reverselog_trans <- function(base = exp(1)) { trans <- function(x) -log(x, base) inv <- function(x) base^(-x) trans_new(paste0("reverselog-", format(base)), trans, inv, log_breaks(base = base), domain = c(1e-100, Inf)) } input <- read.table("output/all-assemblies.minlen-cumulative", sep=",", col.names=c("minlen", "cumulative", "fname", "name")) ## "bigcomp" with the large assemblies bigcomp <- subset(input, fname %in% c("10x-bn", "falcon-dt-bn", "discovar-mp-dt-bn")) bigcomp$name <- factor(bigcomp$name, levels=c("Supernova + BioNano", "Falcon + DT + BioNano", "Discovar + MP + DT + BioNano")) bigcompplot <- ggplot(bigcomp, aes(x=minlen, y=cumulative, colour=name)) + scale_x_continuous(trans=reverselog_trans(10), breaks=c(10000000,1000000,100000,10000,1000), labels=c("10Mbp", "1MBp", "100kbp", "10kbp", "1kbp")) + scale_y_log10(limits=c(400000,800000000), breaks=c(800000000,100000000,10000000,1000000), labels=c("800Mbp", "100Mbp", "10Mbp", "1Mbp")) + xlab("Minimal scaffold length") + ylab("Covered assembly length\n") + ggtitle("Comparison of potato assemblies") + labs(colour="Assembly") + geom_line() + theme(legend.justification=c(1,0),legend.position=c(1,0)) ## discovar comp dvcomp <- subset(input, fname %in% c("discovar-contig", "discovar-mp", "discovar-mp-bn", "discovar-mp-dt", "discovar-mp-dt-jelly", "discovar-mp-dt-bn")) dvcompplot <- ggplot(dvcomp, aes(x=minlen, y=cumulative, colour=name)) + scale_x_continuous(trans=reverselog_trans(10), breaks=c(10000000,1000000,100000,10000,1000), labels=c("10Mbp", "1MBp", "100kbp", "10kbp", "1kbp")) + scale_y_log10(limits=c(400000,800000000), breaks=c(800000000,100000000,10000000,1000000), labels=NULL) + xlab("Minimal scaffold length") + ylab(NULL) + ggtitle("Comparison of potato Discovar assemblies") + labs(colour="Assembly") + geom_line() + theme(legend.justification=c(1,0),legend.position=c(1,0)) dvcompplotwaxis <- dvcompplot + scale_y_log10(limits=c(400000,800000000), breaks=c(800000000,100000000,10000000,1000000), labels=c("800Mbp", "100Mbp", "10Mbp", "1Mbp")) + ylab("Covered assembly length\n") ## pacbio comp pbcomp <- subset(input, fname %in% c("falcon", "falcon-dt", "falcon-bn", "falcon-dt-bn")) pbcompplot <- ggplot(pbcomp, aes(x=minlen, y=cumulative, colour=name)) + scale_x_continuous(trans=reverselog_trans(10), breaks=c(10000000,1000000,100000,10000,1000), labels=c("10Mbp", "1MBp", "100kbp", "10kbp", "1kbp")) + scale_y_log10(limits=c(400000,800000000), breaks=c(800000000,100000000,10000000,1000000), labels=NULL) + xlab("Minimal scaffold length") + ylab(NULL) + ggtitle("Comparison of potato Falcon assemblies") + labs(colour="Assembly") + geom_line() + theme(legend.justification=c(1,0),legend.position=c(1,0)) pbcompplotwaxis <- pbcompplot + scale_y_log10(limits=c(400000,800000000), breaks=c(800000000,100000000,10000000,1000000), labels=c("800Mbp", "100Mbp", "10Mbp", "1Mbp")) + ylab("Covered assembly length\n")
Make PDF versions:
source(file="cumulative-content.R") pdf(file="figures/bigcompplot.pdf", width=5, height=5) print(bigcompplot) dev.off() pdf(file="figures/dvcompplot.pdf", width=5, height=5) print(dvcompplot) dev.off() pdf(file="figures/pbcompplot.pdf", width=5, height=5) print(pbcompplot) dev.off() pdf(file="figures/all.pdf", width=15, height=5) grid.draw(cbind(ggplotGrob(bigcompplot), ggplotGrob(dvcompplot), ggplotGrob(pbcompplot), size="last")) dev.off()
2.2 Figures
3 KAT plots
3.1 Code
For the KAT plots we have to count k-mers in the Discovar library reads and the assemblies. The read files LIB12786_R1.fastq and LIB12786_R2.fastq need to be in the data directory to run KAT as below. This generates three matrix files with filenames ending in -main.mx. KAT takes a long time and a lot of memory to run, so we provide these matrix files.
for asm in discovar-contig falcon-pilon 10x; do
    kat comp -t 8 -o data/kat-comp-${asm} \
        'data/LIB12786_R?.fastq' \
        data/${asm}.fasta
done
KAT comes with its own plotting tools using Python and matplotlib, but we use the ggplot version to match the other figures in the paper.
#!/usr/bin/env Rscript
args <- commandArgs()
matrixFile <- args[1]
maxMul <- as.integer(args[2])
minCov <- as.integer(args[3])
covBands <- as.integer(args[4])
readsName <- args[5]
genName <- args[6]
transpose <- FALSE
isReads <- TRUE

library("reshape2")
library("ggplot2")
library("grid")

## returns list of n colours equally spaced in HSL, like ggplot default
gg_color_hue <- function(n) {
    hues = seq(15, 375, length=n+1)[1:n]
    hcl(h=hues, l=60, c=120)
}

matrix <- read.table(matrixFile)
if (transpose) {
    matrix <- t(matrix)
}
matrix <- matrix[1:(maxMul+1),]
lastcolumn <- rowSums(matrix[,-seq_len(minCov+covBands-2)])
matrix <- matrix[,(minCov+1):(minCov+covBands-1)]
matrix <- cbind(matrix, lastcolumn)

counts <- data.frame(cbind(as.matrix(seq(0,maxMul)), matrix))
names(counts) <- c("multiplicity",
                   sapply(seq(minCov,minCov+covBands-1),
                          function (n) sprintf("%dx", n)))
names(counts)[length(names(counts))] <-
    sprintf("%s+", names(counts)[length(names(counts))])
counts <- melt(counts, id.vars=c("multiplicity"),
               variable.name="coverage", value.name="count")

## a big peak at 1 is often expected and not interesting, but any other peak is
totals <- rowSums(matrix)
peaksx <- which(diff(sign(diff(totals))) == -2)
peaksy <- totals[peaksx]
if (!isReads) {
    peaky <- max(totals)
} else if (peaksx[1] == 1) {
    peaky <- max(peaksy[-1])
} else {
    peaky <- max(peaksy)
}

p <- ggplot(counts, aes(x=multiplicity, y=count, fill=coverage))
p <- p + geom_bar(stat="identity")
if (minCov == 0) {
    p <- p + scale_fill_manual(values=c("#444444", gg_color_hue(covBands-1)))
} else {
    p <- p + scale_fill_manual(values=c(gg_color_hue(covBands)))
}
## combine the axis limits in one coord_cartesian() call; a second call
## would replace the first rather than add to it
p <- p + coord_cartesian(xlim=c(0,maxMul), ylim=c(0,peaky*1.1))
p <- p + labs(title=sprintf("%s", genName),
              x="k-mer multiplicity",
              y="Number of distinct k-mers",
              fill="Coverage")
p <- p + theme(legend.justification=c(1,1), legend.position=c(1,1))
Make PDF versions:
commandArgs <- function() c("data/kat-comp-discovar-contig-main.mx",
                            "200", "0", "5", "PCR free", "Discovar")
source(file="plot-comp.R")
pdf(file="figures/kat-comp-discovar.pdf", width=5, height=4, onefile=TRUE)
print(p)
dev.off()
p1 <- p

commandArgs <- function() c("data/kat-comp-falcon-pilon-main.mx",
                            "200", "0", "5", "PCR free", "Falcon")
source(file="plot-comp.R")
pdf(file="figures/kat-comp-falcon.pdf", width=5, height=4, onefile=TRUE)
print(p)
dev.off()
p2 <- p

commandArgs <- function() c("data/kat-comp-10x-main.mx",
                            "200", "0", "5", "PCR free", "Supernova")
source(file="plot-comp.R")
pdf(file="figures/kat-comp-10x.pdf", width=5, height=4, onefile=TRUE)
print(p)
dev.off()
p3 <- p

p2 <- p2 + scale_y_continuous(labels=NULL) + ylab(NULL)
p3 <- p3 + scale_y_continuous(labels=NULL) + ylab(NULL)
pdf(file="figures/kat-comp-all.pdf", width=15, height=5)
grid.draw(cbind(ggplotGrob(p1), ggplotGrob(p2), ggplotGrob(p3), size="last"))
dev.off()
3.2 Figures
The PNG output from R does not look very good for some reason.
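One workaround, assuming ImageMagick (with Ghostscript for PDF rasterisation) is installed, is to rasterise the PDF written above instead of relying on R's png() device:

#!/bin/bash
# render the combined KAT figure to PNG; -density controls the resolution
convert -density 150 figures/kat-comp-all.pdf figures/kat-comp-all.png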
4 BAC with difficult region
For the BAC plot we use a PacBio assembly of a BAC (labelled number 22). We have several tracks of data: three contig alignments, three read alignments, and a GC content and homopolymer track. The PacBio assembly of BAC 22 is contained in bac22.fasta.
4.1 Contig alignment
For the contig alignments we use bwa-mem.
| name      | filename        |
|-----------|-----------------|
| Supernova | 10x             |
| Discovar  | discovar-contig |
| Falcon    | falcon          |
#!/bin/bash
bwa index data/bac22.fasta
IFS=','
while read name filename; do
    bwa mem data/bac22.fasta data/${filename}.fasta \
        | samtools view -b - > data/pb-${filename}.bam
    sambamba sort data/pb-${filename}.bam
done <<< "$table"
The sam-to-R.py script takes multiple SAM input files and outputs a table for plotting with ggplot. Optional parameters are -m for the minimum length of the alignment and -i for the minimum percent identity; the alignment length is calculated from the CIGAR string of each alignment in the SAM file, and the percent identity additionally uses the NM tag. The first positional parameter provides a short name for each of the following alignments, which is embedded in the table, and then the alignment files follow. We use process substitution so we can use BAM files, ensuring that the SAM header is present by using the -h flag for samtools view.
#!/usr/bin/env python
import sys
import argparse


def cigar_to_aln_len(cigar):
    current_number = ""
    total_length = 0
    for c in cigar:
        if c in "0123456789":
            current_number += c
        elif c in "MD=X":
            total_length += int(current_number)
            current_number = ""
        else:
            current_number = ""
    return total_length


def parse_opts(opt):
    opts = {}
    for text_opt in opt.split():
        [name, typ, value] = text_opt.split(":")
        if typ == "A":
            opts[name] = value
        elif typ == "i":
            opts[name] = int(value)
        elif typ == "f":
            opts[name] = float(value)
        elif typ == "Z":
            opts[name] = value
        elif typ == "H":
            opts[name] = bytearray(value.decode("hex"))
        elif typ == "B":
            arr = value.split(",")
            arrtyp = arr[0]
            arr = arr[1:]
            if arrtyp == "f":
                arr = [float(f) for f in arr]
            else:
                arr = [int(f) for f in arr]
            opts[name] = arr
    return opts


# ----- command line parsing -----
parser = argparse.ArgumentParser(
    prog="sam-to-svg",
    description="Turns SAM file into SVG showing alignments.")
parser.add_argument("names", type=str,
                    help="Name of alignments, comma separated.")
parser.add_argument("sorted_sam_file", type=str, nargs="+",
                    help="Sorted SAM files.")
parser.add_argument("-l", "--ref_length", type=int,
                    help="Length of reference "
                    "(taken from SAM file if header is present).")
parser.add_argument("-m", "--min_length", type=int,
                    help="Minimum length of alignment to draw.")
parser.add_argument("-i", "--min_identity", type=float,
                    help="Minimum percent identity of match.")
args = parser.parse_args()
# ----- end command line parsing -----

names = args.names.split(",")
length = args.ref_length
lanes = []
lanesy = []
lanelens = []

if len(names) < len(args.sorted_sam_file):
    sys.exit("Error: fewer names than SAM files given.")

n = 0
minlane = 0
nexty = 0
for sam_file in args.sorted_sam_file:
    samf = open(sam_file)
    reflengths = {}
    refname = ""
    for line in samf:
        if line[0] == "@":
            if line[1:3] == "SQ":
                split = line[3:].split()
                for bit in split:
                    if bit[0:2] == "SN":
                        refname = bit[3:]
                    if bit[0:2] == "LN":
                        reflengths[refname] = int(bit[3:])
            elif line[1:3] == "PG":
                split = line[3:].split()
        else:
            [qname, flag, rname, pos, mapq, cigar,
             rnext, pnext, tlen, seq, qual, opt] = line.split(None, 11)
            aln_len = cigar_to_aln_len(cigar)
            opts = parse_opts(opt)
            if rname == "*":
                continue
            if args.min_identity is not None and "NM" not in opts:
                sys.exit("No NM value in SAM file, "
                         "can't calculate percent identity.")
            if ((args.min_length is None or aln_len >= args.min_length)
                    and (args.min_identity is None
                         or ((float(aln_len - opts["NM"])/aln_len)*100
                             >= args.min_identity))):
                inserted = False
                for lane in lanes[minlane:]:
                    if len(lane) == 0:
                        lane.append((int(pos), int(pos)+aln_len, n))
                        inserted = True
                        break
                    elif lane[-1][1] < int(pos):
                        lane.append((int(pos), int(pos)+aln_len, n))
                        inserted = True
                        break
                if not inserted:
                    lanes.append([(int(pos), int(pos)+aln_len, n)])
                    lanelens.append(reflengths[rname])
                    lanesy.append(nexty)
                    nexty += 1
    minlane = len(lanes)
    samf.close()
    n += 1
    nexty += 0.5

sys.stdout.write("{:s}\t{:s}\t{:s}\t{:s}\n"
                 .format("track", "file", "beg", "end"))
n = 0
for lane, lanelen in zip(lanes, lanelens):
    sys.stdout.write("{:.1f}\t{:s}\t{:d}\t{:d}\n"
                     .format(lanesy[n], "space", 1, lanelen))
    for aln in lane:
        sys.stdout.write("{:.1f}\t{:s}\t{:d}\t{:d}\n"
                         .format(lanesy[n], names[aln[2]], aln[0], aln[1]))
    n += 1
To generate the table for our three contig tracks we run:
#!/bin/bash
python sam-to-R.py \
    -m 1000 -i 99.9 dv,fal,10x \
    <(samtools view -h data/pb-discovar-contig.sorted.bam) \
    <(samtools view -h data/pb-falcon.sorted.bam) \
    <(samtools view -h data/pb-10x.sorted.bam) \
    > output/pb-r-table
4.2 Read alignment
For the read alignment we use bowtie2. The ord variable tells the aligner if the reads are in forward-reverse or reverse-forward order.
| name      | filename | ord |
|-----------|----------|-----|
| Discovar  | discovar | fr  |
| Mate pair | mp       | rf  |
| Dovetail  | dovetail | fr  |
We align each set of reads and suppress the unaligned reads to keep the file size smaller. Then they are filtered for reads with a high mapping quality and an edit distance with respect to the reference of no more than 1.
#!/bin/bash
bowtie2-build data/bac22.fasta data/bac22
IFS=','
while read name filename ord; do
    bowtie2 -p 8 --no-unal --${ord} -x data/bac22 \
        -1 data/${filename}-R1.fastq -2 data/${filename}-R2.fastq \
        | samtools view -b - > data/${filename}-aln.bam
    sambamba view -f bam -F "mapping_quality >= 30 and [NM] <= 1" \
        data/${filename}-aln.bam \
        > data/${filename}-aln-acc.bam
done <<< "$table"
For the paired-end (Discovar) reads we simply generate the depth at each reference position.
#!/bin/bash
sambamba sort data/discovar-aln-acc.bam
samtools depth -a data/discovar-aln-acc.sorted.bam \
    > output/discovar-aln-acc.cov
For the mate pair and Dovetail reads we calculate the coverage of the aligned fragments. First we need to sort the alignments by name to bring the pairs together:
#!/bin/bash
for filename in mp dovetail; do
    samtools sort -n -O bam -T data/${filename}-aln-acc.sorted.name.bam \
        <(samtools view -bh -F 1804 -q 1 data/${filename}-aln-acc.bam) \
        > data/${filename}-aln.sorted.name.bam
done
Now we use bedtools to make a bed file with the physical fragments:
#!/bin/bash
for filename in mp dovetail; do
    bamToBed -i data/${filename}-aln.sorted.name.bam -bedpe \
        | cut -f 1,2,6 | sort -k1,1 > output/${filename}.phys.bed
done
And finally calculate the coverage at each reference position:
#!/bin/bash
for filename in mp dovetail; do
    bedtools genomecov -d -i output/${filename}.phys.bed \
        -g data/bac22.fasta.fai > output/${filename}.phys.cov
done
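Both the fragment coverage above and the GC-content windows in the next subsection read data/bac22.fasta.fai, which does not appear to be generated anywhere else in this document; assuming samtools is on the path, the index can be created with:

#!/bin/bash
# create the fasta index (data/bac22.fasta.fai) used as the genome file
samtools faidx data/bac22.fasta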
4.3 GC content and homopolymers
For the GC content we use bedtools. We make 100bp windows and calculate the GC content in the windows.
#!/bin/bash
bedtools makewindows -g data/bac22.fasta.fai -w 100 \
    > output/bac22.100bps.bed
bedtools nuc -fi data/bac22.fasta -bed output/bac22.100bps.bed \
    > output/bac22.gc.txt
awk -v w=100 -vOFS='\t' -vFS='\t' 'NR > 1 {print $1,($2+$3)/2,$5}' \
    output/bac22.gc.txt \
    > output/bac22.gc.100bps
The homopolymers are calculated using the following python script:
#!/usr/bin/env python
# returns positions of homopolymers for each contig
import sys
import os
import argparse
import threading
import time
from collections import namedtuple


def progress(fp, fs, fin):
    progress = ["-", "\\", "|", "/"]
    prog = 0
    while not fin.isSet():
        if sys.stderr.isatty():
            sys.stderr.write("\r{:3.0f}% {:s}\b"
                             .format(fp.tell()/float(fs)*100, progress[prog]))
            sys.stderr.flush()
            prog = (prog + 1) % 4
            time.sleep(0.1)
        else:
            sys.stderr.write("{:3.0f}% "
                             .format(fp.tell()/float(fs)*100))
            time.sleep(5)
    else:
        if sys.stderr.isatty():
            sys.stderr.write("\r100% *")
        sys.stderr.write("\n")
    return


Homopolymer = namedtuple("Homopolymer", "seqname base length beg end")

# ----- command line parsing -----
parser = argparse.ArgumentParser(
    description="Finds positions of homopolyomers.")
parser.add_argument("file", type=str, help="Fasta file.")
parser.add_argument("min_length", type=int,
                    help="Minimum length of homopolymer.")
parser.add_argument("-s", "--case_sensitive", dest="case",
                    action="store_true", help="Case sensitive bases.")
parser.add_argument("-b", "--bases", type=str,
                    help="Comma separated list of bases to consider.")
parser.set_defaults(case=False)
args = parser.parse_args()
# ----- end command line parsing -----

fasta_size = os.path.getsize(args.file)
fasta = open(args.file)

sys.stderr.write("Reading {:s}...\n".format(args.file))
fin = threading.Event()
pthread = threading.Thread(name="progress", target=progress,
                           args=(fasta, fasta_size, fin))

homopolymers = []
name = ""
pos = 0
run = 0
last_base = ""
try:
    pthread.start()
    for line in fasta:
        if line[0] == '>':
            if run >= args.min_length:
                homopolymers.append(
                    Homopolymer(name, last_base, run, pos-run+1, pos))
            name = line[1:-1]
            pos = 0
            run = 0
            last_base = ""
            continue
        for base in line[:-1]:
            pos += 1
            if not args.case:
                base = base.upper()
            if last_base == "":
                last_base = base
                run = 1
            elif base == last_base:
                run += 1
            else:
                if run >= args.min_length:
                    homopolymers.append(
                        Homopolymer(name, last_base, run, pos-run, pos-1))
                run = 1
                last_base = base
    if run >= args.min_length:
        homopolymers.append(
            Homopolymer(name, last_base, run, pos-run+1, pos))
    fin.set()
    pthread.join()
except KeyboardInterrupt:
    fin.set()
    pthread.join()
    fasta.close()  # was isolate_file.close(), which is undefined here
    sys.stderr.write("\n")
    sys.exit(1)

fasta.close()

if args.bases is not None:
    if args.case:
        bases = set(args.bases.split(","))
    else:
        bases = set(args.bases.upper().split(","))
else:
    bases = None

for homo in homopolymers:
    if bases is None or homo.base in bases:
        sys.stdout.write("{:s}\t{:s}\t{:d}\t{:d}\t{:d}\n"
                         .format(*homo))
#!/bin/bash
python ~/code/tools/homopolymers.py data/bac22.fasta 4 \
    > output/bac22.homopolymers.4
4.4 Figure
Finally, after generating the data we can plot it all using ggplot.
#!/usr/bin/env Rscript
gg_colour_hue <- function(n) {
    hues = seq(15, 375, length=n+1)[1:n]
    c(hcl(h=hues, l=65, c=100))
}

library("ggplot2")
library("grid")
library("Cairo")

args <- commandArgs(TRUE)
if (length(args) < 9) {
    print("usage: prog title output dvc mpc dtc gc hom con conorder")
    quit()
}
title <- args[1]
output <- args[2]
dvcf <- args[3]
mpcf <- args[4]
dtcf <- args[5]
gcf <- args[6]
homf <- args[7]
conf <- args[8]
conorder <- args[9]

dvc <- read.table(dvcf, col.names=c("chr", "pos", "cov"))
mpc <- read.table(mpcf, col.names=c("chr", "pos", "cov"))
dtc <- read.table(dtcf, col.names=c("chr", "pos", "cov"))
gc <- read.table(gcf, col.names=c("chr", "pos", "gc"))
hom <- read.table(homf, col.names=c("chr", "base", "length", "beg", "end"))
con <- read.table(conf, header=TRUE)

conorder <- unlist(strsplit(conorder, split=","))
con$file <- factor(con$file, levels=c(conorder, "space"))

plotmargin <- unit(c(0,0,0,4), "mm")

conp <- ggplot(con, aes(ymin=track+0.1, ymax=track+0.9,
                        xmin=beg, xmax=end, fill=file)) +
    guides(fill=FALSE) +
    xlab(NULL) + ylab(NULL) +
    scale_x_continuous(labels=NULL) +
    scale_y_continuous(breaks=c(0.5,2.0,4.5),
                       labels=c("Discovar", "Falcon", "Supernova")) +
    scale_fill_manual(values=c(gg_colour_hue(3),"light grey")) +
    theme(axis.ticks.x = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_text(size=8),
          plot.margin=plotmargin) +
    geom_rect() +
    ggtitle(expression(paste(italic("S. verrucosum"), " BAC 22")))

dvcmax <- 1000
dvcp <- ggplot(dvc, aes(x=pos, ymin=0, ymax=cov)) +
    xlab(NULL) + ylab("Paired end") +
    scale_x_continuous(labels=NULL) +
    scale_y_log10(breaks=c(1,10,100,1000)) +
    theme(axis.ticks.x = element_blank(),
          axis.text.y = element_text(size=8),
          axis.title.y = element_text(size=9,vjust=0.1),
          plot.margin=plotmargin) +
    geom_ribbon()

mpcp <- ggplot(mpc, aes(x=pos, ymin=0, ymax=cov)) +
    xlab(NULL) + ylab("Mate pair") +
    scale_x_continuous(labels=NULL) +
    theme(axis.ticks.x = element_blank(),
          axis.text.y = element_text(size=8),
          axis.title.y = element_text(size=9,vjust=0.1),
          plot.margin=plotmargin) +
    geom_ribbon()

dtcp <- ggplot(dtc, aes(x=pos, ymin=0, ymax=cov)) +
    xlab(NULL) + ylab("Dovetail") +
    scale_x_continuous(labels=NULL) +
    theme(axis.ticks.x = element_blank(),
          axis.text.y = element_text(size=8),
          axis.title.y = element_text(size=9,vjust=0.1),
          plot.margin=plotmargin) +
    geom_ribbon()

pad <- 50
gcp <- ggplot(gc) +
    geom_rect(data=subset(hom, length > 4 & base != "N"),
              aes(ymin=0, ymax=100, xmin=beg-pad, xmax=end+pad,
                  fill=base, alpha=length)) +
    geom_line(aes(x=pos, y=gc*100)) +
    guides(alpha=FALSE, fill=FALSE) +
    scale_fill_manual(name="Base",
                      values=c("A" = "red",
                               "C" = "blue",
                               "G" = "yellow",
                               "T" = "green",
                               "N" = "black")) +
    xlab("Position") + ylab("GC Cont") +
    theme(plot.margin=plotmargin,
          axis.text.y = element_text(size=8),
          axis.title.y = element_text(size=9,vjust=0.1))

dvcg <- ggplotGrob(dvcp)
mpcg <- ggplotGrob(mpcp)
dtcg <- ggplotGrob(dtcp)
gcg <- ggplotGrob(gcp)
cong <- ggplotGrob(conp)

pdf(file=sprintf("%s.pdf",output), height=4.5, width=12)
grid.draw(rbind(cong, dvcg, mpcg, dtcg, gcg, size="last"))
dev.off()

CairoPNG(file=sprintf("%s.png",output), height=4.5, width=12,
         units="in", dpi=80)
grid.draw(rbind(cong, dvcg, mpcg, dtcg, gcg, size="last"))
dev.off()
#!/bin/bash
./bac-graphs2.R \
    bac22 \
    figures/bac22-pacbio \
    output/discovar-aln-acc.cov \
    output/mp.phys.cov \
    output/dovetail.phys.cov \
    output/bac22.gc.100bps \
    output/bac22.homopolymers.4 \
    output/pb-r-table \
    10x,fal,dv
5 Gene content
| name                         | filename          |
|------------------------------|-------------------|
| Supernova + BioNano          | 10x-bn            |
| Discovar + MP + DT + BioNano | discovar-mp-dt-bn |
| Falcon + DT + BioNano        | falcon-dt-bn      |
| Tuberosum reference          | tuberosum         |
We align the transcripts of the S. tuberosum genome to our assemblies using BLAST version 2.2.31. The transcripts were retrieved from here: http://solanaceae.plantbiology.msu.edu/data/PGSC_DM_v3.4_transcript-update.fasta.zip
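A minimal sketch for fetching and unpacking the transcripts, assuming wget and unzip are available and that the zip archive contains PGSC_DM_v3.4_transcript-update.fasta as used by the BLAST command below:

#!/bin/bash
# fetch and unpack the transcript set used as the BLAST query
wget http://solanaceae.plantbiology.msu.edu/data/PGSC_DM_v3.4_transcript-update.fasta.zip
unzip PGSC_DM_v3.4_transcript-update.fasta.zip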
for asm in 10x-bn discovar-mp-dt-bn falcon-dt-bn; do
    blastn -outfmt '6 qseqid qstart qend qlen sseqid sstart send slen pident length evalue' \
        -evalue 1e-3 \
        -subject ${asm}.fasta \
        -query PGSC_DM_v3.4_transcript-update.fasta \
        > ${asm}-transcripts.blastn
done
We can adjust the minimum percentage identity of BLAST alignments that we consider and calculate the percent coverage of each transcript in the alignment. We use the transcript-coverage.py script to calculate the percent coverage for a transcript:
#!/usr/bin/env python
import sys

current_tid = None
current_tlen = 0
last_start = 0
last_end = 0
seg_lengths = []

for line in sys.stdin:
    (tid, tstart, tend, tlen,
     seqid, seqstart, seqend, seqlen,
     pident, length, evalue) = line.split()
    tstart = int(tstart)
    tend = int(tend)
    tlen = int(tlen)
    seqstart = int(seqstart)
    seqend = int(seqend)
    seqlen = int(seqlen)

    if tid != current_tid:
        if current_tid is not None:
            sys.stdout.write("{:s}\t{:d}\t{:d}\t{:f}\n"
                             .format(current_tid, current_tlen,
                                     sum(seg_lengths),
                                     float(sum(seg_lengths)) /
                                     current_tlen))
        current_tid = tid
        current_tlen = tlen
        last_start = tstart
        last_end = tend
        seg_lengths = [last_end - last_start + 1]
    else:
        if tstart < last_start:
            sys.exit("Input file not sorted by transcript start!")
        if tstart <= last_end:
            # extend segment
            last_start = min(last_start, tstart)
            last_end = max(last_end, tend)
            seg_lengths[-1] = last_end - last_start + 1
        else:
            # new segment
            last_start = tstart
            last_end = tend  # was tlen, which overstates the segment length
            seg_lengths.append(last_end - last_start + 1)

if current_tid is not None:
    sys.stdout.write("{:s}\t{:d}\t{:d}\t{:f}\n"
                     .format(current_tid, current_tlen,
                             sum(seg_lengths),
                             float(sum(seg_lengths)) /
                             current_tlen))
This is run after filtering the alignments with awk and sorting by alignment start position:
IFS=','
while read name filename; do
    for i in {85..100}; do
        ./make-coverages data/${filename}-transcripts.blastn ${i} output/
    done
done <<EOF
${table}
EOF
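The make-coverages wrapper called above is not included in this document. A minimal sketch of what it could look like, assuming the BLAST column order used earlier (pident in column 9, qseqid and qstart in columns 1 and 2) and the output naming expected by the counting step below:

#!/bin/bash
# usage: make-coverages <blastn-file> <min-identity> <output-dir>
blast=$1
minid=$2
outdir=$3
base=$(basename ${blast} .blastn)
# keep alignments at or above the identity threshold, sort by transcript id
# and alignment start, then collapse to per-transcript coverage
awk -v i=${minid} '$9 >= i' ${blast} \
    | sort -k1,1 -k2,2n \
    | python transcript-coverage.py \
    > ${outdir}/${base}-i${minid}.coverage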
Now we can count the number of transcripts which appear with various thresholds:
for filename in 10x-bn discovar-mp-dt-bn falcon-dt-bn; do
    for cov in $(seq 0.85 0.01 1.0); do
        for i in {85..100}; do
            num=$(awk -v c=${cov} '$4 >= c' output/${filename}-transcripts-i${i}.coverage | wc -l)
            echo -ne "${filename}\t${cov}\t${i}\t${num}\n"
        done
    done
done > output/transcript-asm-cov-id-count
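The bar chart below summarises CEGMA and BUSCO results. It reads its counts from ../genes-table, which is not produced by this document; judging by the plotting code, it is expected to be a whitespace-separated file with a header row and one row per assembly. A hypothetical skeleton (column names taken from the plot labels, counts to be filled in from the CEGMA and BUSCO reports):

#!/bin/bash
# hypothetical skeleton of ../genes-table; add one row of counts per assembly
cat > ../genes-table <<'EOF'
asm cegcom cegpar buscom buspar busmis
EOF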
#!/usr/bin/env Rscript library("ggplot2") library("reshape2") library("grid") t <- read.table("../genes-table", header=TRUE) tm <- melt(t, id.vars="asm", variable.name="type", value.name="genes") p <- ggplot(tm, aes(x=asm, y=genes, fill=type)) + scale_fill_discrete(name="Type", labels=c("cegcom"="Cegma complete", "cegpar"="Cegma partial", "buscom"="Busco complete", "buspar"="Busco partial", "busmis"="Busco missing")) + scale_x_discrete(name="Assembly") + scale_y_continuous(name="Number of genes") + geom_bar(stat="identity", position="dodge") + geom_bar(stat="identity", position="dodge", colour="black", show_guide=FALSE) + theme(legend.position="bottom", legend.key.size=unit(3, "mm"), legend.text=element_text(size=8),legend.title=element_text(size=8), plot.margin=unit(c(2,2,0,0), "mm")) + guides(fill=guide_legend(nrow=2,byrow=TRUE)) pdf(file="genes-bar.pdf", width=4, height=4) print(p) dev.off()
6 Synteny to S. tuberosum
6.1 Code
We generate alignments between the S. tuberosum reference and our assemblies using nucmer. We are interested in our three "large" hybrid assemblies.
| name                         | filename          |
|------------------------------|-------------------|
| Supernova + BioNano          | 10x-bn            |
| Discovar + MP + DT + BioNano | discovar-mp-dt-bn |
| Falcon + DT + BioNano        | falcon-dt-bn-2    |
The file falcon-dt-bn-2 is generated by replacing the pipe symbols in the fasta file with underscores because some or all of mummer's scripts do not support the pipe symbol.
#!/bin/bash
gzip -cd data/falcon-dt-bn.fasta.gz \
    | sed 's/|/_/g' > data/falcon-dt-bn-2.fasta
The following code requires mummer and the S. tuberosum pseudomolecule assembly version 4.03 in the data directory. It is available here: http://solanaceae.plantbiology.msu.edu/pgsc_download.shtml
#!/bin/bash
IFS=','
while read name filename; do
    nucmer -c 65 -l 65 \
        -p output/${filename} \
        data/PGSC_DM_v4.03_pseudomolecules.fasta \
        data/${filename}.fasta
done <<< "$table"
First we filter the delta to only include alignments of at least 90% identity and 10kb in length.
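The i and l variables that appear in the remaining code blocks are the identity and length thresholds; they are assumed to be supplied by org-babel header arguments. When running the blocks as a single ordinary script, they can simply be set up front to match the values quoted above:

#!/bin/bash
# thresholds used by the delta-filter, coords and mummerplot steps below
i=90      # minimum percent identity
l=10000   # minimum alignment length (10 kb)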
#!/bin/bash
IFS=','
while read name filename; do
    if [[ ! -s output/${filename}-i${i}-l${l}.delta ]]; then
        echo "Filtering delta..."
        delta-filter -q -r -i ${i} -l ${l} output/${filename}.delta \
            > output/${filename}-i${i}-l${l}.delta
    fi
done <<< "$table"
Now we generate the coords files which we'll need later to sort and orientate the scaffolds for each chromosome. By default mummer will plot every scaffold in the assembly even if we select only one chromosome of the reference, so we need to find which scaffolds actually have alignments with each reference sequence.
#!/bin/bash
IFS=','
while read name filename; do
    if [[ ! -s output/${filename}-i${i}-l${l}.coords ]]; then
        echo "Making coords..."
        show-coords -THrcl output/${filename}-i${i}-l${l}.delta \
            > output/${filename}-i${i}-l${l}.coords
    fi
done <<< "$table"
Mummer will also require the length of each assembly scaffold that we wish to plot.
#!/bin/bash
IFS=','
while read name filename; do
    if [[ ! -s output/${filename}.length ]]; then
        echo "Calculating lengths of ${filename}..."
        cat data/${filename}.fasta \
            | awk '
                /^>/ {
                    if (len) { print len, name }
                    split($0,s," ")
                    name=substr(s[1],2)
                    len=0
                    next
                }
                { len += length($0) }
                END { if (len) { print len, name } }
            ' \
            | sort -nr -k1,1 \
            > output/${filename}.length
    fi
done <<< "$table"
Finally we make a mummerplot for each assembly and each chromosome. We are going to manually select which assembly scaffolds to display and also manually order and orientate them. mummerplot attempts to do the ordering and orientation itself but doesn't do a very good job. We want to order the scaffolds such that most of the bases in the alignment appear on the diagonal. We do this by sorting by the average base position in the alignment of each scaffold. We want to orientate the scaffolds such that as much as possible is "forwards" (more red than blue in the plots). We do this by counting how much of the alignment length is forward and reverse and setting the orientation accordingly.
#!/bin/bash
IFS=','
while read name filename; do
    for chr in {01..12}; do
        echo "Doing chr ${chr}..."
        # find scaffold order and orientation
        awk '$12 == "ST4.03ch'${chr}'" {print $1,$2,$3,$4,$13;}' \
            output/${filename}-i${i}-l${l}.coords \
            | sort -n -k1,1 \
            | sort -s -k5 \
            | awk '
                BEGIN { lastn=""; lastm=0; lastb=0; lastl=0; }
                {
                    n=$5; s=$1; e=$2; p=(s+e)/2; b=e-s; l=$4-$3;
                    if (lastn=="") {
                        lastn=n; lastm=p; lastb=b; lastl=l;
                    } else if (lastn!=n) {
                        print lastn,int(lastm),lastl
                        lastn=n; lastm=p; lastb=b; lastl=l;
                    } else {
                        lastm=(lastm*lastb+p*b)/(lastb+b); lastb+=b; lastl+=l
                    }
                }
                END { print lastn,int(lastm),lastl }
            ' \
            | sort -k1,1 \
            > output/${filename}-chr${chr}-contigs-i${i}-l${l}.scaf-pos-orn
        join -1 1 -2 2 \
            output/${filename}-chr${chr}-contigs-i${i}-l${l}.scaf-pos-orn \
            <(sort -k2,2 output/${filename}.length) \
            | sort -n -k2,2 \
            | awk '
                {
                    if ($3>0) { sign="+" } else { sign="-" }
                    printf "%s\t%d\t%s\n", $1, $4, sign;
                }
            ' \
            > output/${filename}-chr${chr}-contigs-i${i}-l${l}.mumlist
        mummerplot -t png -r ST4.03ch${chr} \
            -p output/mummerplot-${filename}-i${i}-l${l}-${chr} \
            output/${filename}-i${i}-l${l}.delta \
            -Q output/${filename}-chr${chr}-contigs-i${i}-l${l}.mumlist
        mv output/mummerplot-${filename}-i${i}-l${l}-${chr}.png figures/
    done
done <<< "$table"
For the rest of the paper we use ggplot for the plots, so we now convert the mummerplots to ggplot and plot them. The data is prepared with the following script:
#!/bin/bash
set -e

if [[ $# -lt 2 ]]; then
    echo "usage: mummer2ggplot inputprefix outputdir"
    exit 1
fi

inputprefix=$1
outputdir=$2

# make coords
cat ${inputprefix}.[fr]plot \
    | awk '/^[1-9]/' \
    | awk '
        NR%2==1 {a=$1;b=$2}
        NR%2==0 {if($2>b) { print a,b,$1,$2,$3,"+" }else{ print a,b,$1,$2,$3,"-" }}' \
    | sort -n \
    > ${inputprefix}.ggplot

# make ticks
sed ':a;N;$!ba;s/\\\n//g' ${inputprefix}.gp \
    | awk '/^set ytics/' \
    | sed 's/set ytics ( //' \
    | sed 's/ )//' \
    | sed 's/, /\n/g' \
    | sed 's/ /,/' \
    | awk '{gsub(/"/, "", $1); print $1,$2;}' \
    > ${inputprefix}.asmticks

Rscript mummerplot.R ${outputdir}/$(basename ${inputprefix}) \
    ${inputprefix}.ggplot ${inputprefix}.asmticks
Where mummerplot.R is:
#!/usr/bin/env Rscript library("ggplot2") args <- commandArgs(TRUE) if (length(args) < 3) { print("usage: prog output coords ticks") quit() } output <- args[1] coords <- args[2] ticks <- args[3] t <- read.table(coords, col.names=c("x1", "y1", "x2", "y2", "pid", "dir")) t$dir <- factor(t$dir, levels=c("+","-")) ticks <- read.table(ticks, sep=",", col.names=c("labels", "breaks")) p <- ggplot(t, aes(x=x1,xend=x2,y=y1,yend=y2,alpha=pid,colour=dir)) + geom_segment() + geom_point() + geom_point(aes(x2,y2)) + scale_x_continuous(name="Reference") + scale_y_continuous(name="Assembly", breaks=ticks$breaks, labels=ticks$labels, minor_breaks=NULL) + scale_colour_discrete(guide=FALSE) + scale_alpha_continuous(guide=FALSE) + theme(axis.text.y=element_text(size=6)) pdf(file=sprintf("%s-gg.pdf",output), width=7, height=6.5) print(p) dev.off() png(file=sprintf("%s-gg.png",output), width=800, height=800) print(p) dev.off()
6.2 Figures
Convert all the figures to ggplot:
IFS=','
while read name filename; do
    for chr in {01..12}; do
        ./mummer2ggplot \
            output/mummerplot-${filename}-i${i}-l${l}-${chr} \
            figures
    done
done <<EOF
$table
EOF
Here is a quick preview of the figures.
IFS=','
while read name filename; do
    montage figures/mummerplot-${filename}-i${i}-l${l}-??-gg.png \
        -geometry 240x240+10+5 -tile 3x4 \
        figures/mummerplot-${filename}-i${i}-l${l}-all-gg.png
    echo "*** ${name}"
    # link to the montage written above (was -all.png, which is never created)
    echo "[[file:figures/mummerplot-${filename}-i${i}-l${l}-all-gg.png]]"
done <<< "$table"