Here is an approach to tabulating the counts of gaps present in a bam file.
It is implemented in R using bioconductor packages
It takes about a minute to tabulate 33 million reads on my hardware.
It produces a data.frame for further downstream analysis of your choosing, and, optionally, a corresponding "junctions.bed" file (i.e. as is produced by Tophat and can be visualized in IGV).
Here's an example of using it and inspecting its output:
Here's an example of using it and inspecting its output:
> btg <- bamTabulateGaps('t/t1.sam.sorted.bam')
> names(btg)
[1] "bedPath" "tabulatedGaps"
> btg$bedPath
[1] "t/t1.sam.sorted.junctions.bed"
> system(paste('head ', btg$bedPath))
track name=t1 graphType=junctions
2L 11343 11410 . 25
2L 11517 11779 . 51
2L 12220 12286 . 121
2L 12927 13520 . 109
2L 13624 13683 . 109
2L 14873 14933 . 114
2L 15710 17053 . 38
2L 17211 18026 . 4
2L 17211 18261 . 5
> head(btg$tabulatedGaps)
space start end score
1 2L 11344 11410 25
2 2L 11518 11779 51
3 2L 12221 12286 121
4 2L 12928 13520 109
5 2L 13625 13683 109
6 2L 14874 14933 114
Enjoy the code:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(IRanges) # for: coverage, psetdiff, etc | |
library(GenomicRanges) # for: readGappedAlignments, | |
library(Rsamtools) # for: reading bam files scanBam, countBam etc | |
library(rtracklayer) # for: track file IO (bed, wig, etc) | |
library(sqldf) # for: querying dataframes using SQL\ | |
bamTabulateGaps <- function(bamPath, | |
bedPath=paste(gsub('.bam$','',bamPath),'.junctions.bed',sep=''), | |
trackName=gsub('\\..*','',basename(bamPath)), | |
trackLine=sprintf("track name=%s graphType=junctions",trackName), | |
... ){ | |
### Tabulate the (unstranded) coverage of all gaps present in the | |
### gapped alignments in <bamPath>, such as may be produced by a | |
### splice-aware aligner (e.g. GSNAP). | |
### | |
### Optionally, write a bed formatted trackfile at <bedPath> (unless | |
### <bedPath> is provided as ''). | |
### | |
### Include in the bed file an initial <trackLine> declaring the | |
### <trackName> as well as 'graphType=junctions'. | |
### | |
### <bedPath> defaults to the bamPath with any trailing .bam removed, | |
### suffixed with 'junctions.bed'. | |
### | |
### <trackName> defaults to the basename of <bamPath> without any | |
### .extensions. | |
### | |
### 'graphType=junctions' is a convention established (AFAIK) by | |
### TopHat and respected by IGV (at least) causing it to display the | |
### regions using arc-shaped glyphs with a weight proportional to the | |
### score (up to 50). | |
### | |
### Additional options in <...> are passed to bed.export. | |
### | |
### Values: a list of the tabulatedGaps, as a data.frame and the | |
### bedPath | |
### | |
### AUTHOR: malcolm_cook@stowers.org | |
### | |
### EXAMPLE | |
### 1) using this code from gist (requires the R devtools library installed) | |
### RScript --default-packages=devtools,utils,methods,stats -e "suppressPackageStartupMessages(source_gist(1016957))" -e "invisible(do.call(bamTabulateGaps,as.list(commandArgs(TRUE))))" foo.bam | |
GA <- readGappedAlignments(bamPath) | |
GA.gapped <- | |
## which ones have gaps? | |
GA[ngap(GA) != 0] | |
GA.gapped.grg <- | |
## a GRanges, with one component per alignment (which looses | |
## spliced internals). | |
granges(GA.gapped); | |
GA.gapped.grglist <- | |
## a GRangesList of GRanges, one per alignment | |
grglist(GA.gapped); | |
J2J.grglist <- | |
## the gapped regions (presumably intronic), as GRangesList, | |
## indexed by read. | |
psetdiff(GA.gapped.grg,GA.gapped.grglist) | |
J2J.GRanges <- | |
## the gapped regions, as genome wide GRanges | |
unlist(J2J.grglist) | |
## Offset to include the last base of upstream exon and first base | |
## of downstream exon: | |
start(J2J.GRanges) <- start(J2J.GRanges) - 1 | |
end(J2J.GRanges) <- end(J2J.GRanges) + 1 | |
J2JDF <- | |
## coerce to data.frame so we can query using sqldf. | |
as.data.frame(J2J.GRanges) | |
tabulatedGaps <- | |
## Count the number of reads grouped by chromosome,start,end,strand | |
sqldf(paste("SELECT seqnames AS space,start,end, count(*) AS score", | |
"FROM J2JDF", | |
"GROUP BY seqnames,start,end")) | |
if ('' != bedPath) { | |
write(trackLine,file=bedPath) | |
export.bed(tabulatedGaps,bedPath,append=TRUE) | |
} | |
list(bedPath=bedPath, | |
tabulatedGaps=tabulatedGaps | |
) | |
} |