Customizing Gviz's plotTracks function

Question

Customizing Gviz's plotTracks function

1.3k views Asked by dan At 02 January 2017 at 05:31

I'm trying to generate a track plot using Gviz's plotTracks function.

For example, I'm trying to plot these three transcripts from gencode.v25.primary_assembly.annotation.gtf:

"ENST00000352151.9","ENST00000550488.5","ENST00000335154.9"

(all from gene: ENSG00000161791.13)

So I'm trying to call GeneRegionTrack with a data.frame that includes only the GTF records of these three transcripts:

require(rtracklayer)
#gtf.fn is the gencode.v25.primary_assembly.annotation.gtf file
gtf.df <- as.data.frame(import(gtf.fn))
my.transcripts.gtf.df <- dplyr::filter(gtf.df, transcript_id %in% c("ENST00000352151.9","ENST00000550488.5","ENST00000335154.9"))


> dim(my.transcripts.gtf.df)
[1] 171  27

And (naively) trying GeneRegionTrack with my.transcripts.gtf.df:

require(Gviz)
gene.region.track <-GeneRegionTrack(range=my.transcripts.gtf.df,genome="hg38",chromosome=my.transcripts.gtf.df$seqnames[1],name="Gene Model")
ideogram.track <- IdeogramTrack(genome=genome,chromosome=as.character(t.df$chromosome[1]))
plotTracks(list(ideogram.track,axis.track,gene.region.track))

I get a bit of a mess:

That's because the GTF has more records than the CDS/exon/UTR records which will make plotTracks more organized.

I tried:

my.transcripts.gtf.df <- dplyr::filter(gtf.df, transcript_id %in% c("ENST00000352151.9","ENST00000550488.5","ENST00000335154.9"), type %in% c("CDS","exon","UTR"))
gene.region.track <-GeneRegionTrack(range=my.transcripts.gtf.df,genome="hg38",chromosome=my.transcripts.gtf.df$seqnames[1],name="Gene Model")
plotTracks(list(ideogram.track,axis.track,gene.region.track))

And I get:

Which is still quite a mess. But what I do notice is that the 3'UTRs seem to have the same width as the all exons.

So I tried to process my GTF to make it compatible with that Gviz has in its vignette:

determineGtfFeature <- function(t.gtf.df){
  if(length(which(unique(t.gtf.df$type) == "CDS")) > 0){
    res.gtf.df <- dplyr::filter(t.gtf.df,type =="CDS")
    res.gtf.df$type <- NA
    res.gtf.df$type <- "protein_coding"
    if(length(which(unique(t.gtf.df$type) == "UTR")) > 0){
      utr.gtf.df <- dplyr::filter(t.gtf.df,type == "UTR")
      utr.gtf.df <- utr.gtf.df[order(utr.gtf.df$start),]
      if(utr.gtf.df$strand[1] == "+"){
        utr5.idx <- which(utr.gtf.df$end < min(res.gtf.df$start))
        utr3.idx <- which(utr.gtf.df$start > max(res.gtf.df$end))
      } else{
        utr3.idx <- which(utr.gtf.df$end < min(res.gtf.df$start))
        utr5.idx <- which(utr.gtf.df$start > max(res.gtf.df$end))
      }
      utr.gtf.df$type <- NA
      utr.gtf.df$type[utr5.idx] <- "utr5"
      utr.gtf.df$type[utr3.idx] <- "utr3"
      res.gtf.df <- rbind(res.gtf.df,utr.gtf.df)
    }
  } else{
     if(grepl("pseudogene",t.gtf.df$transcript_type)==T){
       res.gtf.df <- dplyr::filter(t.gtf.df,type =="exon")
       res.gtf.df$type <- "pseudogene"
     } else{
       res.gtf.df <- dplyr::filter(t.gtf.df,type =="exon")
       res.gtf.df$type <- "lincRNA"
     }
  }
  res.gtf.df$type <- factor(res.gtf.df$type,levels=unique(res.gtf.df$type))
  res.gtf.df <- res.gtf.df[,-which(colnames(res.gtf.df) == "transcript_type")]
  return(res.gtf.df)
}

And then:

my.transcripts.gtf.df <- do.call(rbind,lapply(c("ENST00000352151.9","ENST00000550488.5","ENST00000335154.9"),function(t)
  determineGtfFeature(dplyr::select(dplyr::filter(gtf.df, transcript_id == t),seqnames,start,end,width,strand,type,gene_id,exon_id,transcript_id,gene_name,transcript_type))))
colnames(my.transcripts.gtf.df)[c(1,6,7,8,9,10)] <- c("chromosome","feature","gene","exon","transcript","symbol")
tid.starts <- sort(sapply(unique(my.transcripts.gtf.df$transcript),function(t) min(dplyr::filter(my.transcripts.gtf.df,transcript == t)$start)))
my.transcripts.gtf.df$transcript.index <- sapply(my.transcripts.gtf.df$transcript,function(t) which(names(my.transcripts.gtf.df) == t))
my.transcripts.gtf.df <- my.transcripts.gtf.df[order(my.transcripts.gtf.df$transcript.index,my.transcripts.gtf.df$start),]
my.transcripts.gtf.df <- my.transcripts.gtf.df[,-which(colnames(my.transcripts.gtf.df) == "transcript.index")]
my.transcripts.gtf.df$gene <- factor(my.transcripts.gtf.df$gene,levels=unique(my.transcripts.gtf.df$gene))
my.transcripts.gtf.df$exon <- factor(my.transcripts.gtf.df$exon,levels=unique(my.transcripts.gtf.df$exon))
my.transcripts.gtf.df$transcript <- factor(my.transcripts.gtf.df$transcript,levels=unique(my.transcripts.gtf.df$transcript))
my.transcripts.gtf.df$symbol <- factor(my.transcripts.gtf.df$symbol,levels=unique(my.transcripts.gtf.df$symbol))

So:

gene.region.track <- GeneRegionTrack(range=my.transcripts.gtf.df,genome=genome,chromosome=as.character(t.df$chromosome[1]),name="Gene Model")
plotTracks(list(ideogram.track,axis.track,gene.region.track))

Now gives:

Only utr3 of the middle transcript has the proper smaller width but the other two don't.

As far as I checked my.transcripts.gtf.df is compatible with:

 data(geneModels)

used in Gviz's vignette.

So my questions are:

Is there a simple way to get Gviz to plot a requested transcript from a standard GTF file without having to go through the trouble of editing it. Probably not, and hence the GTF needs to be in the same format as the geneModels data.frame. So any idea why are the utr3's misbehaving?
Which parameter in displayPars(gene.region.track) controls the spacing between the tracks? If I set displayPars(gene.region.track)$stackHeight <- 0.1 the tracks become narrower which leaves a big space between them, and I would like to shrink that space.

Original Q&A

TechQA.

Customizing Gviz's plotTracks function

There are 0 answers

Related Questions in R

Related Questions in GGPLOT2

Related Questions in BIOCONDUCTOR

Related Questions in COWPLOT

Popular Questions

Popular Tags

Trending Questions