Customizing Gviz's plotTracks function

1.3k views Asked by At

I'm trying to generate a track plot using Gviz's plotTracks function.

For example, I'm trying to plot these three transcripts from gencode.v25.primary_assembly.annotation.gtf:

"ENST00000352151.9","ENST00000550488.5","ENST00000335154.9"

(all from gene: ENSG00000161791.13)

So I'm trying to call GeneRegionTrack with a data.frame that includes only the GTF records of these three transcripts:

require(rtracklayer)
#gtf.fn is the gencode.v25.primary_assembly.annotation.gtf file
gtf.df <- as.data.frame(import(gtf.fn))
my.transcripts.gtf.df <- dplyr::filter(gtf.df, transcript_id %in% c("ENST00000352151.9","ENST00000550488.5","ENST00000335154.9"))


> dim(my.transcripts.gtf.df)
[1] 171  27

And (naively) trying GeneRegionTrack with my.transcripts.gtf.df:

require(Gviz)
gene.region.track <-GeneRegionTrack(range=my.transcripts.gtf.df,genome="hg38",chromosome=my.transcripts.gtf.df$seqnames[1],name="Gene Model")
ideogram.track <- IdeogramTrack(genome=genome,chromosome=as.character(t.df$chromosome[1]))
plotTracks(list(ideogram.track,axis.track,gene.region.track))

I get a bit of a mess: enter image description here

That's because the GTF has more records than the CDS/exon/UTR records which will make plotTracks more organized.

I tried:

my.transcripts.gtf.df <- dplyr::filter(gtf.df, transcript_id %in% c("ENST00000352151.9","ENST00000550488.5","ENST00000335154.9"), type %in% c("CDS","exon","UTR"))
gene.region.track <-GeneRegionTrack(range=my.transcripts.gtf.df,genome="hg38",chromosome=my.transcripts.gtf.df$seqnames[1],name="Gene Model")
plotTracks(list(ideogram.track,axis.track,gene.region.track))

And I get: enter image description here

Which is still quite a mess. But what I do notice is that the 3'UTRs seem to have the same width as the all exons.

So I tried to process my GTF to make it compatible with that Gviz has in its vignette:

determineGtfFeature <- function(t.gtf.df){
  if(length(which(unique(t.gtf.df$type) == "CDS")) > 0){
    res.gtf.df <- dplyr::filter(t.gtf.df,type =="CDS")
    res.gtf.df$type <- NA
    res.gtf.df$type <- "protein_coding"
    if(length(which(unique(t.gtf.df$type) == "UTR")) > 0){
      utr.gtf.df <- dplyr::filter(t.gtf.df,type == "UTR")
      utr.gtf.df <- utr.gtf.df[order(utr.gtf.df$start),]
      if(utr.gtf.df$strand[1] == "+"){
        utr5.idx <- which(utr.gtf.df$end < min(res.gtf.df$start))
        utr3.idx <- which(utr.gtf.df$start > max(res.gtf.df$end))
      } else{
        utr3.idx <- which(utr.gtf.df$end < min(res.gtf.df$start))
        utr5.idx <- which(utr.gtf.df$start > max(res.gtf.df$end))
      }
      utr.gtf.df$type <- NA
      utr.gtf.df$type[utr5.idx] <- "utr5"
      utr.gtf.df$type[utr3.idx] <- "utr3"
      res.gtf.df <- rbind(res.gtf.df,utr.gtf.df)
    }
  } else{
     if(grepl("pseudogene",t.gtf.df$transcript_type)==T){
       res.gtf.df <- dplyr::filter(t.gtf.df,type =="exon")
       res.gtf.df$type <- "pseudogene"
     } else{
       res.gtf.df <- dplyr::filter(t.gtf.df,type =="exon")
       res.gtf.df$type <- "lincRNA"
     }
  }
  res.gtf.df$type <- factor(res.gtf.df$type,levels=unique(res.gtf.df$type))
  res.gtf.df <- res.gtf.df[,-which(colnames(res.gtf.df) == "transcript_type")]
  return(res.gtf.df)
}

And then:

my.transcripts.gtf.df <- do.call(rbind,lapply(c("ENST00000352151.9","ENST00000550488.5","ENST00000335154.9"),function(t)
  determineGtfFeature(dplyr::select(dplyr::filter(gtf.df, transcript_id == t),seqnames,start,end,width,strand,type,gene_id,exon_id,transcript_id,gene_name,transcript_type))))
colnames(my.transcripts.gtf.df)[c(1,6,7,8,9,10)] <- c("chromosome","feature","gene","exon","transcript","symbol")
tid.starts <- sort(sapply(unique(my.transcripts.gtf.df$transcript),function(t) min(dplyr::filter(my.transcripts.gtf.df,transcript == t)$start)))
my.transcripts.gtf.df$transcript.index <- sapply(my.transcripts.gtf.df$transcript,function(t) which(names(my.transcripts.gtf.df) == t))
my.transcripts.gtf.df <- my.transcripts.gtf.df[order(my.transcripts.gtf.df$transcript.index,my.transcripts.gtf.df$start),]
my.transcripts.gtf.df <- my.transcripts.gtf.df[,-which(colnames(my.transcripts.gtf.df) == "transcript.index")]
my.transcripts.gtf.df$gene <- factor(my.transcripts.gtf.df$gene,levels=unique(my.transcripts.gtf.df$gene))
my.transcripts.gtf.df$exon <- factor(my.transcripts.gtf.df$exon,levels=unique(my.transcripts.gtf.df$exon))
my.transcripts.gtf.df$transcript <- factor(my.transcripts.gtf.df$transcript,levels=unique(my.transcripts.gtf.df$transcript))
my.transcripts.gtf.df$symbol <- factor(my.transcripts.gtf.df$symbol,levels=unique(my.transcripts.gtf.df$symbol))

So:

gene.region.track <- GeneRegionTrack(range=my.transcripts.gtf.df,genome=genome,chromosome=as.character(t.df$chromosome[1]),name="Gene Model")
plotTracks(list(ideogram.track,axis.track,gene.region.track))

Now gives: enter image description here

Only utr3 of the middle transcript has the proper smaller width but the other two don't.

As far as I checked my.transcripts.gtf.df is compatible with:

 data(geneModels)

used in Gviz's vignette.

So my questions are:

  1. Is there a simple way to get Gviz to plot a requested transcript from a standard GTF file without having to go through the trouble of editing it. Probably not, and hence the GTF needs to be in the same format as the geneModels data.frame. So any idea why are the utr3's misbehaving?

  2. Which parameter in displayPars(gene.region.track) controls the spacing between the tracks? If I set displayPars(gene.region.track)$stackHeight <- 0.1 the tracks become narrower which leaves a big space between them, and I would like to shrink that space.

0

There are 0 answers