Find the pathway to the external data from the airway package. List the files in this directory and in the “quants” directory.

dir <- system.file("extdata", package="airway", mustWork=TRUE)
list.files(dir)
##  [1] "GSE52778_series_matrix.txt"        "Homo_sapiens.GRCh37.75_subset.gtf"
##  [3] "quants"                            "sample_table.csv"                 
##  [5] "SraRunInfo_SRP033351.csv"          "SRR1039508_subset.bam"            
##  [7] "SRR1039509_subset.bam"             "SRR1039512_subset.bam"            
##  [9] "SRR1039513_subset.bam"             "SRR1039516_subset.bam"            
## [11] "SRR1039517_subset.bam"             "SRR1039520_subset.bam"            
## [13] "SRR1039521_subset.bam"
list.files(file.path(dir, "quants"))
## [1] "SRR1039508" "SRR1039509"

Create a csv with detailed information from each of the samples.

csvfile <- file.path(dir, "sample_table.csv")
coldata <- read.csv(csvfile, row.names=1, stringsAsFactors=FALSE)
coldata
##            SampleName    cell   dex albut        Run avgLength Experiment
## SRR1039508 GSM1275862  N61311 untrt untrt SRR1039508       126  SRX384345
## SRR1039509 GSM1275863  N61311   trt untrt SRR1039509       126  SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512       126  SRX384349
## SRR1039513 GSM1275867 N052611   trt untrt SRR1039513        87  SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516       120  SRX384353
## SRR1039517 GSM1275871 N080611   trt untrt SRR1039517       126  SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520       101  SRX384357
## SRR1039521 GSM1275875 N061011   trt untrt SRR1039521        98  SRX384358
##               Sample    BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677

Create two new columns, names and files.

coldata <- coldata[1:2,]
coldata$names <- coldata$Run
coldata$files <- file.path(dir, "quants", coldata$names, "quant.sf.gz")
file.exists(coldata$files)
## [1] TRUE TRUE

Load tximeta and run the main function to assemble counts.

library("tximeta")
se <- tximeta(coldata)
## importing quantifications
## reading in files with read.delim (install 'readr' package for speed up)
## 1 2 
## found matching transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## building TxDb with 'GenomicFeatures' package
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
##   stop_codon. This information was ignored.
## OK
## generating transcript ranges
## fetching genome info for GENCODE

Check dimensions of dataframe and see the rownames.

dim(se)
## [1] 205870      2
head(rownames(se))
## [1] "ENST00000456328.2" "ENST00000450305.2" "ENST00000488147.1"
## [4] "ENST00000619216.1" "ENST00000473358.1" "ENST00000469289.1"

Summarize at gene level.

gse <- summarizeToGene(se)
## loading existing TxDb created: 2020-03-04 21:56:04
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## obtaining transcript-to-gene mapping from TxDb
## generating gene ranges
## summarizing abundance
## summarizing counts
## summarizing length

Check that rownames are gene IDs and the dimensions are reduced.

dim(gse)
## [1] 58294     2
head(rownames(gse))
## [1] "ENSG00000000003.14" "ENSG00000000005.5"  "ENSG00000000419.12"
## [4] "ENSG00000000457.13" "ENSG00000000460.16" "ENSG00000000938.12"

Summary of dataframe

par(mar=c(0,0,0,0))
plot(1,1,xlim=c(0,100),ylim=c(0,100),bty="n",
     type="n",xlab="",ylab="",xaxt="n",yaxt="n")
polygon(c(45,90,90,45),c(5,5,70,70),col="pink",border=NA)
polygon(c(45,90,90,45),c(68,68,70,70),col="pink3",border=NA)
text(67.5,40,"assay(s)")
text(67.5,35,'e.g. "counts", ...')
polygon(c(10,40,40,10),c(5,5,70,70),col="skyblue",border=NA)
polygon(c(10,40,40,10),c(68,68,70,70),col="skyblue3",border=NA)
text(25,40,"rowRanges")
polygon(c(45,90,90,45),c(75,75,95,95),col="palegreen",border=NA)
polygon(c(45,47,47,45),c(75,75,95,95),col="palegreen3",border=NA)
text(67.5,85,"colData")

Load whole count matrix

data(gse)
gse
## class: RangedSummarizedExperiment 
## dim: 58294 8 
## metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
## assays(3): counts abundance length
## rownames(58294): ENSG00000000003.14 ENSG00000000005.5 ...
##   ENSG00000285993.1 ENSG00000285994.1
## rowData names(1): gene_id
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(3): names donor condition

Look at counts in first matrix

assayNames(gse)
## [1] "counts"    "abundance" "length"
head(assay(gse), 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14    708.164    467.962    900.992    424.368   1188.295
## ENSG00000000005.5       0.000      0.000      0.000      0.000      0.000
## ENSG00000000419.12    455.000    510.000    604.000    352.000    583.000
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14   1090.668    805.929    599.337
## ENSG00000000005.5       0.000      0.000      0.000
## ENSG00000000419.12    773.999    409.999    499.000
colSums(assay(gse))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 
##   21100805   19298584   26145537   15688246   25268618   31891456   19683767 
## SRR1039521 
##   21813903

Ranges for first five and last five genes

rowRanges(gse)
## GRanges object with 58294 ranges and 1 metadata column:
##                      seqnames              ranges strand |            gene_id
##                         <Rle>           <IRanges>  <Rle> |        <character>
##   ENSG00000000003.14     chrX 100627109-100639991      - | ENSG00000000003.14
##    ENSG00000000005.5     chrX 100584802-100599885      + |  ENSG00000000005.5
##   ENSG00000000419.12    chr20   50934867-50958555      - | ENSG00000000419.12
##   ENSG00000000457.13     chr1 169849631-169894267      - | ENSG00000000457.13
##   ENSG00000000460.16     chr1 169662007-169854080      + | ENSG00000000460.16
##                  ...      ...                 ...    ... .                ...
##    ENSG00000285990.1    chr14   19244904-19269380      - |  ENSG00000285990.1
##    ENSG00000285991.1     chr6 149817937-149896011      - |  ENSG00000285991.1
##    ENSG00000285992.1     chr8   47129262-47132628      + |  ENSG00000285992.1
##    ENSG00000285993.1    chr18   46409197-46410645      - |  ENSG00000285993.1
##    ENSG00000285994.1    chr10   12563151-12567351      + |  ENSG00000285994.1
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome

Chromosome metadata

seqinfo(rowRanges(gse))
## Seqinfo object with 25 sequences (1 circular) from hg38 genome:
##   seqnames seqlengths isCircular genome
##   chr1      248956422      FALSE   hg38
##   chr2      242193529      FALSE   hg38
##   chr3      198295559      FALSE   hg38
##   chr4      190214555      FALSE   hg38
##   chr5      181538259      FALSE   hg38
##   ...             ...        ...    ...
##   chr21      46709983      FALSE   hg38
##   chr22      50818468      FALSE   hg38
##   chrX      156040895      FALSE   hg38
##   chrY       57227415      FALSE   hg38
##   chrM          16569       TRUE   hg38

Look at data frame and conditions

colData(gse)
## DataFrame with 8 rows and 3 columns
##                 names    donor     condition
##              <factor> <factor>      <factor>
## SRR1039508 SRR1039508   N61311     Untreated
## SRR1039509 SRR1039509   N61311 Dexamethasone
## SRR1039512 SRR1039512  N052611     Untreated
## SRR1039513 SRR1039513  N052611 Dexamethasone
## SRR1039516 SRR1039516  N080611     Untreated
## SRR1039517 SRR1039517  N080611 Dexamethasone
## SRR1039520 SRR1039520  N061011     Untreated
## SRR1039521 SRR1039521  N061011 Dexamethasone

Look at specific columns of gse

gse$donor
## [1] N61311  N61311  N052611 N052611 N080611 N080611 N061011 N061011
## Levels: N052611 N061011 N080611 N61311
gse$condition
## [1] Untreated     Dexamethasone Untreated     Dexamethasone Untreated    
## [6] Dexamethasone Untreated     Dexamethasone
## Levels: Untreated Dexamethasone

Rename variables: donor = cell and condition = dex

gse$cell <- gse$donor
gse$dex <- gse$condition

Rename levels: untreated=untrt and dexamethasone=trt

levels(gse$dex)
## [1] "Untreated"     "Dexamethasone"
# when renaming levels, the order must be preserved!
levels(gse$dex) <- c("untrt", "trt")

Adjust which variable is the reference/control variable

library("magrittr")
gse$dex %<>% relevel("untrt")
gse$dex
## [1] untrt trt   untrt trt   untrt trt   untrt trt  
## Levels: untrt trt

Round fragments

round( colSums(assay(gse)) / 1e6, 1 )
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 
##       21.1       19.3       26.1       15.7       25.3       31.9       19.7 
## SRR1039521 
##       21.8

Create analysis design (accounting for cell line, what is the effect of dex)

library("DESeq2")
dds <- DESeqDataSet(gse, design = ~ cell + dex)
## using counts and average transcript lengths from tximeta

See fragment counts

countdata <- round(assays(gse)[["counts"]])
head(countdata, 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14        708        468        901        424       1188
## ENSG00000000005.5           0          0          0          0          0
## ENSG00000000419.12        455        510        604        352        583
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14       1091        806        599
## ENSG00000000005.5           0          0          0
## ENSG00000000419.12        774        410        499

Save column data

coldata <- colData(gse)

countdata: a table with the fragment counts coldata: a table with information about the samples

Construct DESeqDataSet

ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
                                 colData = coldata,
                                 design = ~ cell + dex)
## converting counts to integer mode

Remove rows with minimal data to speed up functions

nrow(dds)
## [1] 58294
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep,]
nrow(dds)
## [1] 31604

Keep at least 3 samples with a count of 10 or higher

keep <- rowSums(counts(dds) >= 10) >= 3

Plot SD of each gene against the mean

lambda <- 10^seq(from = -1, to = 2, length = 1000)
cts <- matrix(rpois(1000*100, lambda), ncol = 100)
library("vsn")
meanSdPlot(cts, ranks = FALSE)

Log transformed counts

log.cts.one <- log2(cts + 1)
meanSdPlot(log.cts.one, ranks = FALSE)

Amplifies differences when values are close to 0 (because of +1 pseudocount).

VST: faster, less sensitive to outliers (n>30) rlog: works well on small datasets, better for wide range of depth (n<30)

Perform VST. Counts no longer counts

vsd <- vst(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(vsd), 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14  10.105781   9.852029  10.169726   9.991545  10.424865
## ENSG00000000419.12   9.692244   9.923647   9.801921   9.798653   9.763455
## ENSG00000000457.13   9.449592   9.312186   9.362754   9.459168   9.281415
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14  10.194490  10.315814  10.002177
## ENSG00000000419.12   9.874703   9.683211   9.845507
## ENSG00000000457.13   9.395937   9.477971   9.477027
colData(vsd)
## DataFrame with 8 rows and 5 columns
##                 names    donor     condition     cell      dex
##              <factor> <factor>      <factor> <factor> <factor>
## SRR1039508 SRR1039508   N61311     Untreated   N61311    untrt
## SRR1039509 SRR1039509   N61311 Dexamethasone   N61311      trt
## SRR1039512 SRR1039512  N052611     Untreated  N052611    untrt
## SRR1039513 SRR1039513  N052611 Dexamethasone  N052611      trt
## SRR1039516 SRR1039516  N080611     Untreated  N080611    untrt
## SRR1039517 SRR1039517  N080611 Dexamethasone  N080611      trt
## SRR1039520 SRR1039520  N061011     Untreated  N061011    untrt
## SRR1039521 SRR1039521  N061011 Dexamethasone  N061011      trt

Perform rlog

rld <- rlog(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(rld), 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14   9.482613   9.172197   9.558383   9.346001   9.851349
## ENSG00000000419.12   8.860186   9.150196   9.000042   8.995902   8.951327
## ENSG00000000457.13   8.354790   8.166700   8.236582   8.366693   8.121781
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14   9.587602   9.727248   9.357876
## ENSG00000000419.12   9.091075   8.848782   9.054384
## ENSG00000000457.13   8.282307   8.392384   8.391023

blind=false means differences bw treatment and cell lines does not contribute to expected variance-mean trend blind=true for unsupervised experiment

Plot first sample against second. Estimate size factors to account for sequencing depth, set normalized to true.

dds <- estimateSizeFactors(dds)
## using 'avgTxLength' from assays(dds), correcting for library size
df <- bind_rows(
  as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
         mutate(transformation = "log2(x + 1)"),
  as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
  as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
## Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
colnames(df)[1:2] <- c("x", "y")  

ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
  coord_fixed() + facet_grid( . ~ transformation)  

Compress differences within low count genes, since they provide little information about differential expression.

Calculate differences between samples using VST data. Different samples should be rows, different dimensions (genes) should be columns

sampleDists <- dist(t(assay(vsd)))
sampleDists
##            SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
## SRR1039509   39.42362                                                       
## SRR1039512   32.37620   44.93748                                            
## SRR1039513   51.09677   37.18799   41.79886                                 
## SRR1039516   35.59642   47.54671   34.83458   52.05265                      
## SRR1039517   51.26314   41.58572   46.89609   40.67315   39.74268           
## SRR1039520   32.38578   46.96000   30.35980   48.08669   37.07106   50.38349
## SRR1039521   51.49108   37.57383   47.17283   31.45899   52.62276   41.35941
##            SRR1039520
## SRR1039509           
## SRR1039512           
## SRR1039513           
## SRR1039516           
## SRR1039517           
## SRR1039520           
## SRR1039521   43.01502
library("pheatmap")
library("RColorBrewer")

Visualize differences in a heatmap. Supply distances so it does not calculate on its own.

sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)

Change row names to have treatment type and patient number

Calculate distances using Poisson distance

library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))

Plot heatmap

samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
         clustering_distance_rows = poisd$dd,
         clustering_distance_cols = poisd$dd,
         col = colors)

Principal components analysis (PCA): data points on 2D plane. x axis separates data points the most while y axis separates second most. There are still other dimensions of difference

plotPCA(vsd, intgroup = c("dex", "cell"))

PCA plot using ggplot. Return PCA data without building plot.

pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
##                   PC1        PC2         group   dex    cell       name
## SRR1039508 -14.311369 -2.6000421  untrt:N61311 untrt  N61311 SRR1039508
## SRR1039509   8.058574 -0.7500532    trt:N61311   trt  N61311 SRR1039509
## SRR1039512  -9.404122 -4.3920761 untrt:N052611 untrt N052611 SRR1039512
## SRR1039513  14.497842 -4.1323833   trt:N052611   trt N052611 SRR1039513
## SRR1039516 -12.365055 11.2109581 untrt:N080611 untrt N080611 SRR1039516
## SRR1039517   9.343946 14.9115160   trt:N080611   trt N080611 SRR1039517
## SRR1039520 -10.852633 -7.7618618 untrt:N061011 untrt N061011 SRR1039520
## SRR1039521  15.032816 -6.4860576   trt:N061011   trt N061011 SRR1039521
percentVar <- round(100 * attr(pcaData, "percentVar"))

Build second plot, specifyinf cell line with symbol and treatment with color.

ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
  geom_point(size =3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed() +
  ggtitle("PCA with VST data")

Generalized principal component analysis (GLM-PCA) and plot

library("glmpca")
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$dex <- dds$dex
gpca.dat$cell <- dds$cell
ggplot(gpca.dat, aes(x = dim1, y = dim2, color = dex, shape = cell)) +
  geom_point(size =3) + coord_fixed() + ggtitle("glmpca - Generalized PCA")

Multidimensional scaling (MDS) for matrix of distances from VST data.

mds <- as.data.frame(colData(vsd))  %>%
         cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed() + ggtitle("MDS with VST data")

Same plot with Poisson Distance

mdsPois <- as.data.frame(colData(dds)) %>%
   cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed() + ggtitle("MDS with PoissonDistances")