Find the pathway to the external data from the airway package. List the files in this directory and in the “quants” directory.
dir <- system.file("extdata", package="airway", mustWork=TRUE)
list.files(dir)
## [1] "GSE52778_series_matrix.txt" "Homo_sapiens.GRCh37.75_subset.gtf"
## [3] "quants" "sample_table.csv"
## [5] "SraRunInfo_SRP033351.csv" "SRR1039508_subset.bam"
## [7] "SRR1039509_subset.bam" "SRR1039512_subset.bam"
## [9] "SRR1039513_subset.bam" "SRR1039516_subset.bam"
## [11] "SRR1039517_subset.bam" "SRR1039520_subset.bam"
## [13] "SRR1039521_subset.bam"
list.files(file.path(dir, "quants"))
## [1] "SRR1039508" "SRR1039509"
Create a csv with detailed information from each of the samples.
csvfile <- file.path(dir, "sample_table.csv")
coldata <- read.csv(csvfile, row.names=1, stringsAsFactors=FALSE)
coldata
## SampleName cell dex albut Run avgLength Experiment
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358
## Sample BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677
Create two new columns, names and files.
coldata <- coldata[1:2,]
coldata$names <- coldata$Run
coldata$files <- file.path(dir, "quants", coldata$names, "quant.sf.gz")
file.exists(coldata$files)
## [1] TRUE TRUE
Load tximeta and run the main function to assemble counts.
library("tximeta")
se <- tximeta(coldata)
## importing quantifications
## reading in files with read.delim (install 'readr' package for speed up)
## 1 2
## found matching transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## building TxDb with 'GenomicFeatures' package
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
## stop_codon. This information was ignored.
## OK
## generating transcript ranges
## fetching genome info for GENCODE
Check dimensions of dataframe and see the rownames.
dim(se)
## [1] 205870 2
head(rownames(se))
## [1] "ENST00000456328.2" "ENST00000450305.2" "ENST00000488147.1"
## [4] "ENST00000619216.1" "ENST00000473358.1" "ENST00000469289.1"
Summarize at gene level.
gse <- summarizeToGene(se)
## loading existing TxDb created: 2020-03-04 21:56:04
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## obtaining transcript-to-gene mapping from TxDb
## generating gene ranges
## summarizing abundance
## summarizing counts
## summarizing length
Check that rownames are gene IDs and the dimensions are reduced.
dim(gse)
## [1] 58294 2
head(rownames(gse))
## [1] "ENSG00000000003.14" "ENSG00000000005.5" "ENSG00000000419.12"
## [4] "ENSG00000000457.13" "ENSG00000000460.16" "ENSG00000000938.12"
Summary of dataframe
par(mar=c(0,0,0,0))
plot(1,1,xlim=c(0,100),ylim=c(0,100),bty="n",
type="n",xlab="",ylab="",xaxt="n",yaxt="n")
polygon(c(45,90,90,45),c(5,5,70,70),col="pink",border=NA)
polygon(c(45,90,90,45),c(68,68,70,70),col="pink3",border=NA)
text(67.5,40,"assay(s)")
text(67.5,35,'e.g. "counts", ...')
polygon(c(10,40,40,10),c(5,5,70,70),col="skyblue",border=NA)
polygon(c(10,40,40,10),c(68,68,70,70),col="skyblue3",border=NA)
text(25,40,"rowRanges")
polygon(c(45,90,90,45),c(75,75,95,95),col="palegreen",border=NA)
polygon(c(45,47,47,45),c(75,75,95,95),col="palegreen3",border=NA)
text(67.5,85,"colData")
Load whole count matrix
data(gse)
gse
## class: RangedSummarizedExperiment
## dim: 58294 8
## metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
## assays(3): counts abundance length
## rownames(58294): ENSG00000000003.14 ENSG00000000005.5 ...
## ENSG00000285993.1 ENSG00000285994.1
## rowData names(1): gene_id
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(3): names donor condition
Look at counts in first matrix
assayNames(gse)
## [1] "counts" "abundance" "length"
head(assay(gse), 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 708.164 467.962 900.992 424.368 1188.295
## ENSG00000000005.5 0.000 0.000 0.000 0.000 0.000
## ENSG00000000419.12 455.000 510.000 604.000 352.000 583.000
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 1090.668 805.929 599.337
## ENSG00000000005.5 0.000 0.000 0.000
## ENSG00000000419.12 773.999 409.999 499.000
colSums(assay(gse))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
## 21100805 19298584 26145537 15688246 25268618 31891456 19683767
## SRR1039521
## 21813903
Ranges for first five and last five genes
rowRanges(gse)
## GRanges object with 58294 ranges and 1 metadata column:
## seqnames ranges strand | gene_id
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000000003.14 chrX 100627109-100639991 - | ENSG00000000003.14
## ENSG00000000005.5 chrX 100584802-100599885 + | ENSG00000000005.5
## ENSG00000000419.12 chr20 50934867-50958555 - | ENSG00000000419.12
## ENSG00000000457.13 chr1 169849631-169894267 - | ENSG00000000457.13
## ENSG00000000460.16 chr1 169662007-169854080 + | ENSG00000000460.16
## ... ... ... ... . ...
## ENSG00000285990.1 chr14 19244904-19269380 - | ENSG00000285990.1
## ENSG00000285991.1 chr6 149817937-149896011 - | ENSG00000285991.1
## ENSG00000285992.1 chr8 47129262-47132628 + | ENSG00000285992.1
## ENSG00000285993.1 chr18 46409197-46410645 - | ENSG00000285993.1
## ENSG00000285994.1 chr10 12563151-12567351 + | ENSG00000285994.1
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
Chromosome metadata
seqinfo(rowRanges(gse))
## Seqinfo object with 25 sequences (1 circular) from hg38 genome:
## seqnames seqlengths isCircular genome
## chr1 248956422 FALSE hg38
## chr2 242193529 FALSE hg38
## chr3 198295559 FALSE hg38
## chr4 190214555 FALSE hg38
## chr5 181538259 FALSE hg38
## ... ... ... ...
## chr21 46709983 FALSE hg38
## chr22 50818468 FALSE hg38
## chrX 156040895 FALSE hg38
## chrY 57227415 FALSE hg38
## chrM 16569 TRUE hg38
Look at data frame and conditions
colData(gse)
## DataFrame with 8 rows and 3 columns
## names donor condition
## <factor> <factor> <factor>
## SRR1039508 SRR1039508 N61311 Untreated
## SRR1039509 SRR1039509 N61311 Dexamethasone
## SRR1039512 SRR1039512 N052611 Untreated
## SRR1039513 SRR1039513 N052611 Dexamethasone
## SRR1039516 SRR1039516 N080611 Untreated
## SRR1039517 SRR1039517 N080611 Dexamethasone
## SRR1039520 SRR1039520 N061011 Untreated
## SRR1039521 SRR1039521 N061011 Dexamethasone
Look at specific columns of gse
gse$donor
## [1] N61311 N61311 N052611 N052611 N080611 N080611 N061011 N061011
## Levels: N052611 N061011 N080611 N61311
gse$condition
## [1] Untreated Dexamethasone Untreated Dexamethasone Untreated
## [6] Dexamethasone Untreated Dexamethasone
## Levels: Untreated Dexamethasone
Rename variables: donor = cell and condition = dex
gse$cell <- gse$donor
gse$dex <- gse$condition
Rename levels: untreated=untrt and dexamethasone=trt
levels(gse$dex)
## [1] "Untreated" "Dexamethasone"
# when renaming levels, the order must be preserved!
levels(gse$dex) <- c("untrt", "trt")
Adjust which variable is the reference/control variable
library("magrittr")
gse$dex %<>% relevel("untrt")
gse$dex
## [1] untrt trt untrt trt untrt trt untrt trt
## Levels: untrt trt
Round fragments
round( colSums(assay(gse)) / 1e6, 1 )
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
## 21.1 19.3 26.1 15.7 25.3 31.9 19.7
## SRR1039521
## 21.8
Create analysis design (accounting for cell line, what is the effect of dex)
library("DESeq2")
dds <- DESeqDataSet(gse, design = ~ cell + dex)
## using counts and average transcript lengths from tximeta
See fragment counts
countdata <- round(assays(gse)[["counts"]])
head(countdata, 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 708 468 901 424 1188
## ENSG00000000005.5 0 0 0 0 0
## ENSG00000000419.12 455 510 604 352 583
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 1091 806 599
## ENSG00000000005.5 0 0 0
## ENSG00000000419.12 774 410 499
Save column data
coldata <- colData(gse)
countdata: a table with the fragment counts coldata: a table with information about the samples
Construct DESeqDataSet
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ cell + dex)
## converting counts to integer mode
Remove rows with minimal data to speed up functions
nrow(dds)
## [1] 58294
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep,]
nrow(dds)
## [1] 31604
Keep at least 3 samples with a count of 10 or higher
keep <- rowSums(counts(dds) >= 10) >= 3
Plot SD of each gene against the mean
lambda <- 10^seq(from = -1, to = 2, length = 1000)
cts <- matrix(rpois(1000*100, lambda), ncol = 100)
library("vsn")
meanSdPlot(cts, ranks = FALSE)
Log transformed counts
log.cts.one <- log2(cts + 1)
meanSdPlot(log.cts.one, ranks = FALSE)
Amplifies differences when values are close to 0 (because of +1 pseudocount).
VST: faster, less sensitive to outliers (n>30) rlog: works well on small datasets, better for wide range of depth (n<30)
Perform VST. Counts no longer counts
vsd <- vst(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(vsd), 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 10.105781 9.852029 10.169726 9.991545 10.424865
## ENSG00000000419.12 9.692244 9.923647 9.801921 9.798653 9.763455
## ENSG00000000457.13 9.449592 9.312186 9.362754 9.459168 9.281415
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 10.194490 10.315814 10.002177
## ENSG00000000419.12 9.874703 9.683211 9.845507
## ENSG00000000457.13 9.395937 9.477971 9.477027
colData(vsd)
## DataFrame with 8 rows and 5 columns
## names donor condition cell dex
## <factor> <factor> <factor> <factor> <factor>
## SRR1039508 SRR1039508 N61311 Untreated N61311 untrt
## SRR1039509 SRR1039509 N61311 Dexamethasone N61311 trt
## SRR1039512 SRR1039512 N052611 Untreated N052611 untrt
## SRR1039513 SRR1039513 N052611 Dexamethasone N052611 trt
## SRR1039516 SRR1039516 N080611 Untreated N080611 untrt
## SRR1039517 SRR1039517 N080611 Dexamethasone N080611 trt
## SRR1039520 SRR1039520 N061011 Untreated N061011 untrt
## SRR1039521 SRR1039521 N061011 Dexamethasone N061011 trt
Perform rlog
rld <- rlog(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(rld), 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 9.482613 9.172197 9.558383 9.346001 9.851349
## ENSG00000000419.12 8.860186 9.150196 9.000042 8.995902 8.951327
## ENSG00000000457.13 8.354790 8.166700 8.236582 8.366693 8.121781
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 9.587602 9.727248 9.357876
## ENSG00000000419.12 9.091075 8.848782 9.054384
## ENSG00000000457.13 8.282307 8.392384 8.391023
blind=false means differences bw treatment and cell lines does not contribute to expected variance-mean trend blind=true for unsupervised experiment
Plot first sample against second. Estimate size factors to account for sequencing depth, set normalized to true.
dds <- estimateSizeFactors(dds)
## using 'avgTxLength' from assays(dds), correcting for library size
df <- bind_rows(
as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
## Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
colnames(df)[1:2] <- c("x", "y")
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)
Compress differences within low count genes, since they provide little information about differential expression.
Calculate differences between samples using VST data. Different samples should be rows, different dimensions (genes) should be columns
sampleDists <- dist(t(assay(vsd)))
sampleDists
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
## SRR1039509 39.42362
## SRR1039512 32.37620 44.93748
## SRR1039513 51.09677 37.18799 41.79886
## SRR1039516 35.59642 47.54671 34.83458 52.05265
## SRR1039517 51.26314 41.58572 46.89609 40.67315 39.74268
## SRR1039520 32.38578 46.96000 30.35980 48.08669 37.07106 50.38349
## SRR1039521 51.49108 37.57383 47.17283 31.45899 52.62276 41.35941
## SRR1039520
## SRR1039509
## SRR1039512
## SRR1039513
## SRR1039516
## SRR1039517
## SRR1039520
## SRR1039521 43.01502
library("pheatmap")
library("RColorBrewer")
Visualize differences in a heatmap. Supply distances so it does not calculate on its own.
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)
Change row names to have treatment type and patient number
Calculate distances using Poisson distance
library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))
Plot heatmap
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
clustering_distance_rows = poisd$dd,
clustering_distance_cols = poisd$dd,
col = colors)
Principal components analysis (PCA): data points on 2D plane. x axis separates data points the most while y axis separates second most. There are still other dimensions of difference
plotPCA(vsd, intgroup = c("dex", "cell"))
PCA plot using ggplot. Return PCA data without building plot.
pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
## PC1 PC2 group dex cell name
## SRR1039508 -14.311369 -2.6000421 untrt:N61311 untrt N61311 SRR1039508
## SRR1039509 8.058574 -0.7500532 trt:N61311 trt N61311 SRR1039509
## SRR1039512 -9.404122 -4.3920761 untrt:N052611 untrt N052611 SRR1039512
## SRR1039513 14.497842 -4.1323833 trt:N052611 trt N052611 SRR1039513
## SRR1039516 -12.365055 11.2109581 untrt:N080611 untrt N080611 SRR1039516
## SRR1039517 9.343946 14.9115160 trt:N080611 trt N080611 SRR1039517
## SRR1039520 -10.852633 -7.7618618 untrt:N061011 untrt N061011 SRR1039520
## SRR1039521 15.032816 -6.4860576 trt:N061011 trt N061011 SRR1039521
percentVar <- round(100 * attr(pcaData, "percentVar"))
Build second plot, specifyinf cell line with symbol and treatment with color.
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed() +
ggtitle("PCA with VST data")
Generalized principal component analysis (GLM-PCA) and plot
library("glmpca")
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$dex <- dds$dex
gpca.dat$cell <- dds$cell
ggplot(gpca.dat, aes(x = dim1, y = dim2, color = dex, shape = cell)) +
geom_point(size =3) + coord_fixed() + ggtitle("glmpca - Generalized PCA")
Multidimensional scaling (MDS) for matrix of distances from VST data.
mds <- as.data.frame(colData(vsd)) %>%
cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed() + ggtitle("MDS with VST data")
Same plot with Poisson Distance
mdsPois <- as.data.frame(colData(dds)) %>%
cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed() + ggtitle("MDS with PoissonDistances")