We load the saved data from last time
library( tidyverse )
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library( DESeq2 )
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
##
## simplify
## The following objects are masked from 'package:base':
##
## aperm, apply
load( "lecture8.rda" )
Here are all the upregulated genes
What do all the significantly upregulated genes have in common? Are certain groups of genes overrepresented among them? The clusterProfiler
package (Yu et al., Omics 16:284, 2012) allows us to compare with many data bases of gene groups (“categories”).
library( clusterProfiler )
##
## clusterProfiler v3.10.1 For help: https://guangchuangyu.github.io/software/clusterProfiler
##
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:DelayedArray':
##
## simplify
## The following object is masked from 'package:purrr':
##
## simplify
Let’s start with the most popular of these group collections, the Gene Ontology (Nucl Acid Res, 45:D331, 2017). It is divided into three sub-ontologies, for cellular components (“CC”), molecular functions (“MF”), and biological processes (“BP”). The assignment of human genes to the GO catogies is one of the many gene annotation information in Bioconductor’s Homo sapients organism database package
library( org.Hs.eg.db )
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
We can ask clusterProfiler to check for enrichment of the significantly upregulated genes in the CC sub-ontology:
res %>%
filter( signif, log2FoldChange > 0 ) %>%
pull( "EnsemblID" ) %>%
enrichGO(
OrgDb = org.Hs.eg.db,
keyType = 'ENSEMBL',
ont = "CC",
universe = ( res %>% filter( !is.na(padj) ) %>% pull( "EnsemblID" ) ) ) ->
egoCC
egoCC
## #
## # over-representation test
## #
## #...@organism Homo sapiens
## #...@ontology CC
## #...@keytype ENSEMBL
## #...@gene chr [1:807] "ENSG00000008130" "ENSG00000116285" "ENSG00000162426" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05
## #...59 enriched terms found
## 'data.frame': 59 obs. of 9 variables:
## $ ID : chr "GO:0015629" "GO:0031012" "GO:0062023" "GO:0070161" ...
## $ Description: chr "actin cytoskeleton" "extracellular matrix" "collagen-containing extracellular matrix" "anchoring junction" ...
## $ GeneRatio : chr "66/765" "56/765" "47/765" "74/765" ...
## $ BgRatio : chr "336/11129" "281/11129" "210/11129" "465/11129" ...
## $ pvalue : num 3.20e-15 2.36e-13 2.59e-13 5.08e-12 5.34e-12 ...
## $ p.adjust : num 1.58e-12 4.27e-11 4.27e-11 5.27e-10 5.27e-10 ...
## $ qvalue : num 1.30e-12 3.50e-11 3.50e-11 4.33e-10 4.33e-10 ...
## $ geneID : chr "ENSG00000162458/ENSG00000169504/ENSG00000131236/ENSG00000117519/ENSG00000143549/ENSG00000143322/ENSG00000162704"| __truncated__ "ENSG00000142871/ENSG00000060718/ENSG00000143382/ENSG00000135862/ENSG00000116962/ENSG00000049323/ENSG00000196975"| __truncated__ "ENSG00000060718/ENSG00000143382/ENSG00000135862/ENSG00000116962/ENSG00000049323/ENSG00000196975/ENSG00000204262"| __truncated__ "ENSG00000162458/ENSG00000131236/ENSG00000077254/ENSG00000162614/ENSG00000117519/ENSG00000162704/ENSG00000143878"| __truncated__ ...
## $ Count : int 66 56 47 74 73 63 62 62 31 30 ...
## #...Citation
## Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
## clusterProfiler: an R package for comparing biological themes among
## gene clusters. OMICS: A Journal of Integrative Biology
## 2012, 16(5):284-287
About the universe
option: This is the list of all genes for whcih we had enought data to actually test for differential expression. We use thse for which an adjusted p value has been reported. This is non-obvious, and we will have to discuss p-value adjustment and independent hypothesis filtering before explaining.
clusterProfile offers several visualization methods. Here is one:
dotplot(egoCC)
GO categories are organized in a DAG, so, we should look at that, too:
goplot( egoCC )
Be sure to check out the clusterProfiler vignette to learn more about the package’s possibilities:
openVignette( "clusterProfiler" )
Let’s look at one, emapplot
, which shows us how much overlap there is between related categories
emapplot( egoCC )
Also, don’t forget to also check enrichment for the other two sub-ontologies, and for the down-regulated gene.
There are other useful gene categories, for example the Moleculare Signature Database (MSigDb) (Liberson et al., Bioinformatics, 27:1739, 2011), and within it, the Hallmark Gene Set Set Collection (Liberzon et al., Cell Systems, 1:417, 2015)
Let’s download the Hallmark collection from the MsigDb web page. It’s the file h.all.v6.2.symbols.gmt
. As always, have a look at the file first. You’ll notice that each line starts with the name of a gene set, followed by a web link to a description and then a list of gene names.
The read.gmt
function of clusterProfiler can read such files.
gmtH <- read.gmt( "RNASeq/airway/h.all.v6.2.symbols.gmt" )
## Loading required package: GSEABase
## Loading required package: annotate
## Loading required package: XML
## Loading required package: graph
##
## Attaching package: 'graph'
## The following object is masked from 'package:XML':
##
## addNode
## The following object is masked from 'package:stringr':
##
## boundary
head(gmtH)
Instead of the enrichGO
function, we now use clusterProfiler’s more generic enricher
function:
enrH <- enricher(
gene = ( res %>% filter( signif & log2FoldChange>0 ) %>% pull( "Gene name" ) ),
TERM2GENE = gmtH,
universe = ( res %>% filter( !is.na(padj) ) %>% pull( "Gene name" ) )
)
dotplot( enrH )
NEXT: Do we believe that? Do it by hand.
gmtH %>%
filter( ont == "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION" ) %>%
pull( gene ) ->
emtGenes
res %>%
ggplot( ) +
geom_point(
aes(
x = baseMean,
y = log2FoldChange,
col = interaction(
signif,
`Gene name` %in% emtGenes )
)
) +
scale_x_log10() +
scale_y_continuous( oob = scales::squish )
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 32275 rows containing missing values (geom_point).
Explaining the test:
res %>%
filter( !is.na( padj ) ) %>% # universe
mutate(
signifUp = signif & log2FoldChange > 0,
emtGene = `Gene name` %in% emtGenes ) %>%
group_by( signifUp, emtGene ) %>%
tally() %>%
spread( emtGene, n ) %>%
column_to_rownames( "signifUp" ) %>%
fisher.test
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 3.662834 7.452288
## sample estimates:
## odds ratio
## 5.265303
Which genes react most strongly to dexamethasone? Not necessarily the ones at the top of the MA plot:
plotMA( results(dds), ylim = c( -6, 6 ) )
This is because fold changes are exaggerated for weakly expressed or otherwise variable genes. We can compensate by shrinking the reported log fold changes to “trade bias for variance”. We will discuss this in more detail next time.
res2 <- lfcShrink( dds, coef = "dexamethasoneTRUE", type = "apeglm" )
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res2
## log2 fold change (MAP): dexamethasoneTRUE
## Wald test p-value: dexamethasoneTRUE
## DataFrame with 58735 rows and 5 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000223972 0.899451678871487 -0.0129051366222438 0.266921725783118
## ENSG00000227232 18.9656724190172 0.0185572500050121 0.225633514341154
## ENSG00000278267 0.774339343863466 0.00244152679494093 0.266664114487098
## ENSG00000243485 0.16459638610649 -0.000203974450421599 0.268046225450947
## ENSG00000284332 0 NA NA
## ... ... ... ...
## ENSG00000271254 63.5947117488586 -0.729915241895625 0.282292451652418
## ENSG00000275405 0.0830439460944298 -0.00448019687817336 0.268215161402206
## ENSG00000275987 0 NA NA
## ENSG00000277475 0 NA NA
## ENSG00000268674 0 NA NA
## pvalue padj
## <numeric> <numeric>
## ENSG00000223972 0.755255553053589 NA
## ENSG00000227232 0.882149504421758 0.984774363909095
## ENSG00000278267 0.912208743834165 NA
## ENSG00000243485 0.978310406820466 NA
## ENSG00000284332 NA NA
## ... ... ...
## ENSG00000271254 0.000848220460400001 0.0126618400005116
## ENSG00000275405 0.880740491925834 NA
## ENSG00000275987 NA NA
## ENSG00000277475 NA NA
## ENSG00000268674 NA NA
plotMA( res2, ylim = c( -6, 6 ) )
Let’s look at the new top genes
res2 %>%
as_tibble( rownames = "EnsemblID" ) %>%
left_join(
read_tsv( "RNASeq/airway/mart_export.txt" ),
by = c( "EnsemblID" = "Gene stable ID" ) ) %>%
arrange( -log2FoldChange )
## Parsed with column specification:
## cols(
## `Gene stable ID` = col_character(),
## `Gene name` = col_character(),
## `Chromosome/scaffold name` = col_character(),
## `Gene description` = col_character(),
## `Gene start (bp)` = col_double(),
## `Gene end (bp)` = col_double()
## )
We can also redo our interactvie plot
library( rlc )
openPage( useViewer=FALSE, layout="table2x2" )
## WebSocket opened
## Warning in func(req): File '/favicon.ico' is not found
lc_scatter(
dat(
x = res2$baseMean,
y = res2$log2FoldChange,
colorValue = !is.na(res2$padj) & res2$padj < .1,
label = res2$`Gene name`,
on_click = function(k) { gene <<- k; updateCharts("A2") }
),
place = "A1",
logScaleX = 10,
size = 2,
colourLegendTitle = "significant"
)
## Chart 'A1' added.
## Layer 'Layer1' is added to chart 'A1'.
lc_scatter(
dat(
x = colData(dds)$cell_line,
y = log10( 1 + counts( dds, normalized=TRUE )[ gene, ] ),
colourValue = colData(dds)$treatment
),
place = "A2",
domainY = c( 0, 5 )
)
## Chart 'A2' added.
## Layer 'Layer1' is added to chart 'A2'.
How about now using just the genes with LFC > 1.5 in an GO enrichment analysis?
res2 %>%
as_tibble( rownames = "EnsemblID" ) %>%
filter( log2FoldChange > 1.5 ) %>%
pull( "EnsemblID" ) %>%
enrichGO(
OrgDb = org.Hs.eg.db,
keyType = 'ENSEMBL',
ont = "CC",
universe = ( res %>% filter( baseMean > 8 ) %>% pull( "EnsemblID" ) ) ) ->
egoCC2
egoCC2
## #
## # over-representation test
## #
## #...@organism Homo sapiens
## #...@ontology CC
## #...@keytype ENSEMBL
## #...@gene chr [1:130] "ENSG00000116285" "ENSG00000171819" "ENSG00000162493" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05
## #...2 enriched terms found
## 'data.frame': 2 obs. of 9 variables:
## $ ID : chr "GO:0031012" "GO:0062023"
## $ Description: chr "extracellular matrix" "collagen-containing extracellular matrix"
## $ GeneRatio : chr "16/120" "11/120"
## $ BgRatio : chr "262/10264" "198/10264"
## $ pvalue : num 5.94e-08 1.94e-05
## $ p.adjust : num 1.28e-05 2.09e-03
## $ qvalue : num 1.26e-05 2.05e-03
## $ geneID : chr "ENSG00000142871/ENSG00000060718/ENSG00000081052/ENSG00000157150/ENSG00000163661/ENSG00000152583/ENSG00000138829"| __truncated__ "ENSG00000060718/ENSG00000081052/ENSG00000152583/ENSG00000138829/ENSG00000196569/ENSG00000127083/ENSG00000262655"| __truncated__
## $ Count : int 16 11
## #...Citation
## Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
## clusterProfiler: an R package for comparing biological themes among
## gene clusters. OMICS: A Journal of Integrative Biology
## 2012, 16(5):284-287
How about the hallmark set?
res2 %>%
as_tibble( rownames = "EnsemblID" ) %>%
filter( log2FoldChange > 1.5 ) %>%
left_join(
read_tsv( "RNASeq/airway/mart_export.txt" ),
by = c( "EnsemblID" = "Gene stable ID" ) ) %>%
pull( "Gene name" ) %>%
enricher(
TERM2GENE = gmtH,
universe = ( res %>% filter( baseMean > 8 ) %>% pull( "Gene name" ) ) ) ->
egoH2
## Parsed with column specification:
## cols(
## `Gene stable ID` = col_character(),
## `Gene name` = col_character(),
## `Chromosome/scaffold name` = col_character(),
## `Gene description` = col_character(),
## `Gene start (bp)` = col_double(),
## `Gene end (bp)` = col_double()
## )
egoH2
## #
## # over-representation test
## #
## #...@organism UNKNOWN
## #...@ontology UNKNOWN
## #...@gene chr [1:130] "ERRFI1" "ANGPTL7" "PDPN" "TRNP1" "NEXN" "CYR61" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05
## #...1 enriched terms found
## 'data.frame': 1 obs. of 9 variables:
## $ ID : chr "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION"
## $ Description: chr "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION"
## $ GeneRatio : chr "11/61"
## $ BgRatio : chr "181/3016"
## $ pvalue : num 8e-04
## $ p.adjust : num 0.0296
## $ qvalue : num 0.0278
## $ geneID : chr "CYR61/COL11A1/IGFBP2/OXTR/PTX3/FBN2/LAMA2/NNMT/FZD8/THBS1/FSTL3"
## $ Count : int 11
## #...Citation
## Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
## clusterProfiler: an R package for comparing biological themes among
## gene clusters. OMICS: A Journal of Integrative Biology
## 2012, 16(5):284-287
Instead of a Fisher test, we can also do a Kolmogorov-Smirnov-style test, which is independent of a threshold, by ranking the genes by shrunken fold change.
res2 %>%
as_tibble( rownames = "EnsemblID" ) %>%
filter( baseMean > 8 ) %>%
left_join(
read_tsv( "RNASeq/airway/mart_export.txt" ),
by = c( "EnsemblID" = "Gene stable ID" ) ) %>%
arrange( -log2FoldChange ) %>%
dplyr::select( `EnsemblID`, log2FoldChange ) %>%
deframe %>%
gseGO( OrgDb = org.Hs.eg.db,
keyType = 'ENSEMBL',
ont = "CC",
nPerm = 1000 ) ->
gseCC
## Parsed with column specification:
## cols(
## `Gene stable ID` = col_character(),
## `Gene name` = col_character(),
## `Chromosome/scaffold name` = col_character(),
## `Gene description` = col_character(),
## `Gene start (bp)` = col_double(),
## `Gene end (bp)` = col_double()
## )
## preparing geneSet collections...
## GSEA analysis...
## no term enriched under specific pvalueCutoff...