set.seed(0)
library(SingleCellExperiment)
library(scran)
library(scater)
library(dplyr)
library(biomaRt)
library(sparseMatrixStats)
3 Chicken 3 Furious
Today we’ll once again continue where we left off. Some general goals for this project:
- Follow the Bioconductor OSCA Basics Book
- Maybe re-assemble from the FASTQ files using the new chicken assembly in Ensembl
I’d like to go through today and practice Feature Selection and Dimensionality Reduction.
Set up utility functions
In the future I’ll split these off into a utility file.
<- function(dir.path, exp.name) {
load.scea.data # Load in the matrix of counts
<- Matrix::readMM(
counts.mat paste0(
dir.path,'-quantification-raw-files/',
exp.name, '.aggregated_filtered_counts.mtx'
exp.name,
)|> as(Class="CsparseMatrix")
)
# Get the gene names for the matrix
<- read.csv(
counts.rows paste0(
dir.path,'-quantification-raw-files/',
exp.name, '.aggregated_filtered_counts.mtx_rows'
exp.name,
),sep='\t',
header=FALSE
$V1
)
# Get the cell ids for the matrix
<- read.csv(
counts.cols paste0(
dir.path,'-quantification-raw-files/',
exp.name, '.aggregated_filtered_counts.mtx_cols'
exp.name,
),sep='\t',
header=FALSE
$V1
)
# Get all cellwise metadata
<- read.csv(
meta.data paste0(
dir.path,'ExpDesign-', exp.name, '.tsv'
),sep='\t',
header=TRUE,
row.names='Assay'
)
rownames(counts.mat) <- counts.rows
colnames(counts.mat) <- counts.cols
# Ensure that the meta.data lines up with the cells
if (!identical(rownames(meta.data), colnames(counts.mat))) {
warning("The metadata does not line up with the cells")
}
return(
SingleCellExperiment(
assays=list(counts=counts.mat),
colData=meta.data
)
)
}
<- function(sce, mart) {
add.hgnc.names rowData(sce) <- getBM(
filters="ensembl_gene_id",
attributes=c(
"ensembl_gene_id",
"hgnc_symbol"
),values=rownames(sce),
mart=mart
)return(sce)
}
<- function(sce) {
add.mito.info rowSubset(sce, field="is.mito") <- grep("MT-", rowData(sce)$hgnc_symbol)
return(sce)
}
<- useDataset(
mart "ggallus_gene_ensembl",
useMart(
"ensembl",
host="https://apr2022.archive.ensembl.org"
)
)
<- load.scea.data("./localdata/Datasets/E-CURD-13/", "E-CURD-13") sce
After setting up our utility functions, it’s pretty easy to catch up to where we were yesterday:
<- useDataset(
mart "ggallus_gene_ensembl",
useMart(
"ensembl",
host="https://apr2022.archive.ensembl.org"
)
)<- load.scea.data("./localdata/Datasets/E-CURD-13/", "E-CURD-13") |>
sce add.hgnc.names(mart=mart) |>
add.mito.info() |>
quickPerCellQC(subsets=list(Mito=is.mito), sub.fields="subsets_Mito_percent") |>
logNormCounts()
sce
class: SingleCellExperiment
dim: 13645 7430
metadata(0):
assays(2): counts logcounts
rownames(13645): ENSGALG00000000003 ENSGALG00000000011 ...
ENSGALG00000055127 ENSGALG00000055132
rowData names(3): ensembl_gene_id hgnc_symbol is.mito
colnames(7430): SAMN11526603-AAAAAAATTCAG SAMN11526603-AAAAACAAGTAG ...
SAMN11526603-TTTTTTTGTGAG SAMN11526603-TTTTTTTTTTTT
colData names(29): Sample.Characteristic.organism.
Sample.Characteristic.Ontology.Term.organism. ... discard sizeFactor
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
We’re now going to find information on the variance of the genes, for feature selection. The OSCA book makes the following plot:
<- modelGeneVar(sce)
gene.var.data <- metadata(gene.var.data)
md <- data.frame(mean=md$mean, var=md$var) md.as.df
ggplot(md.as.df, aes(mean, var)) +
geom_point(shape=2) +
geom_line(
aes(mean, md$trend(mean), lty='Variance Trend'),
color="blue",
linewidth=2,
+
) labs(
x="Mean of Log-Counts",
y="Variance of Log-Counts"
+
) ggtitle("Mean-Variance Plot of Log-Counts") +
scale_linetype("") # This changes the legend title for some reason
This plot is showing the mean-variance trend of the logs. The trend is assumed to be the ‘uninteresting variation’, i.e. the baseline variation of the experiment (assumed to be the technical variation and the ‘boring’ biological variation). Under this assumption, the deviation from this trend represents the interesting biological variation.
But speaking of mean-variance trends, this reminds me of the fact that Poisson distributions have equal mean and variance; it’s reasonable to believe that our original (non-log) counts follow a Poisson distribution: is this correct?
<- rowMeans(assay(sce, "counts"))
means <- rowVars(assay(sce, "counts"))
vars
ggplot(
data.frame(
means=means,
vars=vars
),aes(
x=means,
y=vars
)+
) geom_point() +
geom_abline(slope=1, color='blue')
Interestingly, this is not accurate at all! The variance is way higher than the means. This could be because the counts have already been scaled by a value. I’ll have a think about this for the future. I did discover the modelGeneVarByPoisson
function, so let’s see what that does:
<- modelGeneVarByPoisson(sce)
gene.var.data <- metadata(gene.var.data)
md <- data.frame(mean=md$mean, var=md$var) md.as.df
ggplot(md.as.df, aes(mean, var)) +
geom_point(shape=2) +
geom_line(
aes(mean, md$trend(mean), lty='Variance Trend'),
color="blue",
linewidth=1,
+
) labs(
x="Mean of Log-Counts",
y="Variance of Log-Counts"
+
) ggtitle("Mean-Variance Plot of Log-Counts Poissonly") +
scale_linetype("") # This changes the legend title for some reason
Interestingly enough, this looks very accurate! So it’s plausible that I messed up somewhere in my prior plot. Of course, I don’t really understand what this plot represents at the moment - but it’s clear the predicted trend deviates somewhat from the true trend at the maximum (it oversetimates it).
For completeness, let’s peek at the most varying genes!
<- gene.var.data[order(gene.var.data$bio, decreasing=TRUE),]
genes.by.variance head(genes.by.variance)
DataFrame with 6 rows and 6 columns
mean total tech bio p.value
<numeric> <numeric> <numeric> <numeric> <numeric>
ENSGALG00000037167 1.554552 2.50414 0.848294 1.655847 3.16059e-154
ENSGALG00000035998 3.027209 2.17292 0.525832 1.647090 0.00000e+00
ENSGALG00000046616 0.717956 2.08196 0.619730 1.462226 3.02326e-224
ENSGALG00000010745 1.929346 2.16633 0.816513 1.349819 2.86377e-111
ENSGALG00000052192 2.813587 1.65282 0.577865 1.074959 2.77975e-140
ENSGALG00000015970 0.590967 1.49926 0.542383 0.956873 2.09606e-126
FDR
<numeric>
ENSGALG00000037167 8.62525e-151
ENSGALG00000035998 0.00000e+00
ENSGALG00000046616 1.03131e-220
ENSGALG00000010745 2.60508e-108
ENSGALG00000052192 5.41853e-137
ENSGALG00000015970 3.17786e-123
rowData(sce[rownames(sce) == "ENSGALG00000037167"])$hgnc_symbol
rowData(sce[rownames(sce) == "ENSGALG00000035998"])$hgnc_symbol
rowData(sce[rownames(sce) == "ENSGALG00000046616"])$hgnc_symbol
rowData(sce[rownames(sce) == "ENSGALG00000010745"])$hgnc_symbol
rowData(sce[rownames(sce) == "ENSGALG00000052192"])$hgnc_symbol
rowData(sce[rownames(sce) == "ENSGALG00000015970"])$hgnc_symbol
Most of these aren’t named, annoyingly - which made me realize that the H in “HGNC” stands for “human” 😅
If we use other sources, such as bgee, we can get a bit more info.
Bgee is a database for retrieval and comparison of gene expression patterns across multiple animal species. It provides an intuitive answer to the question “where is a gene expressed?” and supports research in cancer and agriculture as well as evolutionary biology.
ENSGALG00000037167
- No hgnc identifier
- “HIST1H103” https://bgee.org/gene/ENSGALG00000037167/
ENSGALG00000035998
- No hgnc identifier
- “HINTW” https://bgee.org/gene/ENSGALG00000035998/
ENSGALG00000046616
- “CHGB” hgnc
- “CHGB” https://bgee.org/gene/ENSGALG00000046616/
ENSGALG00000010745
- No hgnc identifier
- “HMGB2” https://bgee.org/gene/ENSGALG00000010745/
ENSGALG00000052192
- “TUBA1B” hgnc
- “TUBA1B” https://bgee.org/gene/ENSGALG00000052192/
ENSGALG00000015970
- No hgnc identifier
- “COL9A1” https://bgee.org/gene/ENSGALG00000015970/
This whole thing about chickens “not being” humans is annoying, because it throws into question my mitochondrial analysis as well. What if the reason only two mitochondrial genes is because the whole check-hgnc-name-trick failed due to lack of a valid hgnc name?
If we get the chromosome_name
tag from biomart
, this would be a more direct way to get the mitochondrial genes!
<- getBM(
gene.chromosome.map filters="ensembl_gene_id",
attributes=c(
"ensembl_gene_id",
"chromosome_name"
),values=rownames(sce),
mart=mart
)
"chromosome_name"] == "MT",] gene.chromosome.map[gene.chromosome.map[
ensembl_gene_id | chromosome_name | |
---|---|---|
<chr> | <chr> | |
8565 | ENSGALG00000029500 | MT |
8768 | ENSGALG00000030436 | MT |
9146 | ENSGALG00000032079 | MT |
9151 | ENSGALG00000032142 | MT |
9218 | ENSGALG00000032456 | MT |
9220 | ENSGALG00000032465 | MT |
9858 | ENSGALG00000035334 | MT |
10038 | ENSGALG00000036229 | MT |
10372 | ENSGALG00000037838 | MT |
11108 | ENSGALG00000041091 | MT |
11425 | ENSGALG00000042478 | MT |
11478 | ENSGALG00000042750 | MT |
11684 | ENSGALG00000043768 | MT |
And we can see that there are 13 mitochondrial genes in our dataset, much more than the two we originally found!
We’ll continue with our analysis for now, but next time we return to this dataset we’ll redo the QC steps. It’s plausible that we’ll get wildly different results, which would prove the importance of QC.
<- sce[getTopHVGs(gene.var.data, prop=0.1),]
sce sce
class: SingleCellExperiment
dim: 1160 7430
metadata(0):
assays(2): counts logcounts
rownames(1160): ENSGALG00000035998 ENSGALG00000037167 ...
ENSGALG00000011330 ENSGALG00000026203
rowData names(3): ensembl_gene_id hgnc_symbol is.mito
colnames(7430): SAMN11526603-AAAAAAATTCAG SAMN11526603-AAAAACAAGTAG ...
SAMN11526603-TTTTTTTGTGAG SAMN11526603-TTTTTTTTTTTT
colData names(29): Sample.Characteristic.organism.
Sample.Characteristic.Ontology.Term.organism. ... discard sizeFactor
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
<- sce |>
sce runPCA(ncomponents=14) |>
runUMAP(ncomponents=2, dimred='PCA') |>
runTSNE(ncomponents=2, dimred='PCA') |>
runPCA(ncomponents=2, name='PCA.tiny')
sce
class: SingleCellExperiment
dim: 1160 7430
metadata(0):
assays(2): counts logcounts
rownames(1160): ENSGALG00000035998 ENSGALG00000037167 ...
ENSGALG00000011330 ENSGALG00000026203
rowData names(3): ensembl_gene_id hgnc_symbol is.mito
colnames(7430): SAMN11526603-AAAAAAATTCAG SAMN11526603-AAAAACAAGTAG ...
SAMN11526603-TTTTTTTGTGAG SAMN11526603-TTTTTTTTTTTT
colData names(29): Sample.Characteristic.organism.
Sample.Characteristic.Ontology.Term.organism. ... discard sizeFactor
reducedDimNames(4): PCA UMAP TSNE PCA.tiny
mainExpName: NULL
altExpNames(0):
plotReducedDim(sce, dimred='UMAP') + ggtitle("UMAP")
plotReducedDim(sce, dimred='TSNE') + ggtitle("TSNE")
plotReducedDim(sce, dimred='PCA.tiny') + ggtitle("PCA")
None of these look great, but they do seem comparable to the UMAP reported by the Single Cell Expression Atlas:
We can cheat by coloring in the cells by the predicted cell types, to get a thumbnail for this post (ideally we’d like to learn how to predict these ourselves).
plotReducedDim(
sce,dimred='UMAP',
color_by="Factor.Value.inferred.cell.type...ontology.labels."
+ ggtitle("UMAP") )