set.seed(0)
library(SingleCellExperiment)
library(scran)
library(scater)
library(dplyr)
library(biomaRt)
Chicken 2: Cock-a-doodle-doo
Today we’ll continue off from last time, looking at the data from the paper Feregrino et al. (2019). We didn’t get much done yesterday because of a side-quest involving chicken breeds.
Since every dataset from the Single Cell Expression Atlas has the same form, I figured it might be convenient to write a function to handle it. In the future I’ll refactor this into a separate script on this blog, instead of repeating it every post.
<- function(dir.path, exp.name) {
load.scea.data # Load in the matrix of counts
<- Matrix::readMM(
counts.mat paste0(
dir.path,'-quantification-raw-files/',
exp.name, '.aggregated_filtered_counts.mtx'
exp.name,
)
)
# Get the gene names for the matrix
<- read.csv(
counts.rows paste0(
dir.path,'-quantification-raw-files/',
exp.name, '.aggregated_filtered_counts.mtx_rows'
exp.name,
),sep='\t',
header=FALSE
$V1
)
# Get the cell ids for the matrix
<- read.csv(
counts.cols paste0(
dir.path,'-quantification-raw-files/',
exp.name, '.aggregated_filtered_counts.mtx_cols'
exp.name,
),sep='\t',
header=FALSE
$V1
)
# Get all cellwise metadata
<- read.csv(
meta.data paste0(
dir.path,'ExpDesign-', exp.name, '.tsv'
),sep='\t',
header=TRUE,
row.names='Assay'
)
rownames(counts.mat) <- counts.rows
colnames(counts.mat) <- counts.cols
# Ensure that the meta.data lines up with the cells
if (!identical(rownames(meta.data), colnames(counts.mat))) {
warning("The metadata does not line up with the cells")
}
return(
SingleCellExperiment(
assays=list(counts=counts.mat),
colData=meta.data
)
) }
<- load.scea.data('./localdata/Datasets/E-CURD-13/', 'E-CURD-13')
sce sce
class: SingleCellExperiment
dim: 13645 7688
metadata(0):
assays(1): counts
rownames(13645): ENSGALG00000000003 ENSGALG00000000011 ...
ENSGALG00000055127 ENSGALG00000055132
rowData names(0):
colnames(7688): SAMN11526603-AAAAAAATTCAG SAMN11526603-AAAAACAAGTAG ...
SAMN11526603-TTTTTTTGTGAG SAMN11526603-TTTTTTTTTTTT
colData names(18): Sample.Characteristic.organism.
Sample.Characteristic.Ontology.Term.organism. ...
Factor.Value.inferred.cell.type...ontology.labels.
Factor.Value.Ontology.Term.inferred.cell.type...ontology.labels.
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
And then, for the purposes of future quality control, we pull the mitochondrial genes using the fruits of our labor from yesterday.
<- useDataset(
mart.archive "ggallus_gene_ensembl",
useMart(
"ensembl",
host="https://apr2022.archive.ensembl.org"
)
)<- getBM(
gene.to.symbol.map filters="ensembl_gene_id",
attributes=c(
"ensembl_gene_id",
"hgnc_symbol"
),values=rownames(sce),
mart=mart.archive
)
gene.to.symbol.map[apply(
"hgnc_symbol"],
gene.to.symbol.map[1,
function(x) grepl("^MT-", x)
),$ensembl_gene_id ]
- 'ENSGALG00000032142'
- 'ENSGALG00000043768'
rowData(sce) <- gene.to.symbol.map
<- grep("MT-", rowData(sce)$hgnc_symbol)
is.mito sce
class: SingleCellExperiment
dim: 13645 7688
metadata(0):
assays(1): counts
rownames(13645): ENSGALG00000000003 ENSGALG00000000011 ...
ENSGALG00000055127 ENSGALG00000055132
rowData names(2): ensembl_gene_id hgnc_symbol
colnames(7688): SAMN11526603-AAAAAAATTCAG SAMN11526603-AAAAACAAGTAG ...
SAMN11526603-TTTTTTTGTGAG SAMN11526603-TTTTTTTTTTTT
colData names(18): Sample.Characteristic.organism.
Sample.Characteristic.Ontology.Term.organism. ...
Factor.Value.inferred.cell.type...ontology.labels.
Factor.Value.Ontology.Term.inferred.cell.type...ontology.labels.
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
And now we can run the standard scuttle::perCellQCMetrics
to get some basic info on the cell quality.
<- perCellQCMetrics(sce, subsets=list(Mito=is.mito)) per.cell.QC
per.cell.QC
DataFrame with 7688 rows and 6 columns
sum detected subsets_Mito_sum
<numeric> <numeric> <numeric>
SAMN11526603-AAAAAAATTCAG 972 679 8
SAMN11526603-AAAAACAAGTAG 1706 1050 9
SAMN11526603-AAAAACCTGCAT 764 554 4
SAMN11526603-AAAAACTCGTAA 1102 794 15
SAMN11526603-AAAAAGGATTCG 1393 844 11
... ... ... ...
SAMN11526603-TTTTTGTTCGGG 2416 1225 14
SAMN11526603-TTTTTTAGAGGG 2510 1332 16
SAMN11526603-TTTTTTTGCCCT 1719 1068 5
SAMN11526603-TTTTTTTGTGAG 898 614 4
SAMN11526603-TTTTTTTTTTTT 869 750 5
subsets_Mito_detected subsets_Mito_percent total
<numeric> <numeric> <numeric>
SAMN11526603-AAAAAAATTCAG 2 0.823045 972
SAMN11526603-AAAAACAAGTAG 2 0.527550 1706
SAMN11526603-AAAAACCTGCAT 2 0.523560 764
SAMN11526603-AAAAACTCGTAA 2 1.361162 1102
SAMN11526603-AAAAAGGATTCG 2 0.789663 1393
... ... ... ...
SAMN11526603-TTTTTGTTCGGG 2 0.579470 2416
SAMN11526603-TTTTTTAGAGGG 2 0.637450 2510
SAMN11526603-TTTTTTTGCCCT 2 0.290867 1719
SAMN11526603-TTTTTTTGTGAG 2 0.445434 898
SAMN11526603-TTTTTTTTTTTT 2 0.575374 869
sum(counts(sce[,1]))
sum(counts(sce[,1])>0)
We can see that the sum
and detected
are the library size and the number of unique genes. It also gives us info on specific subsets, which is why we passed in the mitochondrial subset info.
<- addPerCellQCMetrics(sce, subsets=list(Mito=is.mito)) sce
Using this, we can work out which cells to discard:
<- perCellQCFilters(
reasons
sce, sub.fields=c("subsets_Mito_percent")
)
reasons
DataFrame with 7688 rows and 4 columns
low_lib_size low_n_features high_subsets_Mito_percent discard
<outlier.filter> <outlier.filter> <outlier.filter> <logical>
1 FALSE FALSE FALSE FALSE
2 FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE FALSE
4 FALSE FALSE TRUE TRUE
5 FALSE FALSE FALSE FALSE
... ... ... ... ...
7684 FALSE FALSE FALSE FALSE
7685 FALSE FALSE FALSE FALSE
7686 FALSE FALSE FALSE FALSE
7687 FALSE FALSE FALSE FALSE
7688 FALSE FALSE FALSE FALSE
I quite like the scuttle::perCellQCFilters
function, since it gives a list of reasons why it would discard each cell. low_lib_size
and low_n_features
may for example correspond to fake cells (empty droplets with background RNA) or doublets and high_subsets_Mito_percent
corresponds to overly mitochondrial data.
In fact, there’s even some hidden metadata attached in the form of the chosen thresholds:
attr(reasons$low_lib_size, "thresholds")
- lower
- 406.582871569237
- higher
- Inf
print(paste0("Discarding ", sum(reasons$discard)))
<- sce[, !reasons$discard] sce
[1] "Discarding 258"
logNormCounts(sce)
class: SingleCellExperiment
dim: 13645 7430
metadata(0):
assays(2): counts logcounts
rownames(13645): ENSGALG00000000003 ENSGALG00000000011 ...
ENSGALG00000055127 ENSGALG00000055132
rowData names(2): ensembl_gene_id hgnc_symbol
colnames(7430): SAMN11526603-AAAAAAATTCAG SAMN11526603-AAAAACAAGTAG ...
SAMN11526603-TTTTTTTGTGAG SAMN11526603-TTTTTTTTTTTT
colData names(25): Sample.Characteristic.organism.
Sample.Characteristic.Ontology.Term.organism. ... total sizeFactor
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):