library(igraph)
library(ggplot2)
Fixed antGLasso
Recap
In previous blog posts we looked at estimating dependencies between microbes in a microbiome (Dataset and Baseline Methodology: Vincent Prost 2021). We saw that my custom method, antGLasso
, did not perform as well as the method we compared against - in fact, it seemed as though antGLasso
was saying nothing of value!
After much investigation, this was due to an error in my implementation, which I have now fixed. Let’s see how it compares, using their assortativity metric!
Load in the Data
Code here
antGLasso
<- as.matrix(
antGLasso.mat read.csv(
"./localdata/antGlasso-output-iter-raws--fast-test.csv",
header=FALSE,
col.names=paste0("C", 1:565)
) )
# These bounds were chosen by eye.
#upper.bound <- 0.15
#lower.bound <- 0.05
<- 0.6
upper.bound <- 0.07
lower.bound sum(abs(antGLasso.mat) > upper.bound) / 2 - 282.5
sum(abs(antGLasso.mat) > lower.bound) / 2 - 282.5
<- exp(0:19 * (log(upper.bound) - log(lower.bound)) / 19 + log(lower.bound)) antGLasso.lambdas
# Construct the regularized antGLasso matrices
<- function(mat., threshold) {
threshold.matrix <- matrix(0, dim(mat.)[[1]], dim(mat.)[[2]])
mat abs(mat.) < threshold] = 0
mat[abs(mat.) > threshold] = 1
mat[diag(mat) <- 0
return(mat)
}<- function(mat) {
num.elements sum(mat != 0)
}<- lapply(
antGLasso.path
antGLasso.lambdas,function(thresh) threshold.matrix(antGLasso.mat, thresh)
)
ZiLN
# Load the data
load("localdata/ll_deep.rda")
# Rename the taxmat columns to something more informative
colnames(taxmat) <- c(
"Domain",
"Phylum",
"Class",
"Order",
"Family",
"Genus"
)
# Load the libraries
source("./localdata/Zi-LN-master/inference.R")
source("./localdata/Zi-LN-master/utils/utils.R")
library("huge")
library("igraph")
# Get a boolean 1135x3957 matrix of whether the species
# was found in the person or not
<- counts > 0
nonzeros
# Get the number of distinct people that possessed each species
<- apply(nonzeros, 2, sum)
num.nonzeros
# Get the total amount of people
<- dim(counts)[1]
total.cells
# Only keep the species who appear in more than 20% of the people
<- (num.nonzeros / total.cells) > 0.2
keep.indices <- as.matrix(counts[, keep.indices])
counts_el write.csv(counts_el, "./localdata/filtered-raw-counts-ziln.csv")
<- taxmat[keep.indices,]
taxmat_el
# Get the zs
options(warn = -1) # turn warnings off because otherwise it's gonna scream...
<- infer_Z(counts_el) zs
# Get the matrix for ZiLN methodology
<- 10^seq(-0.1, -1.1, by=-0.05)
ziln.lambdas <- huge(zs, lambda=ziln.lambdas)$path ziln.path
Conducting Meinshausen & Buhlmann graph estimation (mb)....done
num.elements(ziln.path[[1]])
num.elements(ziln.path[[20]])
Plotting Utility Functions
Code here
<- function(adjacency.graph, taxmat, taxa.level) {
get.assortativity.at.level <- as.integer(as.factor(taxmat[, taxa.level]))
groups return(assortativity(adjacency.graph, groups))
}
<- function(adjacency.graph, taxmat) {
get.assortativity.at.levels <- function(taxa.level) get.assortativity.at.level(
curried.assortativity
adjacency.graph,
taxmat,
taxa.level
)return(
lapply(
colnames(taxmat),
curried.assortativity2:length(colnames(taxmat))]
)[
) }
<- function(path, taxmat, lambdas) {
plot.all.assortativities <- lapply(
graphs lapply(path, graph.adjacency),
as.undirected
)<- lapply(
assortativities
graphs,function(graph) get.assortativity.at.levels(graph, taxmat)
)
# Remove first element as full of NaNs
<- assortativities[2:length(assortativities)]
assortativities <- lambdas[2:length(lambdas)]
lambdas.short
.1 <- as.numeric(lapply(assortativities, function(l) l[[1]]))
assortativities.2 <- as.numeric(lapply(assortativities, function(l) l[[2]]))
assortativities.3 <- as.numeric(lapply(assortativities, function(l) l[[3]]))
assortativities.4 <- as.numeric(lapply(assortativities, function(l) l[[4]]))
assortativities.5 <- as.numeric(lapply(assortativities, function(l) l[[5]]))
assortativitiesggplot(
data.frame(assortativities.1),
aes(x=lambdas.short)
+
) geom_line(aes(y = assortativities.1, color = "Phylum")) +
geom_line(aes(y = assortativities.2, color = "Class")) +
geom_line(aes(y = assortativities.3, color = "Order")) +
geom_line(aes(y = assortativities.4, color = "Family")) +
geom_line(aes(y = assortativities.5, color = "Genus")) +
scale_colour_manual("",
breaks = c("Phylum", "Class", "Order", "Family", "Genus"),
values = c("black", "blue", "maroon", "red", "orange")
+
) theme(legend.position = "top") +
labs(x = "Regularization parameter lambda", y = "Assortativities") +
ggtitle("Assortativities at different taxonomic levels")
}
<- function(paths, taxmat, line.names) {
plot.compared.assortativities <- ggplot()
final.plot <- 10000000
min.edges <- 0
max.edges for (i in 1:length(paths)) {
<- paths[[i]]
path <- lapply(
graphs lapply(path, graph.adjacency),
as.undirected
)<- lapply(
assortativities
graphs,function(graph) get.assortativity.at.levels(graph, taxmat)
)
# Remove first element as full of NaNs
#assortativities <- assortativities[2:length(assortativities)]
#lambdas.short <- lambdas[2:length(lambdas)]
<- sapply(path, num.elements)
x.sparsity <- min(min.edges, min(x.sparsity))
min.edges <- max(max.edges, max(x.sparsity))
max.edges
.1 <- as.numeric(lapply(assortativities, function(l) l[[1]]))
assortativities.2 <- as.numeric(lapply(assortativities, function(l) l[[2]]))
assortativities.3 <- as.numeric(lapply(assortativities, function(l) l[[3]]))
assortativities.4 <- as.numeric(lapply(assortativities, function(l) l[[4]]))
assortativities.5 <- as.numeric(lapply(assortativities, function(l) l[[5]]))
assortativities
<- final.plot +
final.plot geom_line(
aes_string(
x=x.sparsity,
y=assortativities.1,
color=shQuote("Phylum"),
linetype=line.names[[i]]
)+
) geom_line(
aes_string(
x=x.sparsity,
y=assortativities.2,
color=shQuote("Class"),
linetype=line.names[[i]]
)+
) geom_line(
aes_string(
x=x.sparsity,
y=assortativities.3,
color=shQuote("Order"),
linetype=line.names[[i]]
)+
) geom_line(
aes_string(
x=x.sparsity,
y=assortativities.4,
color=shQuote("Family"),
linetype=line.names[[i]]
)+
) geom_line(
aes_string(
x=x.sparsity,
y=assortativities.5,
color=shQuote("Genus"),
linetype=line.names[[i]]
)
)
}
+
final.plot scale_x_log10() +
scale_colour_manual("",
breaks = c("Phylum", "Class", "Order", "Family", "Genus"),
values = c("black", "blue", "maroon", "red", "orange")
+
) theme(legend.position = "top") +
labs(x = "Number of Edges", y = "Assortativities") +
ggtitle("Assortativities at different taxonomic levels")# +
#xlim(min.edges, max.edges)
}
Make Plots
plot.compared.assortativities(
list(antGLasso.path, ziln.path),
taxmat_el,list(shQuote("antGLasso"), shQuote("ziln"))
)
This looks wonderful! antGLasso
and ziln
perform similarly.
(Don’t try to interpret the low-edges happenings on the graph, it’s very noisy. Vincent Prost (2021) considered the number of edges to be ~1200, which conveniently is about the area where our algorithm clearly outperforms it on the low-level taxonomies!1)
References
Footnotes
Assuming you believe assortativity is something to be maximized, whereas really it shouldn’t be (then the graphs would be boring, it would only be useful for predicting phylogeny of unknown species) - instead assortativity is just a validation measure to show that our graphs are sensible.↩︎