suppressWarnings(suppressMessages(library(vroom)))
suppressWarnings(suppressMessages(library(tidyverse)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(ggrepel)))
suppressWarnings(suppressMessages(library(magrittr)))
suppressWarnings(suppressMessages(library(FactoMineR)))
suppressWarnings(suppressMessages(library(ComplexHeatmap)))
suppressWarnings(suppressMessages(library(scales)))
suppressWarnings(suppressMessages(library(modelr)))
suppressWarnings(suppressMessages(library(RColorBrewer)))
suppressWarnings(suppressMessages(library(patchwork)))
suppressWarnings(suppressMessages(library(showtext)))
1 Introduction
In scRNA-seq, various tools, such as Monocle3, provide the capability of performing pseudotime analysis. In brief, assume that there are both progenitors and more differentiated progenies in an scRNA-seq dataset. If we consider the most undeferentiated progenitors as the developmental origin (assigning them the number \(0\)), and the most differentiated progenies the developmental ends (assigning them the number 10), then we can assign each intermediate cell within them a number between \(0\) and \(10\). For cells with numbers approaching \(0\) more, they are more similar with the progenitors in terms of their RNA expression patterns and vice versa. Once we assign each cell a number (i.e. a developmental pseudotime point) and order them based on their pseudotime, we can arrange highly variable genes based on their peaking expression patterns (i.e. genes with peaking expression patterns at early stages are placed at the left, etc.).
In bulk RNA-seq, the number of samples is far less than the number of cells in scRNA-seq, where each cell can be regarded as a sample, so the gene expression dynamics along the developmental stages are not so smooth (i.e. jagged) as we have seen in scRNA-seq if we do the same analysis in bulk RNA-seq as in scRNA-seq. Therefore, to make the gene expression dynamics smoother along the developmental stages, we need to obtain more pseudo/interpolated time points than those we have.
Briefly, to achieve this goal, we need to do the following things:
Define the time scale among developmental samples based on their mutual Euclidean distances calculated from their coordinates
(Dim.1, Dim.2)
obtained from their PCA space (i.e. consider the earliest sample as the developmental origin, assign it \(0\), and for the remaining samples, use their Euclidean disntances accumulated from the origin as their developmental time points).Scale the time scale to
(0, 10)
.Fit a spline for each gene based its \((time, expression)\) pairs along the actual developmental stages, and use this fitted spline to interpolate more \((time, expression)\) pairs (using the
loess
method inmodelr
package).For each gene, obtain its PCA coordinate
(Dim.1, Dim.2)
, and then feed all possible signed combinations ofDim.1
andDim.2
of all genes toatan2
to get a sequence of values used to sort genes.Visualize gene expression dynamics along interpolated time points to pick the expected one.
2 Pipeline
<- "Arial"
font_family
<- filter(font_files(), family == font_family)
font_df font_add(
family = font_family,
regular = if ("Regular" %in% font_df[["face"]]) font_df[["file"]][font_df[["face"]] == "Regular"] else stop("no font file found"),
bold = if ("Bold" %in% font_df[["face"]]) font_df[["file"]][font_df[["face"]] == "Bold"] else NULL,
italic = if ("Bold Italic" %in% font_df[["face"]]) font_df[["file"]][font_df[["face"]] == "Bold Italic"] else NULL,
bolditalic = if ("Italic" %in% font_df[["face"]]) font_df[["file"]][font_df[["face"]] == "Italic"] else NULL
)showtext_auto()
# specify input gene expression matrix
# containing one ID column named "GeneID"
# the remaining columns are sample columns named in the form of "SampleID.Replicate" (e.g. Skin.1, Skin.2, etc.)
# SampleID must not contain "."
# Replicate must be one or more integers
<- "./data/RNA_TPM.txt"
expr_file # specify the sample levels, reflecting their actual developmental stages
<- c("DAI0", "DAI3", "DAI6", "DAI9", "DAI12")
sample_dev_order <- 500 time_points_num
<- vroom(expr_file) %>%
expr as.data.frame() %>%
set_rownames(.[["GeneID"]]) %>%
select(-all_of("GeneID")) %>%
distinct()
<- strsplit(names(expr), ".", fixed = T) %>%
sample_df do.call(rbind, .) %>%
as.data.frame() %>%
set_colnames(c("SampleID", "Replicate")) %>%
mutate(Sample = paste0(SampleID, ".", Replicate))
# calculate the mean expression value of each gene within each sample
<- data.frame(GeneID = row.names(expr))
data for (id in unique(sample_df[["SampleID"]])) {
<- filter(sample_df, SampleID == id) %>%
id_reps pull(Sample) %>%
unique()
<- data.frame(Expr = rowMeans(expr[, id_reps]))
id_mean_expr names(id_mean_expr) <- id
<- bind_cols(data, id_mean_expr)
data
}<- as.data.frame(data) %>%
data set_rownames(.[["GeneID"]]) %>%
select(-all_of("GeneID"))
# use row variances to identify the top 3000 most variable genes
# log2-trsanformation is recommended for reducing variance variation among genes
<- log2(data + 1)
data "var"]] <- apply(data, 1, var)
data[[<- data %>%
data arrange(desc(var)) %>%
slice_head(n = 3000) %>%
select(-all_of("var"))
<- data[, sample_dev_order] data
# perform PCA analysis over samples (samples as observations)
# calculate Euclidean distances among samples based on their coordinates (Dim.1, Dim.2) in sample PCA space
# obtain the developmental time scale by accumulating distances of mutual samples
<- PCA(t(data), scale.unit = T, ncp = 5, graph = F)
sample_pca <- sample_pca$ind$coord[, 1:2]
sample_pca_coords
# visualize sample positions in PCA space
<- as.data.frame(sample_pca_coords)
sample_pca_coords_vis "Sample"]] <- row.names(sample_pca_coords_vis)
sample_pca_coords_vis[[<- as.data.frame(sample_pca$eig)
sample_pca_eig_vis
ggplot(sample_pca_coords_vis, aes(Dim.1, Dim.2)) +
geom_point(size = 2) +
geom_text_repel(aes(label = Sample), size = 5, min.segment.length = 3) +
xlab(paste0("PC1 (", round(sample_pca_eig_vis["comp 1", "percentage of variance"]), "%)")) +
ylab(paste0("PC2 (", round(sample_pca_eig_vis["comp 2", "percentage of variance"]), "%)")) +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(size = 26),
axis.title.y = element_text(size = 26),
axis.text.x = element_text(size = 24),
axis.text.y = element_text(size = 24),
legend.text = element_text(size = 24),
legend.title = element_text(size = 26),
text = element_text(family = "Arial")
)
<- as.matrix(dist(sample_pca_coords, method = "euclidean"))
sample_dists
# visualize sample distances via heatmap
Heatmap(sample_dists, cluster_rows = F, cluster_columns = F)
# calculate the developmental time scale by accumulating distances of mutual samples along the actual developmental stages
<- cumsum(c(0, sapply(2:ncol(data), function(x) {
raw_timeline - 1, x]
sample_dists[x
})))# scale the raw time scale to (0, 10)
<- scales::rescale(raw_timeline, to = c(0, 10)) new_timeline
# fit a spline for each gene and obtain 500 time points by interpolation
<- as.data.frame(t(scale(t(data))))
data_scale
# interpolate more time points (e.g., 500) to make the expression dynamics smoother along the developmental stages
# based on the fitted spline for each gene (using the loess method in modelr package)
<- function(sample_value, sample_timeline, time_points_num = 500) {
pseudotime_model_fun <- data.frame(time = seq(0, 10, length.out = time_points_num))
grid <- tibble(value = sample_value, time = sample_timeline)
data <- loess(value ~ time, data)
model <- add_predictions(grid, model)
predict return(predict)
}
<- apply(data_scale, 1, pseudotime_model_fun, sample_timeline = new_timeline, time_points_num = time_points_num)
pseudotime_model_res <- lapply(pseudotime_model_res, function(x) {
res "pred"]]
x[[%>%
}) do.call(rbind, .) %>%
as.data.frame() %>%
set_colnames(pseudotime_model_res[[1]][["time"]])
# perform PCA analysis over genes
# use atan2 method to sort genes based on their coordinates (Dim.1, Dim.2) in gene PCA space
<- PCA(res, scale.unit = T, ncp = 5, graph = F)
gene_pca <- gene_pca$ind$coord[, 1:2]
gene_pca_coords <- bind_cols(res, gene_pca_coords)
res
# we have four signed combinations of Dim.1 and Dim.2
"atan2.1"]] <- atan2(res[["Dim.1"]], res[["Dim.2"]])
res[["atan2.2"]] <- atan2(res[["Dim.1"]], -res[["Dim.2"]])
res[["atan2.3"]] <- atan2(-res[["Dim.1"]], res[["Dim.2"]])
res[["atan2.4"]] <- atan2(-res[["Dim.1"]], -res[["Dim.2"]])
res[[
# sort genes based on their atan2 values in ascending order
<- arrange(res, res[["atan2.1"]])
res_order1 <- arrange(res, res[["atan2.2"]])
res_order2 <- arrange(res, res[["atan2.3"]])
res_order3 <- arrange(res, res[["atan2.4"]])
res_order4
# pick the expected one
<- Heatmap(as.matrix(res_order1[, 1:time_points_num]),
p1 cluster_rows = F,
cluster_columns = F,
show_row_names = F,
show_column_names = F,
column_title = "Order1",
heatmap_legend_param = list(title = "Order1", legend_height = unit(2, "cm")),
col = colorRampPalette(rev(brewer.pal(n = 11, name = "RdYlBu")))(100)
)<- Heatmap(as.matrix(res_order2[, 1:time_points_num]),
p2 cluster_rows = F,
cluster_columns = F,
show_row_names = F,
show_column_names = F,
column_title = "Order2",
heatmap_legend_param = list(title = "Order2", legend_height = unit(2, "cm")),
col = colorRampPalette(rev(brewer.pal(n = 11, name = "RdYlBu")))(100)
)<- Heatmap(as.matrix(res_order3[, 1:time_points_num]),
p3 cluster_rows = F,
cluster_columns = F,
show_row_names = F,
show_column_names = F,
column_title = "Order3",
heatmap_legend_param = list(title = "Order3", legend_height = unit(2, "cm")),
col = colorRampPalette(rev(brewer.pal(n = 11, name = "RdYlBu")))(100)
)<- Heatmap(as.matrix(res_order4[, 1:time_points_num]),
p4 cluster_rows = F,
cluster_columns = F,
show_row_names = F,
show_column_names = F,
column_title = "Order4",
heatmap_legend_param = list(title = "Order4", legend_height = unit(2, "cm")),
col = colorRampPalette(rev(brewer.pal(n = 11, name = "RdYlBu")))(100)
)
+ p2 + p3 + p4 p1