library(tidyverse)
library(vroom)
library(magrittr)
library(ggrepel)
library(ggprism)
library(YRUtils)
<- "/home/yangrui/mywd/wutintin_transcriptome/input"
input_dir <- "/home/yangrui/mywd/wutintin_transcriptome/output"
output_dir <- 10
top_n <- "gene_name"
gene_id_column <- "diff_flag"
diff_flag_column <- "log2FoldChange"
logfc_column <- "padj"
padj_column <- "diff_flag"
color_column <- 1
logfc_threshold <- 0.05
padj_threshold # Up, Down, No
<- c("magenta4", "cyan4", "grey25")
diff_flag_colors <- 0.5
geom_point_alpha <- 2
geom_point_size <- 1
geom_line_linewidth <- "grey25"
geom_line_color <- "dashed"
geom_line_linetype <- 1
geom_text_alpha <- 3
min_segment_length <- 5
geom_text_size <- TRUE
theme_prism_panel_border <- "Arial"
font_family <- 16
theme_base_font_size
<- list.files(input_dir, pattern = "\\.(tsv|txt|csv)$", full.names = TRUE, recursive = FALSE)
files for (file in files) {
<- vroom(file) %>%
data select(all_of(c(gene_id_column, diff_flag_column, logfc_column, padj_column))) %>%
set_colnames(c("gene_id", "diff_flag", "logFC", "padj")) %>%
distinct()
<- data %>%
anno_data filter(diff_flag != "NO") %>%
group_by(diff_flag) %>%
slice_max(abs(logFC), n = top_n)
<- count(data, diff_flag) %>%
diff_flag_count_table arrange(diff_flag)
<- paste0(paste0(
plot_title $diff_flag, ": ",
diff_flag_count_table$n
diff_flag_count_tablecollapse = "\n")
),
<- ggplot(data, aes(logFC, -log10(padj), color = diff_flag)) +
p geom_point(alpha = geom_point_alpha, size = geom_point_size) +
geom_vline(
xintercept = c(-logfc_threshold, logfc_threshold),
linewidth = geom_line_linewidth,
color = geom_line_color,
linetype = geom_line_linetype
+
) geom_hline(
yintercept = -log10(padj_threshold),
linewidth = geom_line_linewidth,
color = geom_line_color,
linetype = geom_line_linetype
+
) geom_label_repel(
data = anno_data,
mapping = aes(logFC, -log10(padj), color = diff_flag, label = gene_id),
max.overlaps = 10000, show.legend = FALSE, alpha = geom_text_alpha,
min.segment.length = min_segment_length, size = geom_text_size
+
) labs(
title = plot_title,
x = "logFC", y = "-log10(padj)",
color = paste0("padj < ", padj_threshold, "\nlogFC > ", logfc_threshold)
+
) theme_prism(border = theme_prism_panel_border, base_family = font_family, base_size = theme_base_font_size) +
theme(legend.title = element_text()) +
scale_color_manual(values = setNames(
diff_flag_colors,c(paste0(strsplit(gsub("\\.(tsv|txt|csv)$", "", basename(file)), "_vs_", fixed = TRUE)[[1]], " Up"), "NO")
))
ppreview(p, file = file.path(output_dir, gsub("\\.(tsv|txt|csv)$", ".pdf", basename(file))))
}
1 Introduction
Used to collect various R code snippets.
2 Plot volcano plot in batch
3 Visualize scRNA-Seq expression of a single gene using dot plot and violin plot of Seurat
library(Seurat)
library(tidyverse)
library(YRUtils)
<- "/data/users/yangrui/sc_omics_ref_datasets/human/dataset_2/ana/res/rna_to_umap.rds"
rna_rds_file <- "/home/yangrui/temp"
output_dir <- "BTN3A2"
target_gene_names <- c("Exc. GluN1" = "GluN1", "Exc. GluN2" = "GluN2", "Exc. GluN3" = "GluN3", "Exc. GluN4" = "GluN4", "Exc. GluN5" = "GluN5", "Exc. GluN6" = "GluN6", "Exc. GluN7" = "GluN7", "Exc. GluN8" = "GluN8", "CGE IN " = "CGE IN", "MGE IN " = "MGE IN", "Oligo" = "OPC/Oligo", "MG" = "MG")
target_cell_types
<- readRDS(rna_rds_file)
rna echo_vec(sort(unique(rna[[]]$Name)))
# dot plot
<- DotPlot(rna, features = target_gene_names, idents = target_cell_types) +
p RotatedAxis() +
scale_y_discrete(
limits = rev(target_cell_types),
labels = rev(names(target_cell_types))
+
) labs(x = NULL, y = NULL) +
theme(text = element_text(family = "Arial"))
ppreview(p, file = file.path(output_dir, "dotplot.pdf"))
# violin plot
<- scales::hue_pal()(length(target_cell_types))
ident_colors
<- VlnPlot(rna, features = target_gene_names, idents = target_cell_types, pt.size = 0, cols = ident_colors)
p <- p + geom_jitter(mapping = aes(color = ident), data = p$data, size = 1) +
p scale_x_discrete(
limits = target_cell_types,
labels = names(target_cell_types)
+
) scale_y_continuous(
expand = expansion(0),
limits = c(0, ceiling(max(p$data[[target_gene_names]])))
+
) labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(text = element_text(family = "Arial"))
ppreview(p, file = file.path(output_dir, "violinplot.pdf"))
4 LaTeX spaces
`a b` | \(a \qquad b\) | Two m widths |
`a b` | \(a \quad b\) | One m width |
a \ b |
\(a \ b\) | \(\frac{1}{3}\) m widths |
a \; b |
\(a \; b\) | \(\frac{2}{7}\) m widths |
a \, b |
\(a \, b\) | \(\frac{1}{6}\) m widths |
ab |
\(ab\) | No space |
a \! b |
\(a \! b\) | \(-\frac{1}{6}\) m widths |
\hspace{length} |
\(a \hspace{1cm} b\) | Horizontal space of specified length |
5 clusterProfiler enrichment results interpretation
超几何分布:
假定在 \(N\) 件产品中有 \(M\) 件不合理,即不合格率 \(p=\frac{M}{N}\)。
现在产品中随机抽取 \(n\) 件做检查,发现 \(k\) 件不合格品的概率为
\[P(X=k)=\frac{C^k_M C^{n-k}_{N-M}}{C^n_N},\ k=t, t+1, ...,s,\ s=\min(M, n),\ t=n-(N-M) \ge 0\]
在做 GO 富集时,记 \(N\) 为背景基因集大小(例如可用样本中所有表达的基因或 GO 数据库中的所有基因作为背景基因集),\(m\) 为某一通路下的基因数,\(n\) 为输入做富集的基因数,\(k\) 为属于该通路下的基因数,则依据超几何分布,我们可以计算如下指标:
p.adjust
: adjusted p value (i.e. adjusted \(P(X=k)\)).Count
: \(k\).Gene ratio
: \(\frac{k}{n}\).Background ratio
: \(\frac{m}{N}\).Rich factor
: \(\frac{k}{m}\).Fold enrichment
: \(\frac{\text{Gene ratio}}{\text{Background ratio}}\).
6 Sort GO terms of two clusters based on RichFactor
and p.adjust
and visualize them using dot plot
library(tidyverse)
library(vroom)
library(YRUtils)
<- "go_bp.tsv"
go_file <- c("P2-Sul Up", "P10-Sul Up")
cluster_levels <- "~/temp"
output_dir
<- vroom(go_file) %>%
go_df mutate(Cluster = factor(Cluster, levels = cluster_levels))
# classify terms into five categories based on RichFactor
<- c(paste0(cluster_levels[1], c("@Specific", "@Up")), paste0(paste0(cluster_levels, collapse = "_vs_"), "@Equal"), paste0(cluster_levels[2], c("@Up", "@Specific")))
sample_flag_levels <- go_df %>%
sample_flag_df pivot_wider(id_cols = "ID", names_from = "Cluster", values_from = "RichFactor") %>%
mutate(
SampleFlag = if_else(is.na(.[[cluster_levels[1]]]), paste0(cluster_levels[2], "@Specific"), if_else(is.na(.[[cluster_levels[2]]]), paste0(cluster_levels[1], "@Specific"), if_else(.[[cluster_levels[1]]] > .[[cluster_levels[2]]], paste0(cluster_levels[1], "@Up"), if_else(.[[cluster_levels[1]]] < .[[cluster_levels[2]]], paste0(cluster_levels[2], "@Up"), paste0(cluster_levels[1], "_vs_", cluster_levels[2], "@Equal"))))),
SampleFlag = factor(SampleFlag, levels = sample_flag_levels)
%>%
) pivot_longer(cols = !all_of(c("ID", "SampleFlag")), names_to = "Cluster", values_to = "RichFactor") %>%
na.omit() %>%
distinct()
# sort terms based on RichFactor and p.adjust
<- inner_join(sample_flag_df, distinct(select(go_df, Cluster, ID, p.adjust)), by = c("Cluster", "ID")) %>%
df arrange(
SampleFlag,case_when(
== cluster_levels[1] ~ -RichFactor,
Cluster == cluster_levels[2] ~ RichFactor,
Cluster TRUE ~ RichFactor
),case_when(
== cluster_levels[1] ~ -p.adjust,
Cluster == cluster_levels[2] ~ p.adjust,
Cluster TRUE ~ p.adjust
)
)
<- go_df %>%
go_df mutate(ID = factor(ID, levels = unique(df[["ID"]]))) %>%
arrange(ID) %>%
mutate(Description = factor(Description, levels = rev(unique(Description))))
<- ggplot(go_df, aes(Cluster, Description, size = RichFactor, color = p.adjust)) +
p geom_point() +
scale_color_gradient(low = "#e06663", high = "#327eba") +
scale_y_discrete(position = "right") +
guides(x = guide_axis(angle = 90)) +
labs(x = NULL, y = NULL) +
theme_bw(base_size = 18, base_family = "Arial")
ppreview(p, file = file.path(output_dir, "temp.pdf"))
7 Retrieve UniProt entries using REST API in batch
library(vroom)
library(tidyverse)
library(glue)
library(rvest)
# no more than 25
<- 20
max_length <- 9606
organism_id <- "https://rest.uniprot.org/uniprotkb/search?query=({accession_str})%20AND%20(active:true)%20AND%20(organism_id:{organism_id})&fields=accession,gene_primary,organism_name,organism_id,protein_name,annotation_score,protein_existence,reviewed,ft_intramem,cc_subcellular_location,ft_topo_dom,ft_transmem,go,go_p,go_c,go_f,cc_interaction&format=tsv"
url_template <- "test.tsv"
file
<- vroom(file) %>%
accessions pull(Entry) %>%
na.omit() %>%
unique()
<- split(accessions, ceiling(seq_along(accessions) / max_length))
accession_ls <- tibble()
df for (accession_vec in accession_ls) {
<- paste0(paste0("accession:", accession_vec), collapse = "%20OR%20")
accession_str <- glue(url_template)
url_instance
<- try(item_raw_text <- read_html(url_instance) %>% html_text(), silent = T)
e if ("try-error" %in% class(e)) {
message("invalid accession string: ", accession_str)
else {
} <- vroom(I(item_raw_text), delim = "\t") %>%
item_df mutate(
`Organism (ID)` = as.integer(`Organism (ID)`),
Annotation = as.integer(Annotation)
)<- bind_rows(df, item_df)
df
}
}<- distinct(df) df