Misc R code – NeuroBorder

1 Introduction

This page collects miscellaneous R code snippets.

2 Plot volcano plot in batch

library(tidyverse)
library(vroom)
library(magrittr)
library(ggrepel)
library(ggprism)
library(YRUtils)

# ---- Configuration ----
# Directory scanned (non-recursively) for differential expression tables,
# and directory that receives one PDF per input table.
input_dir <- "/home/yangrui/mywd/wutintin_transcriptome/input"
output_dir <- "/home/yangrui/mywd/wutintin_transcriptome/output"
# Maximum number of genes labelled per non-"NO" diff_flag group.
top_n <- 10
# Names of the columns to read from each input table.
gene_id_column <- "gene_name"
diff_flag_column <- "diff_flag"
logfc_column <- "log2FoldChange"
padj_column <- "padj"
# NOTE(review): color_column is not referenced anywhere below; points are
# always colored by the diff_flag column.
color_column <- "diff_flag"
# Thresholds drawn as dashed guide lines (vertical at +/- logfc_threshold,
# horizontal at -log10(padj_threshold)) and shown in the legend title.
logfc_threshold <- 1
padj_threshold <- 0.05
# Colors are assigned, in this order, to "<A> Up", "<B> Up" and "NO",
# where A and B are taken from a file name of the form A_vs_B.<ext>.
diff_flag_colors <- c("magenta4", "cyan4", "grey25")
geom_point_alpha <- 0.5
geom_point_size <- 2
geom_line_linewidth <- 1
geom_line_color <- "grey25"
geom_line_linetype <- "dashed"
geom_text_alpha <- 1
min_segment_length <- 3
geom_text_size <- 5
theme_prism_panel_border <- TRUE
font_family <- "Arial"
theme_base_font_size <- 16

# ---- One volcano plot per input table ----
# The extension pattern is shared by file discovery and output naming.
table_ext_pattern <- "\\.(tsv|txt|csv)$"
files <- list.files(input_dir, pattern = table_ext_pattern, full.names = TRUE, recursive = FALSE)
for (file in files) {
    # The two compared groups are encoded in the file name as A_vs_B.
    file_stem <- gsub(table_ext_pattern, "", basename(file))
    comparison_groups <- strsplit(file_stem, "_vs_", fixed = TRUE)[[1]]
    if (length(comparison_groups) != 2) {
        # Without exactly two groups the three colors cannot be named
        # correctly, so skip the file instead of drawing a mis-colored plot.
        warning(
            "skipping file whose name is not of the form A_vs_B: ", file,
            call. = FALSE
        )
        next
    }
    color_map <- setNames(
        diff_flag_colors,
        c(paste0(comparison_groups, " Up"), "NO")
    )

    # Keep only the needed columns and give them fixed internal names.
    data <- vroom(file) %>%
        select(all_of(c(gene_id_column, diff_flag_column, logfc_column, padj_column))) %>%
        set_colnames(c("gene_id", "diff_flag", "logFC", "padj")) %>%
        distinct()
    # Genes to label: at most top_n per direction, ranked by |logFC|.
    # with_ties = FALSE caps the count at top_n; ungroup() prevents the
    # grouping from leaking into later steps.
    anno_data <- data %>%
        filter(diff_flag != "NO") %>%
        group_by(diff_flag) %>%
        slice_max(abs(logFC), n = top_n, with_ties = FALSE) %>%
        ungroup()

    # Plot title: one "<flag>: <count>" line per diff_flag value.
    diff_flag_count_table <- count(data, diff_flag) %>%
        arrange(diff_flag)
    plot_title <- paste0(paste0(
        diff_flag_count_table$diff_flag, ": ",
        diff_flag_count_table$n
    ), collapse = "\n")

    p <- ggplot(data, aes(logFC, -log10(padj), color = diff_flag)) +
        geom_point(alpha = geom_point_alpha, size = geom_point_size) +
        geom_vline(
            xintercept = c(-logfc_threshold, logfc_threshold),
            linewidth = geom_line_linewidth,
            color = geom_line_color,
            linetype = geom_line_linetype
        ) +
        geom_hline(
            yintercept = -log10(padj_threshold),
            linewidth = geom_line_linewidth,
            color = geom_line_color,
            linetype = geom_line_linetype
        ) +
        geom_label_repel(
            data = anno_data,
            mapping = aes(logFC, -log10(padj), color = diff_flag, label = gene_id),
            max.overlaps = 10000, show.legend = FALSE, alpha = geom_text_alpha,
            min.segment.length = min_segment_length, size = geom_text_size
        ) +
        labs(
            title = plot_title,
            x = "logFC", y = "-log10(padj)",
            # The cut-off lines are symmetric, so the legend states |logFC|.
            color = paste0("padj < ", padj_threshold, "\n|logFC| > ", logfc_threshold)
        ) +
        theme_prism(border = theme_prism_panel_border, base_family = font_family, base_size = theme_base_font_size) +
        theme(legend.title = element_text()) +
        scale_color_manual(values = color_map)

    ppreview(p, file = file.path(output_dir, paste0(file_stem, ".pdf")))
}

3 Visualize scRNA-Seq expression of a single gene using dot plot and violin plot of Seurat

library(Seurat)
library(tidyverse)
library(YRUtils)

# ---- Inputs ----
rna_rds_file <- "/data/users/yangrui/sc_omics_ref_datasets/human/dataset_2/ana/res/rna_to_umap.rds"
output_dir <- "/home/yangrui/temp"
# A single gene: the violin plot below indexes its data by this name.
target_gene_names <- "BTN3A2"
# Values are passed as `idents` and axis limits; names are the axis labels.
# NOTE(review): the labels "CGE IN " and "MGE IN " carry a trailing space --
# confirm whether that is intended.
target_cell_types <- c(
    "Exc. GluN1" = "GluN1",
    "Exc. GluN2" = "GluN2",
    "Exc. GluN3" = "GluN3",
    "Exc. GluN4" = "GluN4",
    "Exc. GluN5" = "GluN5",
    "Exc. GluN6" = "GluN6",
    "Exc. GluN7" = "GluN7",
    "Exc. GluN8" = "GluN8",
    "CGE IN " = "CGE IN",
    "MGE IN " = "MGE IN",
    "Oligo" = "OPC/Oligo",
    "MG" = "MG"
)

# presumably a Seurat object saved with saveRDS() -- TODO confirm
rna <- readRDS(rna_rds_file)
# Print the distinct values of the `Name` column so that the values in
# target_cell_types can be checked against them.
available_names <- sort(unique(rna[[]]$Name))
echo_vec(available_names)

# ---- Dot plot ----
# The y axis lists the cell types in reverse order of target_cell_types.
dot_ident_order <- rev(target_cell_types)
p <- DotPlot(rna, features = target_gene_names, idents = target_cell_types) +
    RotatedAxis() +
    scale_y_discrete(
        limits = dot_ident_order,
        labels = names(dot_ident_order)
    ) +
    labs(x = NULL, y = NULL) +
    theme(text = element_text(family = "Arial"))

ppreview(p, file = file.path(output_dir, "dotplot.pdf"))

# ---- Violin plot ----
# One hue per requested cell type.
ident_colors <- scales::hue_pal()(length(target_cell_types))

# Draw the violins without points (pt.size = 0); jittered points are added
# afterwards from the data stored in the plot object.
vln_base <- VlnPlot(
    rna,
    features = target_gene_names,
    idents = target_cell_types,
    pt.size = 0,
    cols = ident_colors
)
vln_data <- vln_base$data
# Upper y limit: the maximum value of the gene column, rounded up.
y_upper <- ceiling(max(vln_data[[target_gene_names]]))

p <- vln_base +
    geom_jitter(data = vln_data, mapping = aes(color = ident), size = 1) +
    scale_x_discrete(
        limits = target_cell_types,
        labels = names(target_cell_types)
    ) +
    scale_y_continuous(
        expand = expansion(0),
        limits = c(0, y_upper)
    ) +
    labs(x = NULL, y = NULL, color = "Cell Type") +
    guides(fill = "none") +
    theme(text = element_text(family = "Arial"))

ppreview(p, file = file.path(output_dir, "violinplot.pdf"))

4 LaTeX spaces

`a \qquad b`	\(a \qquad b\)	Two `m` widths
`a \quad b`	\(a \quad b\)	One `m` width
`a \ b`	\(a \ b\)	\(\frac{1}{3}\) `m` widths
`a \; b`	\(a \; b\)	\(\frac{2}{7}\) `m` widths
`a \, b`	\(a \, b\)	\(\frac{1}{6}\) `m` widths
`ab`	\(ab\)	No space
`a \! b`	\(a \! b\)	\(-\frac{1}{6}\) `m` widths
`\hspace{length}`	\(a \hspace{1cm} b\)	Horizontal space of specified length

5 clusterProfiler enrichment results interpretation

超几何分布：

假定在 \(N\) 件产品中有 \(M\) 件不合格，即不合格率 \(p=\frac{M}{N}\)。

现在产品中随机抽取 \(n\) 件做检查，发现 \(k\) 件不合格品的概率为

\[P(X=k)=\frac{C^k_M C^{n-k}_{N-M}}{C^n_N},\ k=t, t+1, \dots, s,\ s=\min(M, n),\ t=\max(0,\ n-(N-M))\]

在做 GO 富集时，记 \(N\) 为背景基因集大小（例如可用样本中所有表达的基因或 GO 数据库中的所有基因作为背景基因集），\(m\) 为背景基因集中属于某一通路的基因数（对应上式中的 \(M\)），\(n\) 为输入做富集的基因数，\(k\) 为输入基因中属于该通路的基因数，则依据超几何分布，我们可以计算如下指标：

p.adjust: adjusted p value (i.e. the multiple-testing-adjusted upper-tail probability \(P(X \ge k)\)).
Count: \(k\).
Gene ratio: \(\frac{k}{n}\).
Background ratio: \(\frac{m}{N}\).
Rich factor: \(\frac{k}{m}\).
Fold enrichment: \(\frac{\text{Gene ratio}}{\text{Background ratio}}\).

6 Sort GO terms of two clusters based on `RichFactor` and `p.adjust` and visualize them using dot plot

library(tidyverse)
library(vroom)
library(YRUtils)

# ---- Inputs ----
go_file <- "go_bp.tsv"
# Exactly two clusters; their order fixes the x-axis order and the sorting.
cluster_levels <- c("P2-Sul Up", "P10-Sul Up")
output_dir <- "~/temp"

go_df <- vroom(go_file) %>%
    mutate(Cluster = factor(Cluster, levels = cluster_levels))

first_cluster <- cluster_levels[1]
second_cluster <- cluster_levels[2]

# Classify every term into one of five categories based on RichFactor:
# specific to cluster 1, higher in cluster 1, equal, higher in cluster 2,
# specific to cluster 2. The level order below is the display order.
sample_flag_levels <- c(
    paste0(first_cluster, c("@Specific", "@Up")),
    paste0(paste0(cluster_levels, collapse = "_vs_"), "@Equal"),
    paste0(second_cluster, c("@Up", "@Specific"))
)
sample_flag_df <- go_df %>%
    # One row per term, one RichFactor column per cluster (NA if absent).
    pivot_wider(id_cols = "ID", names_from = "Cluster", values_from = "RichFactor") %>%
    mutate(
        SampleFlag = case_when(
            is.na(.data[[first_cluster]]) ~ paste0(second_cluster, "@Specific"),
            is.na(.data[[second_cluster]]) ~ paste0(first_cluster, "@Specific"),
            .data[[first_cluster]] > .data[[second_cluster]] ~ paste0(first_cluster, "@Up"),
            .data[[first_cluster]] < .data[[second_cluster]] ~ paste0(second_cluster, "@Up"),
            TRUE ~ paste0(first_cluster, "_vs_", second_cluster, "@Equal")
        ),
        SampleFlag = factor(SampleFlag, levels = sample_flag_levels)
    ) %>%
    # Back to one row per (term, cluster); rows for absent clusters are dropped.
    pivot_longer(cols = !all_of(c("ID", "SampleFlag")), names_to = "Cluster", values_to = "RichFactor") %>%
    na.omit() %>%
    distinct()

# Sort terms by category, then by RichFactor and p.adjust. Values of the
# first cluster are negated, so its rows sort in descending order while
# all other rows sort in ascending order.
term_padj <- go_df %>%
    select(Cluster, ID, p.adjust) %>%
    distinct()
df <- sample_flag_df %>%
    inner_join(term_padj, by = c("Cluster", "ID")) %>%
    arrange(
        SampleFlag,
        case_when(
            Cluster == first_cluster ~ -RichFactor,
            TRUE ~ RichFactor
        ),
        case_when(
            Cluster == first_cluster ~ -p.adjust,
            TRUE ~ p.adjust
        )
    )

# Apply the sorted term order to the plotting data; Description levels are
# reversed so that the first term is drawn at the top of the y axis.
term_order <- unique(df[["ID"]])
go_df <- go_df %>%
    mutate(ID = factor(ID, levels = term_order)) %>%
    arrange(ID) %>%
    mutate(Description = factor(Description, levels = rev(unique(Description))))

p <- ggplot(
    go_df,
    aes(x = Cluster, y = Description, size = RichFactor, color = p.adjust)
) +
    geom_point() +
    scale_color_gradient(low = "#e06663", high = "#327eba") +
    scale_y_discrete(position = "right") +
    guides(x = guide_axis(angle = 90)) +
    labs(x = NULL, y = NULL) +
    theme_bw(base_size = 18, base_family = "Arial")
ppreview(p, file = file.path(output_dir, "temp.pdf"))

7 Retrieve UniProt entries using REST API in batch

library(vroom)
library(tidyverse)
library(glue)
library(rvest)

# Number of accessions combined into one request; keep it at or below 25.
max_length <- 20
organism_id <- 9606
# glue() fills {accession_str} and {organism_id} for each batch.
url_template <- "https://rest.uniprot.org/uniprotkb/search?query=({accession_str})%20AND%20(active:true)%20AND%20(organism_id:{organism_id})&fields=accession,gene_primary,organism_name,organism_id,protein_name,annotation_score,protein_existence,reviewed,ft_intramem,cc_subcellular_location,ft_topo_dom,ft_transmem,go,go_p,go_c,go_f,cc_interaction&format=tsv"
file <- "test.tsv"

# Unique, non-missing accessions from the `Entry` column of the input table.
accessions <- vroom(file) %>%
    pull(Entry) %>%
    na.omit() %>%
    unique()

# Split the accessions into consecutive batches of at most max_length.
accession_ls <- split(accessions, ceiling(seq_along(accessions) / max_length))

# Collect one tibble per batch and bind once at the end instead of growing
# a data frame inside the loop. Failed batches stay NULL and are ignored
# by bind_rows().
batch_results <- vector("list", length(accession_ls))
for (i in seq_along(accession_ls)) {
    accession_str <- paste0("accession:", accession_ls[[i]], collapse = "%20OR%20")
    url_instance <- glue(url_template)

    # A failed request is reported and skipped so that the remaining
    # batches are still retrieved. The underlying error is included
    # because a failure is not necessarily caused by the accessions.
    item_raw_text <- tryCatch(
        read_html(url_instance) %>% html_text(),
        error = function(e) {
            message(
                "invalid accession string: ", accession_str,
                " (", conditionMessage(e), ")"
            )
            NULL
        }
    )
    if (is.null(item_raw_text)) {
        next
    }

    # Parse the TSV text and coerce both columns to integer so that they
    # have the same type in every batch.
    batch_results[[i]] <- vroom(I(item_raw_text), delim = "\t") %>%
        mutate(
            `Organism (ID)` = as.integer(`Organism (ID)`),
            Annotation = as.integer(Annotation)
        )
}
df <- distinct(bind_rows(batch_results))