<- "/data/users/yangrui/lab_projs/lym_new"
work_dir
setwd(work_dir)
1 Introduction
At present, this pipeline only allows two sample groups contained in your input files.
2 Quality check
expr_file
must include two ID columns:
protein_groups
: in which each row must be unique;gene_name
: in which each gene name may not be unique.
In addition, it should also contain a number of sample columns containing protein abundance levels.
Each sample name must be in the form of ID_N
, where N
is the replicate number.
sample_file
must include the following three columns:
sample
: with the same sample names as those in the abundance file above in the form ofID_N
;group
: in the form ofID
;replicate
: in the form ofN
.
library(tidyverse)
library(vroom)
library(psych)
library(FactoMineR)
library(ggforce)
library(ggprism)
library(ggalign)
library(YRUtils)
<- "input/BTN3A2_proteomic_quantification_data.tsv"
expr_file <- "input/sample_sheet.tsv"
sample_file <- "qc"
qc_dir # Keep those rows detected in at least N replicates in at least one group
<- 2
expr_sample_num
dir.create(qc_dir, showWarnings = FALSE, recursive = FALSE)
<- vroom(sample_file) %>%
sample_df distinct()
<- vroom(expr_file) %>%
expr_df distinct()
table(duplicated(expr_df$protein_groups))
for (s in sample_df$sample) {
is.na(expr_df[[s]])] <- 0
expr_df[[s]][
}
<- rep(F, nrow(expr_df))
filter_flag for (g in unique(sample_df$group)) {
<- expr_df[, filter(sample_df, group == g) %>% pull(sample) %>% unique(), drop = F]
tmp_df <- filter_flag | (rowSums(tmp_df > 0) >= expr_sample_num)
filter_flag
}<- expr_df[filter_flag, ]
expr_df
<- tibble()
all_vs_none_expr_df <- unique(sample_df$group)
sample_groups for (g in sample_groups) {
<- expr_df[rowSums(expr_df[, filter(sample_df, group == g) %>% pull(sample) %>% unique(), drop = F] > 0) == 0, ] %>%
tmp_df mutate(diff_flag = paste0(sample_groups[sample_groups != g], " Up"))
<- bind_rows(all_vs_none_expr_df, tmp_df)
all_vs_none_expr_df
}<- filter(expr_df, !(protein_groups %in% all_vs_none_expr_df$protein_groups))
both_expr_df vroom_write(all_vs_none_expr_df,
file = gsub("\\.\\w+$", ".all_vs_none.tsv", expr_file),
col_names = TRUE, append = FALSE
)vroom_write(both_expr_df,
file = gsub("\\.\\w+$", ".both.tsv", expr_file),
col_names = TRUE, append = FALSE
)<- log2(expr_df[, unique(sample_df$sample)] + 1)
log_expr_df
# Correlation
<- corr.test(log_expr_df, use = "pairwise", method = "pearson", adjust = "BH")
cor_res
<- ggheatmap(
p $r,
cor_reswidth = ncol(cor_res$r) * unit(10, "mm"),
height = nrow(cor_res$r) * unit(10, "mm")
+
) scheme_align(free_spaces = "t") +
scale_fill_gradient2(
low = "blue", mid = "white", high = "red",
midpoint = 0, limits = c(-1, 1),
breaks = c(-1, -0.5, 0, 0.5, 1)
+
) labs(fill = "R") +
guides(x = guide_axis(angle = 45)) +
theme(
text = element_text(size = 20, family = "Arial", color = "black"),
axis.text = element_text(size = 20, family = "Arial", color = "black")
+
) anno_top() +
ggalign(
data = gsub("_\\d+$", "", colnames(cor_res$r)),
size = unit(4, "mm")
+
) geom_tile(aes(y = 1, fill = factor(value))) +
scale_y_continuous(breaks = NULL, name = NULL, expand = expansion(0)) +
labs(fill = "Sample") +
theme(
text = element_text(size = 20, family = "Arial", color = "black"),
axis.text = element_text(size = 20, family = "Arial", color = "black")
+
) with_quad(scheme_align(guides = "t"), NULL)
ppreview(p, file = file.path(qc_dir, "correlation.pdf"))
# PCA
<- PCA(t(log_expr_df), ncp = 10, scale.unit = TRUE, graph = FALSE)
pca
<- as.data.frame(pca$ind$coord)
pca_coord $sample <- row.names(pca_coord)
pca_coord$group <- factor(gsub("_\\d+$", "", pca_coord$sample))
pca_coord<- as.data.frame(pca$eig)
pca_eig
<- ggplot(pca_coord, aes(Dim.1, Dim.2)) +
p geom_point(aes(color = group), size = 4) +
xlab(paste0("PC1 (", round(pca_eig["comp 1", "percentage of variance"]), "%)")) +
ylab(paste0("PC2 (", round(pca_eig["comp 2", "percentage of variance"]), "%)")) +
geom_mark_ellipse(aes(fill = group), color = NA, alpha = 0.25) +
theme_prism(base_family = "Arial", border = TRUE, base_size = 20)
ppreview(p, file = file.path(qc_dir, "pca.pdf"))
3 Differential abundance analysis
For spectra count (MS2), the distribution of which can be approximated by a Poisson distribution. In this case, DESeq2
, edgeR
, etc. may be used.
For signal intensity (MS1), the distribution of which can be approximated by a Normal distribution. In this case, limma
may be used.
3.1 DEP2 (limma
-based method)
library(DEP2)
library(patchwork)
library(ggplot2)
library(vroom)
library(tidyverse)
library(ggridges)
library(YRUtils)
<- "input/BTN3A2_proteomic_quantification_data.both.tsv"
expr_file <- "input/sample_sheet.tsv"
sample_file <- "de"
output_dir
dir.create(output_dir, showWarnings = FALSE, recursive = FALSE)
<- vroom(sample_file) %>%
sample_df distinct()
<- vroom(expr_file) %>%
expr_df distinct()
### 1. Perform differential expression analysis over genes detected in both treatment and control
# For duplicated IDs, add tailed numbers, delimiter is "."
<- make_unique(expr_df, names = "gene_name", ids = "protein_groups", delim = ";")
expr_df # Take expression columns
<- which(names(expr_df) %in% sample_df$sample)
expr_cols
## 1. Construct SummarizedExperiment object
<- make_se_parse(expr_df,
se columns = expr_cols,
mode = "delim", sep = "_",
log2transform = T
)
## 2. Filtering
# Filtering out reverse, contaminant, and low-quality features with many missing values
# Allow up to N missing values in each condition
<- 1
mv_num
# filter_se() also provides filtering options based on filter_formula,
# which can be used to filter features based on given columns
<- filter_se(se, thr = mv_num)
filter_se
<- (plot_frequency(se) + ggtitle("Protein identifications overlap before filter")) /
p plot_frequency(filter_se) + ggtitle("Protein identifications overlap after filter"))
(ppreview(p, file = file.path(output_dir, "filter_freq_hists.pdf"))
# Even after filtering, a considerable proportion of missing values may still remain in the assay
plot_missval(filter_se)
## 3. Normalization
# log1p --> variance stabilizing transformation (vst)
<- normalize_vsn(filter_se)
norm_se
<- plot_normalization(norm_se)
p ppreview(p, file = file.path(output_dir, "norm_boxplot.pdf"))
# Density and CumSum plots of intensities of proteins with and without missing values
plot_detect(norm_se)
## 4. Imputation
# 1. Some may be suitable for missing at random
# In this case, both small and large values may be missing,
# so imputing relatively larger values is reasonable
# 2. Some may be suitable for missing not at random
# In this case, I think a large proportion of missing values are so small that they cannot be detected,
# so using imputation with cautions, it may introduce unwanted bias
# You should have a deep comprehension for the sources of missing values
# c("QRILC", "bpca", "knn", "MLE", "MinDet", "MinProb", "man", "min", "zero", "mixed", "nbavg", "RF", "GSimp")
# It seems that "MLE" is really time-consuming
# Remove method "mixed" because it needs specifying which rows are missing at random (the remaining rows are missing not at random), and specifying two methods for dealing with missing at random and missing not at random respectively
# Remove method "MLE" because it seems that we need to adjust parameters more carefully otherwise the imputation values of NAs are much larger than the normal
<- c("QRILC", "bpca", "knn", "MinDet", "MinProb", "man", "min", "zero", "nbavg", "RF", "GSimp")
impute_methods
names(impute_methods) <- impute_methods
# Keep only those sample-feature pairs having NAs
<- assay(norm_se) %>%
nas_df as.data.frame() %>%
mutate(gene_name = row.names(.)) %>%
pivot_longer(cols = !gene_name, names_to = "label", values_to = "value") %>%
mutate(is_na = is.na(value)) %>%
filter(is_na) %>%
select(gene_name, label)
# Keep only those sample-feature pairs not having NAs
<- nas_df %>%
non_nas mutate(id = paste0(label, ":", gene_name)) %>%
pull(id) %>%
unique()
# Try each imputation method
<- lapply(impute_methods, function(x) {
impute_se_ls message(paste0("\n\nrunning ", x, " ..."))
impute(norm_se, fun = x)
})
<- tibble()
imps for (m in names(impute_se_ls)) {
<- assay(impute_se_ls[[m]]) %>%
tmp_df as.data.frame() %>%
mutate(gene_name = row.names(.)) %>%
pivot_longer(cols = !gene_name, names_to = "label", values_to = "value") %>%
left_join(as.data.frame(colData(impute_se_ls[[m]])[, c("label", "condition")]), by = "label") %>%
right_join(nas_df, by = c("gene_name", "label")) %>%
mutate(method = m)
<- bind_rows(imps, tmp_df)
imps
}<- assay(norm_se) %>%
non_imps as.data.frame() %>%
mutate(gene_name = row.names(.)) %>%
pivot_longer(cols = !gene_name, names_to = "label", values_to = "value") %>%
left_join(as.data.frame(colData(norm_se)[, c("label", "condition")]), by = "label") %>%
mutate(id = paste0(label, ":", gene_name), method = "non_impute") %>%
filter(!(id %in% non_nas)) %>%
select(!id)
# Check the distribution of imputation values of NAs under each method
<- ggplot(bind_rows(imps, non_imps), aes(x = value, y = factor(method, level = unique(method)))) +
p geom_density_ridges(
fill = "#027AD450", scale = 1.2,
jittered_points = TRUE, position = position_points_jitter(height = 0),
point_shape = "|", point_size = 2, point_alpha = 1, alpha = 0.7
+
) ylab("Impute method") +
xlab("log2 Intensity") +
theme_classic()
ppreview(p, file = file.path(output_dir, "impute_density.pdf"))
## 5. Hypothesis testing
# T-test using limma
# Test all the other samples vs. control
<- "Control"
control_sample <- 0.05
padj_th <- 1
lfc_th
<- "BTN3A2_vs_Control"
contrast
<- strsplit(contrast, "_vs_")[[1]]
pair for (m in names(impute_se_ls)) {
<- impute_se_ls[[m]]
impute_se
<- test_diff(impute_se, type = "control", control = control_sample, fdr.type = "BH")
diff_se <- add_rejections(diff_se, alpha = padj_th, lfc = lfc_th)
sig_se saveRDS(sig_se, file = file.path(output_dir, paste0(contrast, "_", m, "_de.rds")))
<- as.data.frame(sig_se@elementMetadata)
clean_degs names(clean_degs)[grep("_diff$", names(clean_degs))] <- "log2FoldChange"
names(clean_degs)[grep("_p.adj$", names(clean_degs))] <- "padj"
<- clean_degs %>%
clean_degs group_by(gene_name) %>%
slice_min(num_NAs) %>%
slice_max(sequence_number) %>%
slice_min(padj) %>%
slice_max(log2FoldChange) %>%
slice_sample(n = 1) %>%
ungroup() %>%
arrange(desc(log2FoldChange)) %>%
mutate(diff_flag = if_else(padj < padj_th,
if_else(abs(log2FoldChange) > lfc_th,
if_else(log2FoldChange > lfc_th,
paste0(pair[1], " Up"),
paste0(pair[2], " Up")
),"NO"
),"NO"
))
vroom_write(clean_degs, file = file.path(output_dir, paste0(contrast, "_", m, "_de.tsv")))
<- plot_volcano(sig_se, contrast = contrast, adjusted = T, add_threshold_line = "intersect", pCutoff = padj_th, fcCutoff = lfc_th, add_names = F)
p ppreview(p, file = file.path(output_dir, paste0(contrast, "_", m, "_volcano.pdf")))
<- plot_heatmap(sig_se, manual_contrast = contrast, kmeans = T, k = 2, split_order = c(2, 1), show_row_names = F, show_row_dend = F, heatmap_width = unit(6, "cm"), heatmap_height = unit(0.04, "mm") * nrow(sig_se@elementMetadata))
p ppreview(p, file = file.path(output_dir, paste0(contrast, "_", m, "_heatmap.pdf")))
}
3.2 Only using limma
package
For the following two methods, there must be two columns with names protein_name
(which must be unique) and gene_name
in the expression table file, in which all sample column names must be in the form ID_repN
, which will be matched using the regular expression [a-zA-Z0-9]+_rep[0-9]+
.
- Compare two samples at a time
library(vroom)
library(limma)
library(tidyverse)
library(magrittr)
<- "expression_sheet.tsv"
expr_file # keep those proteins detected in at least N replicates in at least one sample
<- 2
min_expr_num <- 0.05
padj_th <- 1
logfc_th # the first is the control sample
<- c("WT", "S541N")
group_levels
<- vroom(expr_file) %>%
expr_df as.data.frame() %>%
set_rownames(.[["protein_name"]])
<- names(expr_df)[str_detect(names(expr_df), "[a-zA-Z0-9]+_rep[0-9]+")]
samples names(samples) <- gsub("_rep[0-9]+$", "", samples)
<- expr_df[, samples]
expr_mat is.na(expr_mat)] <- 0
expr_mat[
<- rep(FALSE, nrow(expr_mat))
flag for (group in unique(names(samples))) {
<- flag | (rowSums(expr_mat[, samples[names(samples) == group]] > 0) >= min_expr_num)
flag
}<- expr_mat[flag, ]
expr_mat
<- expr_mat %>%
expr_mat_long_df mutate(protein_name = row.names(.)) %>%
pivot_longer(cols = -protein_name, names_to = "sample", values_to = "value") %>%
mutate(group = gsub("_rep[0-9]+$", "", sample))
<- expr_mat_long_df %>%
mean_df filter(value != 0) %>%
group_by(protein_name, group) %>%
reframe(mean = mean(value))
<- left_join(expr_mat_long_df, mean_df, by = c("protein_name", "group")) %>%
expr_mat_filled_df mutate(value = if_else(!is.na(mean) & (value == 0), mean, value))
<- expr_mat_filled_df %>%
expr_mat select(-all_of(c("group", "mean"))) %>%
pivot_wider(names_from = "sample", values_from = "value") %>%
as.data.frame() %>%
set_rownames(.[["protein_name"]]) %>%
select(-protein_name)
<- expr_mat[, samples]
expr_mat <- inner_join(expr_df, expr_mat %>% mutate(protein_name = row.names(.)), suffix = c(".raw", ".mean_filled"), by = "protein_name")
expr_df
<- log2(expr_mat + 1)
expr_mat
<- factor(names(samples), levels = group_levels)
group_list <- model.matrix(~ 0 + group_list)
design colnames(design) <- levels(group_list)
row.names(design) <- colnames(expr_mat)
<- t(combn(group_levels, 2)) %>%
contrasts as.data.frame() %>%
mutate(contrast = paste0(V2, "-", V1)) %>%
pull(contrast)
<- lmFit(expr_mat, design)
fit <- makeContrasts(contrasts = contrasts, levels = design)
contrast_matrix <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2, trend = TRUE, robust = TRUE)
fit2
# check valid coefficients
colnames(fit2$coefficients)
<- "S541N-WT"
coef
<- topTable(fit2, coef = coef, number = Inf) %>%
res mutate(protein_name = row.names(.))
<- inner_join(expr_df, res, by = "protein_name") %>%
res mutate(
P.Value = if_else(P.Value == 0, .Machine$double.xmin, P.Value),
adj.P.Val = if_else(adj.P.Val == 0, .Machine$double.xmin, adj.P.Val),
diff_flag = if_else(adj.P.Val < 0.05,
if_else(abs(logFC) > logfc_th,
if_else(logFC > logfc_th, paste0(strsplit(coef, "-")[[1]][1], " Up"), paste0(strsplit(coef, "-")[[1]][2], " Up")),
"NO"
),"NO"
)%>%
) arrange(diff_flag)
vroom_write(res, file = paste0(coef, ".tsv"), col_names = TRUE, append = FALSE)
- Compare all possible pairs at a time
library(vroom)
library(limma)
library(tidyverse)
library(magrittr)
library(gtools)
<- "test"
work_dir <- "xbf_expression_sheet.tsv"
expr_file # keep those proteins detected in at least N replicates in at least one sample
<- 2
min_expr_num <- 0.05
padj_th <- 1
logfc_th
setwd(work_dir)
<- vroom(expr_file) %>%
expr_df as.data.frame() %>%
set_rownames(.[["protein_name"]])
<- names(expr_df)[str_detect(names(expr_df), "[a-zA-Z0-9]+_rep[0-9]+")]
samples names(samples) <- gsub("_rep[0-9]+$", "", samples)
<- permutations(length(unique(names(samples))), 2, v = unique(names(samples)))
sample_pair_combns
for (i in seq_len(nrow(sample_pair_combns))) {
<- sample_pair_combns[i, ]
group_levels <- samples[names(samples) %in% group_levels]
pair_samples
<- expr_df[, pair_samples]
expr_mat is.na(expr_mat)] <- 0
expr_mat[
<- rep(FALSE, nrow(expr_mat))
flag for (group in group_levels) {
<- flag | (rowSums(expr_mat[, samples[names(samples) == group]] > 0) >= min_expr_num)
flag
}<- expr_mat[flag, ]
expr_mat
<- expr_mat %>%
expr_mat_long_df mutate(protein_name = row.names(.)) %>%
pivot_longer(cols = -protein_name, names_to = "sample", values_to = "value") %>%
mutate(group = gsub("_rep[0-9]+$", "", sample))
<- expr_mat_long_df %>%
mean_df filter(value != 0) %>%
group_by(protein_name, group) %>%
reframe(mean = mean(value))
<- left_join(expr_mat_long_df, mean_df, by = c("protein_name", "group")) %>%
expr_mat_filled_df mutate(value = if_else(!is.na(mean) & (value == 0), mean, value))
<- expr_mat_filled_df %>%
expr_mat select(-all_of(c("group", "mean"))) %>%
pivot_wider(names_from = "sample", values_from = "value") %>%
as.data.frame() %>%
set_rownames(.[["protein_name"]]) %>%
select(-protein_name)
<- expr_mat[, pair_samples]
expr_mat <- log2(expr_mat + 1)
expr_mat
<- factor(names(pair_samples), levels = group_levels)
group_list <- model.matrix(~ 0 + group_list)
design colnames(design) <- levels(group_list)
row.names(design) <- colnames(expr_mat)
<- t(combn(group_levels, 2)) %>%
contrasts as.data.frame() %>%
mutate(contrast = paste0(V2, "-", V1)) %>%
pull(contrast)
<- lmFit(expr_mat, design)
fit <- makeContrasts(contrasts = contrasts, levels = design)
contrast_matrix <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2, trend = TRUE, robust = TRUE)
fit2
<- colnames(fit2$coefficients)
coef
<- topTable(fit2, coef = coef, number = Inf) %>%
res mutate(protein_name = row.names(.))
<- inner_join(expr_df[, c("protein_name", "gene_name")], res, by = "protein_name") %>%
res mutate(
P.Value = if_else(P.Value == 0, .Machine$double.xmin, P.Value),
adj.P.Val = if_else(adj.P.Val == 0, .Machine$double.xmin, adj.P.Val),
diff_flag = if_else(adj.P.Val < 0.05,
if_else(abs(logFC) > logfc_th,
if_else(logFC > logfc_th, paste0(strsplit(coef, "-")[[1]][1], " Up"), paste0(strsplit(coef, "-")[[1]][2], " Up")),
"NO"
),"NO"
)%>%
) arrange(diff_flag)
vroom_write(res, file = paste0(gsub("-", "_vs_", coef), ".tsv"), col_names = TRUE, append = FALSE)
}
4 Merge DE files
library(vroom)
library(tidyverse)
<- c(
files "input/BTN3A2_proteomic_quantification_data.all_vs_none.tsv",
"de/BTN3A2_vs_Control_bpca_de.tsv"
)<- "de/BTN3A2_vs_Control_bpca_de.all_vs_none.tsv"
output_file
<- tibble()
df for (file in files) {
<- bind_rows(
df
df,vroom(file) %>%
select(protein_name, gene_name, diff_flag) %>%
filter(diff_flag != "NO")
)
}<- distinct(df) %>%
df arrange(diff_flag)
vroom_write(df, file = output_file, col_names = TRUE, append = FALSE)
5 Retrieve sub-cellular location info from UniProt and Synaptome database
library(vroom)
library(glue)
library(tidyverse)
library(rvest)
library(stringr)
library(synaptome.db)
<- 10116
organism_id <- "https://rest.uniprot.org/uniprotkb/search?query=(accession:{accession})%20AND%20(active:true)%20AND%20(organism_id:{organism_id})&fields=accession,gene_primary,organism_name,organism_id,protein_name,annotation_score,protein_existence,reviewed,ft_intramem,cc_subcellular_location,ft_topo_dom,ft_transmem,go,go_p,go_c,go_f&format=tsv"
url_template <- "de/BTN3A2_vs_Control_bpca_de.all_vs_none.tsv"
de_file <- "subcellular_location"
output_dir <- c("protein_name", "gene_name", "diff_flag")
keep_cols <- c("BTN3A2 Up", "Control Up")
keep_de_types
dir.create(output_dir, showWarnings = FALSE, recursive = FALSE)
<- vroom(de_file) %>%
df select(all_of(keep_cols)) %>%
filter(diff_flag %in% keep_de_types)
<- tibble()
res_df for (accession in unique(df$protein_name)) {
<- glue(url_template)
url_instance <- try(item_raw_text <- read_html(url_instance) %>% html_text(), silent = T)
e if ("try-error" %in% class(e)) {
message("invalid accession: ", accession)
else {
} <- vroom(I(item_raw_text), delim = "\t")
item_df <- bind_rows(res_df, item_df)
res_df
}
}<- distinct(res_df)
res_df
"scl_secret"]] <- grepl("secret", res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
res_df[["scl_membrane"]] <- grepl("membrane", res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
res_df[["scl_secret_membrane"]] <- res_df[["scl_secret"]] | res_df[["scl_membrane"]]
res_df[[
"uniprot_scl_synapse"]] <- str_extract_all(res_df[["Subcellular location [CC]"]], regex("[\\w-]*synap[\\w-]*", ignore_case = TRUE)) %>%
res_df[[sapply(function(x) {
paste0(unique(na.omit(x)), collapse = ";")
})"uniprot_go_synapse"]] <- str_extract_all(res_df[["Gene Ontology (GO)"]], regex("[\\w-]*synap[\\w-]*", ignore_case = T)) %>%
res_df[[sapply(function(x) {
paste0(unique(na.omit(x)), collapse = ";")
})<- left_join(res_df,
res_df findGeneByCompartmentPaperCnt(cnt = 1) %>%
filter(RatName %in% res_df[["Gene Names (primary)"]]) %>%
select(RatName, Localisation) %>%
distinct() %>%
group_by(RatName) %>%
reframe(synaptome_db_synapse = paste0(unique(Localisation), collapse = ";")),
by = c("Gene Names (primary)" = "RatName")
)"final_synapse"]] <- apply(res_df, 1, function(x) {
res_df[[<- na.omit(c(x["uniprot_scl_synapse"], x["uniprot_go_synapse"], x["synaptome_db_synapse"]))
vec paste0(vec[vec != ""], collapse = ";")
simplify = T)
}, "scl_secret_membrane_final_synapse"]] <- res_df[["scl_secret_membrane"]] & (res_df[["final_synapse"]] != "")
res_df[[
<- filter(res_df, scl_secret_membrane_final_synapse)
target_res_df "final_synapse_label"]] <- sapply(target_res_df[["final_synapse"]], function(x) {
target_res_df[[<- NA
vec if (grepl("postsynap|post-synap", x, ignore.case = TRUE)) {
<- c(vec, "Postsynapse")
vec
}if (grepl("presynap|pre-synap", x, ignore.case = TRUE)) {
<- c(vec, "Presynapse")
vec
}if (grepl("synap", x, ignore.case = TRUE)) {
<- c(vec, "Synapse")
vec
}if (grepl("Synaptic_Vesicle", x, ignore.case = TRUE)) {
<- c(vec, "Synaptic vesicle")
vec
}<- na.omit(vec)
vec if (length(vec) == 0) {
stop("no matching")
}if (length(vec) > 1) {
<- vec[vec != "Synapse"]
vec
}paste0(sort(unique(vec)), collapse = "/")
})
<- inner_join(df, res_df, by = c("protein_name" = "Entry"))
res_df <- inner_join(df, target_res_df, by = c("protein_name" = "Entry"))
target_res_df vroom_write(res_df, file = file.path(output_dir, gsub("\\.\\w+$", ".scl.raw.tsv", basename(de_file))))
vroom_write(target_res_df, file = file.path(output_dir, gsub("\\.\\w+$", ".scl.target.tsv", basename(de_file))))
6 GO enrichment analysis
library(vroom)
library(tidyverse)
library(clusterProfiler)
library(AnnotationDbi)
<- "/data/biodatabase/species/mRatBN7/genome/anno/org.Rn.eg.db.sqlite"
go_db_file <- "de/BTN3A2_vs_Control_bpca_de.all_vs_none.tsv"
degs_file <- "go"
root_dir
<- file.path(root_dir, gsub("\\.\\w+$", "", basename(degs_file)))
output_dir dir.create(output_dir, recursive = TRUE)
<- vroom(degs_file)
degs <- degs[, c("gene_name", "diff_flag")] %>%
gene_set_ls filter(diff_flag != "NO") %>%
split(.[["diff_flag"]]) %>%
lapply(function(x) {
unique(na.omit(x[["gene_name"]]))
})
<- loadDb(go_db_file)
orgdb <- compareCluster(gene_set_ls,
ego fun = "enrichGO",
keyType = "SYMBOL",
OrgDb = orgdb,
ont = "ALL",
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 1000,
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = FALSE,
pool = FALSE
)nrow(ego@compareClusterResult)
saveRDS(ego, file = file.path(output_dir, "GO_ALL.rds"))
vroom_write(ego@compareClusterResult,
file = file.path(output_dir, "GO_ALL.tsv"),
col_names = TRUE, append = FALSE
)
7 Plot expression heatmap
The followings are just various examples:
library(ComplexHeatmap)
library(vroom)
library(tidyverse)
library(magrittr)
library(circlize)
library(YRUtils)
<- "/data/users/yangrui/lab_projs/lym_new/input/sample_sheet.tsv"
sample_file <- "/data/users/yangrui/lab_projs/lym_new/de/BTN3A2_vs_Control_bpca_de.all_vs_none.tsv"
gene_file <- "/data/users/yangrui/lab_projs/lym_new/input/BTN3A2_proteomic_quantification_data.tsv"
quant_file <- "/data/users/yangrui/lab_projs/lym_new/subcellular_location/BTN3A2_vs_Control_bpca_de.all_vs_none.scl.target.cys.tsv"
label_file <- "/home/yangrui/temp"
output_dir
<- vroom(sample_file)
sample_df <- vroom(gene_file) %>%
gene_df filter(diff_flag == "BTN3A2 Up")
<- vroom(label_file)
label_df
<- vroom(quant_file) %>%
quant_df select(all_of(c("protein_name", sample_df$sample))) %>%
filter(protein_name %in% gene_df$protein_name) %>%
as.data.frame() %>%
set_rownames(.[["protein_name"]]) %>%
select(-protein_name)
is.na(quant_df)] <- 0
quant_df[<- scale(t(log2(quant_df + 1)))
quant_mat
<- colorRamp2(c(floor(min(quant_mat)), 0, ceiling(max(quant_mat))), c("skyblue", "white", "orange"))
col_fun <- c("BTN3A2_1", "BTN3A2_2", "BTN3A2_3", "Control_1", "Control_2", "Control_3")
row_order <- gsub("_\\d+$", "", row.names(quant_mat))
row_split <- rowAnnotation(
row_anno Sample = gsub("_\\d+$", "", row.names(quant_mat)),
show_legend = FALSE,
show_annotation_name = FALSE,
simple_anno_size = unit(3, "mm"),
col = list(Sample = c("BTN3A2" = "orange", "Control" = "skyblue"))
)<- columnAnnotation(
col_anno Gene = anno_mark(
at = sapply(label_df$protein_name, function(x) {
which(colnames(quant_mat) == x)
},simplify = TRUE, USE.NAMES = FALSE
),labels = label_df$gene_name
)
)
<- Heatmap(
p
quant_mat,col = col_fun,
cluster_columns = TRUE,
cluster_column_slices = TRUE,
cluster_rows = FALSE,
cluster_row_slices = FALSE,
show_column_names = FALSE,
show_column_dend = FALSE,
show_row_names = FALSE,
row_order = row_order,
row_names_side = "left",
row_split = row_split,
row_gap = unit(0, "mm"),
left_annotation = row_anno,
top_annotation = col_anno,
heatmap_legend_param = list(title = "Expression")
)ppreview(p, file = file.path(output_dir, "heatmap.pdf"))
library(ComplexHeatmap)
library(vroom)
library(tidyverse)
library(magrittr)
library(circlize)
library(YRUtils)
<- "temp/S541N-WT.tsv"
quant_file <- c("Ddx5", "Gapdh", "Actb", "Actg1", "Actn1", "Dhx9", "Draxin", "Pelp1", "Hnrnpa3")
labeled_genes <- "temp"
output_dir
<- expand.grid(c("S541N", "WT"), "_rep", 1:3, ".raw") %>%
sample_df mutate(sample = paste0(Var1, Var2, Var3, Var4)) %>%
rename(group = Var1) %>%
select(group, sample)
<- vroom(quant_file) %>%
quant_df filter(diff_flag == "S541N Up") %>%
select(all_of(c("protein_name", "gene_name", sample_df$sample))) %>%
as.data.frame() %>%
set_rownames(.[["protein_name"]])
<- quant_df %>%
quant_mat select(-protein_name, -gene_name)
is.na(quant_mat)] <- 0
quant_mat[<- scale(t(log2(quant_mat + 1)))
quant_mat
<- colorRamp2(c(floor(min(quant_mat)), 0, ceiling(max(quant_mat))), c("skyblue", "white", "orange"))
col_fun <- c("S541N_rep1.raw", "S541N_rep2.raw", "S541N_rep3.raw", "WT_rep1.raw", "WT_rep2.raw", "WT_rep3.raw")
row_order <- gsub("_rep\\d+.raw$", "", row.names(quant_mat))
row_split <- rowAnnotation(
left_anno Sample = gsub("_rep\\d+.raw$", "", row.names(quant_mat)),
show_legend = FALSE,
show_annotation_name = FALSE,
simple_anno_size = unit(3, "mm"),
col = list(Sample = c("S541N" = "#EE6A50", "WT" = "#76EEC6"))
)
<- quant_df %>%
quant_df mutate(
group = if_else(is.na(gene_name), "Other",
if_else(str_detect(gene_name, "^Rpl"), "Rpl",
if_else(str_detect(gene_name, "^Rps"), "Rps",
if_else(str_detect(gene_name, "^Tuba"), "Tuba",
if_else(str_detect(gene_name, "^Tubb"), "Tubb", "Other")
)
)
)
),group = factor(group, levels = c("Rpl", "Rps", "Tuba", "Tubb", "Other"))
)<- quant_df$group
column_split <- columnAnnotation(
bottom_anno Group = quant_df$group,
show_legend = TRUE,
show_annotation_name = FALSE,
simple_anno_size = unit(3, "mm"),
col = list(Group = c("Rpl" = "#B23AEE", "Rps" = "#B4EEB4", "Tuba" = "#EE7600", "Tubb" = "#00B2EE", "Other" = "#EEA2AD"))
)
<- filter(quant_df, gene_name %in% labeled_genes) %>%
label_df select(protein_name, gene_name)
<- columnAnnotation(
top_anno Gene = anno_mark(
at = sapply(label_df$protein_name, function(x) {
which(colnames(quant_mat) == x)
},simplify = TRUE, USE.NAMES = FALSE
),labels = label_df$gene_name
)
)
<- Heatmap(
p
quant_mat,col = col_fun,
cluster_columns = TRUE,
cluster_column_slices = FALSE,
cluster_rows = FALSE,
cluster_row_slices = FALSE,
show_column_names = FALSE,
show_column_dend = FALSE,
show_row_names = FALSE,
row_order = row_order,
row_names_side = "left",
row_split = row_split,
row_gap = unit(0, "mm"),
column_gap = unit(0, "mm"),
column_split = column_split,
left_annotation = left_anno,
top_annotation = top_anno,
bottom_annotation = bottom_anno,
column_title = NULL,
column_title_rot = 90,
column_title_side = "bottom",
heatmap_legend_param = list(title = "scaled LFQ intensity")
)
ppreview(p, file = file.path(output_dir, "heatmap.pdf"))
library(ComplexHeatmap)
library(vroom)
library(tidyverse)
library(magrittr)
library(circlize)
library(YRUtils)
<- "temp/data.tsv"
quant_file <- "temp"
output_dir
<- vroom(quant_file) %>%
quant_df as.data.frame() %>%
set_rownames(.[["protein"]]) %>%
select(-protein)
<- colorRamp2(c(floor(min(quant_df) * 10) / 10, 1.0, ceiling(max(quant_df) * 10) / 10), c("skyblue", "white", "orange"))
col_fun <- c(
column_order "WT",
"P152A", "H70Q", "A200V", "Y48N", "A200T",
"D99G", "L04P", "N199I", "N199H", "L54P", "K217R", "R18K", "R198Q", "R213W",
"C203S", "C206F", "C206Y", "V63L", "L66H", "P201S", "K209N", "R112P", "Q82H"
)<- tibble(
column_order_df cluster = c("WT", rep("NO", 5), rep("LOF", 9), rep("GOF", 9)),
protein = column_order
%>%
) mutate(
cluster = factor(cluster, levels = c("WT", "NO", "LOF", "GOF")),
protein = factor(protein, levels = colnames(quant_df))
%>%
) arrange(protein)
<- c("GluA1_rep1", "GluA1_rep2", "GluA1_rep3", "GluA2_rep1", "GluA2_rep2", "GluA2_rep3", "GABAARAlpha1_rep1", "GABAARAlpha1_rep2", "GABAARAlpha1_rep3", "GABAARBeta1_rep1", "GABAARBeta1_rep2", "GABAARBeta1_rep3")
row_order <- gsub("_rep\\d+$", "", row.names(quant_df))
row_split <- factor(row_split, levels = unique(row_split))
row_split <- rowAnnotation(
left_anno Sample = gsub("_rep\\d+$", "", row.names(quant_df)),
show_legend = FALSE,
show_annotation_name = FALSE,
simple_anno_size = unit(3, "mm"),
col = list(Sample = c("GluA1" = "#76EEC6", "GluA2" = "#EEAEEE", "GABAARAlpha1" = "#9F79EE", "GABAARBeta1" = "#EE799F"))
)<- columnAnnotation(
top_anno Type = column_order_df$cluster,
show_legend = FALSE,
show_annotation_name = FALSE,
simple_anno_size = unit(3, "mm"),
col = list(Type = c("WT" = "#BEBEBE", "NO" = "#EED8AE", "LOF" = "#EE8262", "GOF" = "#BCD2EE"))
)<- "Arial"
font_family
# `gpar()` is used to handle grid graphical parameters
# for how to use `gpar()`, see `?gpar`
# for ComplexHeatmap, you can set some parameters globally using `ht_opt`
# e.g. `ht_opt$legend_title_gp <- gpar(fontfamily = "Times New Roman")`
<- Heatmap(
p
quant_df,col = col_fun,
cluster_columns = FALSE,
cluster_column_slices = FALSE,
cluster_rows = FALSE,
cluster_row_slices = FALSE,
show_column_names = TRUE,
show_column_dend = FALSE,
show_row_names = FALSE,
show_row_dend = FALSE,
column_order = column_order,
column_split = column_order_df$cluster,
row_order = row_order,
row_split = row_split,
row_gap = unit(0.8, "mm"),
column_gap = unit(0.8, "mm"),
left_annotation = left_anno,
top_anno = top_anno,
height = nrow(quant_df) * unit(5, "mm"),
width = ncol(quant_df) * unit(5, "mm"),
column_title_rot = 90,
row_title_rot = 0,
heatmap_legend_param = list(
title = "normalized expression level",
title_gp = gpar(fontfamily = font_family),
labels_gp = gpar(fontfamily = font_family)
),row_names_gp = gpar(fontfamily = font_family),
column_names_gp = gpar(fontfamily = font_family),
row_title_gp = gpar(fontfamily = font_family),
column_title_gp = gpar(fontfamily = font_family)
)ppreview(p, file = file.path(output_dir, "heatmap.pdf"))
8 Some examples about usages of UniProt and STRING APIs
- Retrieve subcellular locations and interactions from UniProt
library(vroom)
library(glue)
library(tidyverse)
library(rvest)
library(stringr)
<- "test"
work_dir <- 9606
organism_id <- "https://rest.uniprot.org/uniprotkb/search?query=(accession:{accession})%20AND%20(active:true)%20AND%20(organism_id:{organism_id})&fields=accession,gene_primary,organism_name,organism_id,protein_name,annotation_score,protein_existence,reviewed,ft_intramem,cc_subcellular_location,ft_topo_dom,ft_transmem,go,go_p,go_c,go_f,cc_interaction&format=tsv"
url_template <- "degs/xbfGlia_vs_xbfAstro.tsv"
de_file <- "subcellular_locations_and_interactions"
output_dir
setwd(work_dir)
dir.create(output_dir, showWarnings = FALSE, recursive = FALSE)
<- vroom(de_file) %>%
df select(all_of(c("protein_name", "gene_name", "diff_flag"))) %>%
filter(diff_flag != "NO")
<- tibble()
res_df for (accession in unique(df$protein_name)) {
<- glue(url_template)
url_instance <- try(item_raw_text <- read_html(url_instance) %>% html_text(), silent = T)
e if ("try-error" %in% class(e)) {
message("invalid accession: ", accession)
else {
} <- vroom(I(item_raw_text), delim = "\t") %>%
item_df mutate(
`Organism (ID)` = as.integer(`Organism (ID)`),
Annotation = as.integer(Annotation)
)<- bind_rows(res_df, item_df)
res_df
}
}<- distinct(res_df)
res_df
"scl_secret"]] <- grepl("secret", res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
res_df[["scl_membrane"]] <- grepl("membrane", res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
res_df[["scl_extracellular"]] <- grepl("extracellular", res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
res_df[["scl_pass"]] <- res_df[["scl_secret"]] | res_df[["scl_membrane"]] | res_df[["scl_extracellular"]]
res_df[[
<- filter(res_df, scl_pass)
res_df nrow(res_df)
<- inner_join(df, res_df, by = c("protein_name" = "Entry")) %>%
res_df arrange(diff_flag)
vroom_write(res_df, file = file.path(output_dir, gsub("\\.(txt|tsv)$", ".secreted.tsv", basename(de_file))), col_names = TRUE, append = FALSE)
<- res_df %>%
res_df separate_rows(all_of("Interacts with"), sep = "; ") %>%
mutate(
interacted_protein = `Interacts with`,
interacted_protein = if_else(
str_detect(interacted_protein, ".*\\[[a-zA-Z0-9-]+\\]$"),
str_extract(interacted_protein, "(?<=\\[)[a-zA-Z0-9-]+(?=\\])"),
interacted_protein
),interacted_protein = gsub("-[0-9]+$", "", interacted_protein)
%>%
) filter(!is.na(interacted_protein))
<- tibble()
interact_res_df for (accession in unique(res_df[["interacted_protein"]])) {
<- glue(url_template)
url_instance <- try(item_raw_text <- read_html(url_instance) %>% html_text(), silent = T)
e if ("try-error" %in% class(e)) {
message("invalid accession: ", accession)
else {
} <- vroom(I(item_raw_text), delim = "\t") %>%
item_df mutate(
`Organism (ID)` = as.integer(`Organism (ID)`),
Annotation = as.integer(Annotation)
)<- bind_rows(interact_res_df, item_df)
interact_res_df
}
}<- distinct(interact_res_df)
interact_res_df
"scl_secret"]] <- grepl("secret", interact_res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
interact_res_df[["scl_membrane"]] <- grepl("membrane", interact_res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
interact_res_df[["scl_extracellular"]] <- grepl("extracellular", interact_res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
interact_res_df[["scl_pass"]] <- interact_res_df[["scl_secret"]] | interact_res_df[["scl_membrane"]] | interact_res_df[["scl_extracellular"]]
interact_res_df[[
<- filter(interact_res_df, scl_pass) %>%
interact_res_df mutate(interact_database = "UniProt")
nrow(interact_res_df)
<- inner_join(res_df, interact_res_df, by = c("interacted_protein" = "Entry"), suffix = c(".dep", ".interacted"))
final_res_df vroom_write(final_res_df, file = file.path(output_dir, gsub("\\.(txt|tsv)$", ".interacted.uniprot.tsv", basename(de_file))), col_names = TRUE, append = FALSE)
- Retrieve protein interactions from STRING
library(vroom)
library(glue)
library(tidyverse)
library(rvest)
library(stringr)
library(jsonlite)
<- "test"
work_dir <- 9606
organism_id <- "https://rest.uniprot.org/uniprotkb/search?query=(accession:{accession})%20AND%20(active:true)%20AND%20(organism_id:{organism_id})&fields=accession,gene_primary,organism_name,organism_id,protein_name,annotation_score,protein_existence,reviewed,ft_intramem,cc_subcellular_location,ft_topo_dom,ft_transmem,go,go_p,go_c,go_f,cc_interaction&format=tsv"
url_template <- "curl --request POST 'https://rest.uniprot.org/idmapping/run' --form 'ids=\"{ids}\"' --form 'from=\"UniProtKB_AC-ID\"' --form 'to=\"UniProtKB\"'"
submit_cmd <- "curl -i 'https://rest.uniprot.org/idmapping/status/{submit_cmd_instance_response_job_id}'"
status_cmd <- "curl -s 'https://rest.uniprot.org/idmapping/results/{submit_cmd_instance_response_job_id}'"
result_cmd <- 20
max_length <- "string_db/9606.protein.physical.links.full.v12.0.txt.gz"
network_file <- "string_db/9606.protein.info.v12.0.txt.gz"
info_file <- "string_db/9606.protein.aliases.v12.0.txt.gz"
alias_file <- "subcellular_locations_and_interactions/xbfGlia_vs_xbfAstro.secreted.tsv"
secret_file <- "subcellular_locations_and_interactions"
output_dir
setwd(work_dir)
dir.create(output_dir, showWarnings = FALSE, recursive = FALSE)
<- vroom(info_file) %>%
info_df select(all_of(c("#string_protein_id", "preferred_name"))) %>%
distinct()
<- vroom(secret_file) %>%
secret_df inner_join(info_df, by = c("gene_name" = "preferred_name"))
<- vroom(network_file) %>%
network_df filter(protein1 %in% secret_df[["#string_protein_id"]]) %>%
inner_join(info_df, by = c("protein2" = "#string_protein_id"))
<- c("Ensembl_HGNC_uniprot_ids", "UniProt_ID", "UniProt_AC")
source_levels <- vroom(alias_file) %>%
alias_df filter((`#string_protein_id` %in% network_df$protein2) & (source %in% source_levels)) %>%
mutate(source = factor(source, levels = source_levels)) %>%
group_by(`#string_protein_id`) %>%
slice_min(source) %>%
slice_sample(n = 1) %>%
ungroup()
<- inner_join(network_df, alias_df, by = c("protein2" = "#string_protein_id"))
network_df
<- unique(network_df$alias)
uniq_ids <- tibble()
mpt_df for (id_vec in split(uniq_ids, ceiling(seq_along(uniq_ids) / max_length))) {
<- paste0(id_vec, collapse = ",")
ids
<- glue(submit_cmd)
submit_cmd_instance <- system(submit_cmd_instance, intern = TRUE, ignore.stdout = FALSE, ignore.stderr = FALSE, wait = TRUE)
submit_cmd_instance_response <- fromJSON(submit_cmd_instance_response)
submit_cmd_instance_response_ls <- submit_cmd_instance_response_ls$jobId
submit_cmd_instance_response_job_id
<- glue(status_cmd)
status_cmd_instance while (TRUE) {
<- system(status_cmd_instance, intern = TRUE, ignore.stdout = FALSE, ignore.stderr = FALSE, wait = TRUE)
status_cmd_instance_response <- trimws(status_cmd_instance_response)
status_cmd_instance_response <- fromJSON(status_cmd_instance_response[length(status_cmd_instance_response)])
status_cmd_instance_response_ls if ((status_cmd_instance_response[1] == "HTTP/2 303") && (status_cmd_instance_response_ls$jobStatus == "FINISHED")) {
message("results can be retrieved!")
break
}Sys.sleep(3)
}
<- glue(result_cmd)
result_cmd_instance <- system(result_cmd_instance, intern = TRUE, ignore.stdout = FALSE, ignore.stderr = FALSE, wait = TRUE)
result_cmd_instance_response <- fromJSON(result_cmd_instance_response)
result_cmd_instance_response_ls if (length(result_cmd_instance_response_ls$results) == 0) {
stop("no ID was mapped successfully")
}<- result_cmd_instance_response_ls$results
tmp_df <- bind_rows(mpt_df, tmp_df)
mpt_df
}<- distinct(mpt_df)
mpt_df
<- inner_join(network_df, mpt_df, by = c("alias" = "from"))
network_df
<- tibble()
interact_res_df for (accession in unique(network_df$to)) {
<- glue(url_template)
url_instance <- try(item_raw_text <- read_html(url_instance) %>% html_text(), silent = T)
e if ("try-error" %in% class(e)) {
message("invalid accession: ", accession)
else {
} <- vroom(I(item_raw_text), delim = "\t") %>%
item_df mutate(
`Organism (ID)` = as.integer(`Organism (ID)`),
Annotation = as.integer(Annotation)
)<- bind_rows(interact_res_df, item_df)
interact_res_df
}
}<- distinct(interact_res_df)
interact_res_df
"scl_secret"]] <- grepl("secret", interact_res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
interact_res_df[["scl_membrane"]] <- grepl("membrane", interact_res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
interact_res_df[["scl_extracellular"]] <- grepl("extracellular", interact_res_df[["Subcellular location [CC]"]], ignore.case = TRUE)
interact_res_df[["scl_pass"]] <- interact_res_df[["scl_secret"]] | interact_res_df[["scl_membrane"]] | interact_res_df[["scl_extracellular"]]
interact_res_df[[
<- filter(interact_res_df, scl_pass) %>%
interact_res_df mutate(interact_database = "STRING")
nrow(interact_res_df)
<- inner_join(secret_df, network_df, by = c("#string_protein_id" = "protein1")) %>%
final_res_df inner_join(interact_res_df, by = c("to" = "Entry"), suffix = c(".dep", ".interacted"))
vroom_write(final_res_df, file = file.path(output_dir, gsub("\\.secreted\\.(txt|tsv)$", ".interacted.string.tsv", basename(secret_file))), col_names = TRUE, append = FALSE)