wget -c https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/764/305/GCF_011764305.1_ASM1176430v1.1/GCF_011764305.1_ASM1176430v1.1_genomic.fna.gz
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/764/305/GCF_011764305.1_ASM1176430v1.1/GCF_011764305.1_ASM1176430v1.1_genomic.gff.gz
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/764/305/GCF_011764305.1_ASM1176430v1.1/GCF_011764305.1_ASM1176430v1.1_genomic.gtf.gz
1 Introduction
A little example for how to use limma to analyze Agilent mRNA microarray data.
2 Download ferret (Mustela putorius furo, 9669) reference files
3 Extract transcript sequences
# -w: write a fasta file with spliced exons for each transcript
gffread -w GCF_011764305.1_ASM1176430v1.1_genomic.transcripts.fna -g GCF_011764305.1_ASM1176430v1.1_genomic.fna GCF_011764305.1_ASM1176430v1.1_genomic.gff
4 Build blast database
makeblastdb -in GCF_011764305.1_ASM1176430v1.1_genomic.transcripts.fna -parse_seqids -taxid 9669 -blastdb_version 5 -title "GCF_011764305.1_ASM1176430v1.1 Ferret (Mustela putorius furo) spliced transcripts" -dbtype nucl
5 Prepare mRNA microarray data
library(GEOquery)
library(limma)
library(tidyverse)
library(vroom)
<- "GSE60687"
gse_accession <- "./data/microarray"
out_dir
<- getGEO(gse_accession, GSEMatrix = T, getGPL = T)
gse <- gse[[paste0(gse_accession, "_series_matrix.txt.gz")]]@phenoData@data %>%
sample_df mutate(
sample_id = paste0(
str_extract_all(description, "A(17|19)", simplify = T), ".",
str_extract_all(description, "[A-Z]{2,}", simplify = T), ".",
str_extract_all(description, "^\\d", simplify = T),
str_extract_all(description, "_2", simplify = T)
),sample_file = file.path(out_dir, paste0(sample_id, ".txt.gz")),
wget_cmd = paste0("wget -c -O ", sample_file, " ", supplementary_file)
%>%
) arrange(sample_id)
sapply(sample_df$wget_cmd, function(x) {
system(x)
})
vroom_write(sample_df, file = file.path(out_dir, "samples.tsv"))
<- getGEO(gse[[paste0(gse_accession, "_series_matrix.txt.gz")]]@annotation)
gpl <- filter(gpl@dataTable@table, SPOT_ID != "CONTROL") %>%
gpl_fna mutate(fna = paste0(">", ID, ":", COL, ":", ROW, ":", NAME, ":", CONTROL_TYPE, ":", ACCESSION_STRING, ":", CHROMOSOMAL_LOCATION, "\n", SEQUENCE)) %>%
pull(fna) %>%
unique()
vroom_write(gpl@dataTable@table, file = file.path(out_dir, "gpl.tsv"))
vroom_write_lines(gpl_fna, file = file.path(out_dir, "gpl.fna"))
6 Run blastn
# GPL probe sequences against transcripts
blastn -task megablast -db ../genome/blastdb/GCF_011764305.1_ASM1176430v1.1_genomic.transcripts.fna -query gpl.fna -outfmt "6 qseqid sseqid evalue bitscore pident qcovs stitle" -dust no -max_target_seqs 1 -num_threads 16 -out gpl.blastn.txt
7 Attach gene symbols to GPL probes
library(rtracklayer)
library(vroom)
library(tidyverse)
<- "./data/genome/GCF_011764305.1_ASM1176430v1.1_genomic.gff"
gff_file <- "./data/microarray/gpl.blastn.txt"
gpl_blastn_file <- "./data/microarray"
out_dir
<- as.data.frame(import(gff_file, version = "3")) %>%
gff select(all_of(c(
"seqnames", "start", "end", "width", "strand",
"source", "type", "ID", "Dbxref", "Name", "gbkey",
"gene", "gene_biotype", "Parent", "transcript_id"
%>%
))) distinct()
<- vroom(gpl_blastn_file, col_names = c("qseqid", "sseqid", "evalue", "bitscore", "pident", "qcovs", "stitle")) %>%
gpl_blastn distinct() %>%
group_by(qseqid) %>%
slice_min(evalue) %>%
slice_max(bitscore) %>%
slice_max(pident) %>%
slice_max(qcovs) %>%
ungroup()
table(duplicated(gpl_blastn$qseqid))
<- left_join(gpl_blastn, gff, by = c("sseqid" = "ID"))
df <- left_join(df, filter(gff, type == "gene") %>%
df select(all_of(c("seqnames", "gene", "gene_biotype"))) %>%
distinct(),
by = join_by(seqnames, gene),
suffix = c(".GPL_blastn", ".Parent_gene")
)
<- lapply(df$Dbxref, function(x) {
Dbxref_ls if (length(x) > 0) {
<- strsplit(x, split = ":", fixed = T) %>%
y do.call(rbind, .)
setNames(y[, 2], y[, 1]) %>%
as.list() %>%
as.data.frame()
else {
} data.frame()
}
})<- lapply(Dbxref_ls, names) %>%
Dbxref_names unlist() %>%
unique()
<- lapply(Dbxref_ls, function(x) {
Dbxref_df if (nrow(x) == 0) {
setNames(
rep(NA, length(Dbxref_names)),
Dbxref_names%>%
) as.list() %>%
as.data.frame()
else {
}
x
}%>% do.call(bind_rows, .)
})
names(Dbxref_df) <- paste0("Dbxref_", names(Dbxref_df))
<- bind_cols(df, Dbxref_df)
df
vroom_write(df, file = file.path(out_dir, "gpl.with_gene_symbols.tsv"), delim = "\t")
8 Differential analysis using limma
library(limma)
library(vroom)
library(tidyverse)
<- "./data/microarray/samples.tsv"
sample_file <- "./data/microarray/gpl.with_gene_symbols.tsv"
gpl_file <- "./data/degs"
out_dir
dir.create(out_dir)
<- vroom(gpl_file, delim = "\t")
gpl $ProbeName <- sapply(gpl$qseqid, function(x) {
gplstrsplit(x, ":")[[1]][4]
})<- gpl %>%
gpl select(all_of(c(
"ProbeName", "Dbxref_GeneID", "gene",
"gene_biotype.GPL_blastn", "gene_biotype.Parent_gene"
%>%
))) mutate(GeneBioType = if_else(is.na(gene_biotype.Parent_gene),
gene_biotype.GPL_blastn,
gene_biotype.Parent_gene%>%
)) select(all_of(c("ProbeName", "Dbxref_GeneID", "gene", "GeneBioType"))) %>%
distinct()
names(gpl) <- c("ProbeName", "EntrezID", "Symbol", "GeneBioType")
table(duplicated(gpl$ProbeName))
<- vroom(sample_file)
sample_df
# read in data
# here, we are reading in single-channel Agilent (foreground: median signal; background: median signal) intensity data
# so source = "agilent" and green.only = T
# here, we read in the extra column gIsWellAboveBG, which records whether the intensity of each spot is considered above the background level for that array
<- read.maimages(
x gsub(
"~/mywd/agilent_mrna_expression_microarray_analysis_using_limma",
".",
gsub("\\.gz$", "", sample_df$sample_file)
),source = "agilent", green.only = T,
other.columns = "gIsWellAboveBG"
)<- x
x_copy
# gene annotation
$genes <- left_join(x$genes, gpl, by = "ProbeName")
xall(x$genes$ProbeName == x_copy$genes$ProbeName)
# background correction and normalization
# at this step, we need control probes to be existed in the dataset
<- backgroundCorrect(x, method = "normexp")
y <- normalizeBetweenArrays(y, method = "quantile")
y
# gene filtering
# filter out control probes
<- y$genes$ControlType == 1L
Control # filter out probes without Symbol
<- is.na(y$genes$Symbol)
NoSymbol # keep probes that express in at least 3 arrays (because there are at least 3 replicates in each array)
<- rowSums(y$other$gIsWellAboveBG > 0) >= 3
IsExpr
<- y[!Control & !NoSymbol & IsExpr, ]
yfilt
<- c("EntrezID", "Symbol")
genes_colnames <- colnames(yfilt$E)
E_colnames <- bind_cols(
exprMat $genes[, genes_colnames],
yfiltas.data.frame(yfilt$E)
)<- exprMat %>%
exprMat_dedup arrange(Symbol) %>%
group_by(EntrezID, Symbol) %>%
reframe(across(everything(), mean))
$genes <- exprMat_dedup[, genes_colnames]
yfilt$E <- as.matrix(exprMat_dedup[, E_colnames])
yfilt
# differential expression
<- gsub("\\.[_0-9]+$", "", sample_df$sample_id)
treatments <- unique(treatments)
levels <- factor(treatments, levels = levels)
treatments <- model.matrix(~ 0 + treatments)
design colnames(design) <- levels
<- lmFit(yfilt, design = design)
fit <- expand.grid(x = levels, y = levels) %>%
contrast_pairs mutate(
flag = if_else(x != y, T, F),
pair = paste0(x, "-", y)
%>%
) filter(flag) %>%
pull(pair) %>%
unique()
<- makeContrasts(contrasts = contrast_pairs, levels = design)
contrast_matrix <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2, trend = T, robust = T)
fit2
for (pair in contrast_pairs) {
<- topTable(fit2, coef = pair, number = Inf, adjust.method = "BH", p.value = 1)
res vroom_write(res, file = file.path(out_dir, paste0(pair, ".tsv")), delim = "\t")
}
9 Filter DEGs and plot volcanos
library(vroom)
library(tidyverse)
library(YRUtils)
set_fonts()
<- "./data/degs"
degs_dir <- "./data/clean_degs/padj0.05_logfc1"
out_dir <- c("#FF4757", "#546DE5", "#D2DAE2")
sig_colors <- 0.05
padj_th <- 1
logfc_th
dir.create(out_dir, recursive = T)
<- list.files(degs_dir, pattern = "\\.tsv$", full.names = T, recursive = F)
degs_files for (degs_file in degs_files) {
<- strsplit(gsub("\\.[a-zA-Z0-9]+$", "", basename(degs_file)),
pair split = "-", fixed = T
1]]
)[[
<- vroom(degs_file, delim = "\t", col_names = T)
degs_df <- degs_df %>%
clean_degs_df group_by(Symbol) %>%
slice_min(adj.P.Val, n = 1) %>%
slice_max(abs(logFC), n = 1) %>%
slice_sample(n = 1) %>%
ungroup() %>%
mutate(
diff_flag = if_else(adj.P.Val < padj_th,
if_else(logFC > logfc_th,
paste0(pair[1], " Up"),
if_else(logFC < -logfc_th,
paste0(pair[2], " Up"),
"NO"
)
),"NO"
),diff_flag = factor(diff_flag, levels = c(
paste0(pair[1], " Up"),
paste0(pair[2], " Up"),
"NO"
))
)
vroom_write(clean_degs_df, file = file.path(out_dir, basename(degs_file)))
if (any(duplicated(clean_degs_df$Symbol))) {
stop("Duplicated items still existed after filtering for ", degs_file)
}
<- count(clean_degs_df, diff_flag) %>%
diff_counts mutate(show_text = paste0(diff_flag, ": ", n))
<- paste0(
plot_title paste0(pair, collapse = "_vs_"), "\n",
paste0(diff_counts$show_text, collapse = " ")
)
<- ggplot(clean_degs_df) +
p geom_point(aes(logFC, -log10(adj.P.Val), color = diff_flag),
alpha = 0.5, size = 2
+
) geom_vline(
xintercept = c(-logfc_th, logfc_th),
linewidth = 1, col = "grey25", linetype = "dashed"
+
) geom_hline(
yintercept = -log10(padj_th),
linewidth = 1, col = "grey25", linetype = "dashed"
+
) scale_color_manual(values = setNames(sig_colors, levels(clean_degs_df$diff_flag))) +
labs(
title = plot_title,
x = "log2 Fold Change", y = "-log10(p.adjust)",
color = paste0("p.adjust < ", padj_th, "\n|log2(FC)| > ", logfc_th)
+
) theme_classic() +
theme(
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(size = 26),
axis.title.y = element_text(size = 26),
axis.text.x = element_text(size = 24),
axis.text.y = element_text(size = 24),
legend.text = element_text(size = 24),
legend.title = element_text(size = 26),
text = element_text(family = "Arial")
)
ppreview(p, file = file.path(
out_dir,gsub("\\.[a-zA-Z0-9]+$", ".pdf", basename(degs_file))
)) }
10 GO analysis
10.1 Prepare ferret gene IDs
library(vroom)
library(tidyverse)
<- "./data/microarray/gpl.with_gene_symbols.tsv"
gpl_file <- "./data/go"
out_dir
dir.create(out_dir)
<- vroom(gpl_file, delim = "\t")
gpl $ProbeName <- sapply(gpl$qseqid, function(x) {
gplstrsplit(x, ":")[[1]][4]
})<- gpl %>%
gpl select(all_of(c(
"ProbeName", "Dbxref_GeneID", "gene",
"gene_biotype.GPL_blastn", "gene_biotype.Parent_gene"
%>%
))) mutate(GeneBioType = if_else(is.na(gene_biotype.Parent_gene),
gene_biotype.GPL_blastn,
gene_biotype.Parent_gene%>%
)) select(all_of(c("ProbeName", "Dbxref_GeneID", "gene", "GeneBioType"))) %>%
distinct()
names(gpl) <- c("ProbeName", "EntrezID", "Symbol", "GeneBioType")
vroom_write_lines(as.character(sort(unique(gpl$EntrezID))),
file = file.path(out_dir, "ferret_entrezids.txt")
)
10.2 Prepare orthologous gene set between mm10 and ferret
library(vroom)
library(tidyverse)
library(magrittr)
<- "./data/go/ferret_mm10.orthologs.tsv"
ferret_mm_orthologs_file <- "./data/go/ferret_ncbi_dataset.tsv"
ferret_dataset_file
<- vroom(ferret_dataset_file) %>%
ferret_dataset_df select(all_of(c("NCBI GeneID", "Symbol", "Gene Type", "Gene Group Identifier"))) %>%
set_colnames(c("ferret_EntrezID", "ferret_Symbol", "ferret_GeneType", "Ortholog_Group_Identifier")) %>%
na.omit() %>%
mutate_all(as.character) %>%
distinct()
<- vroom(ferret_mm_orthologs_file) %>%
ferret_mm_orthologs_df select(all_of(c("NCBI GeneID", "Symbol", "Gene Group Identifier"))) %>%
set_colnames(c("mm10_EntrezID", "mm10_Symbol", "Ortholog_Group_Identifier")) %>%
na.omit() %>%
mutate_all(as.character) %>%
distinct()
<- inner_join(ferret_dataset_df, ferret_mm_orthologs_df, by = "Ortholog_Group_Identifier")
df vroom_write(df, file = file.path(dirname(ferret_mm_orthologs_file), "ferret_mm10.orthologs.clean.tsv"))
10.3 Attach orthologous mm10 gene IDs to DEGs of ferret
library(vroom)
library(tidyverse)
<- "./data/go/ferret_mm10.orthologs.clean.tsv"
ferret_mm_orthologs_file <- c(
degs_files "./data/clean_degs/padj0.05_logfc1/A17.VZ-A19.VZ.tsv",
"./data/clean_degs/padj0.05_logfc1/A17.ISVZ-A19.ISVZ.tsv",
"./data/clean_degs/padj0.05_logfc1/A17.OSVZ-A19.OSVZ.tsv"
)
<- vroom(ferret_mm_orthologs_file) %>%
ferret_mm_orthologs_df mutate_all(as.character)
for (degs_file in degs_files) {
vroom(degs_file) %>%
select(all_of(c(
"EntrezID", "Symbol",
"logFC", "P.Value", "adj.P.Val", "diff_flag"
%>%
))) mutate(EntrezID = as.character(EntrezID)) %>%
inner_join(ferret_mm_orthologs_df, by = c("EntrezID" = "ferret_EntrezID")) %>%
arrange(desc(logFC)) %>%
distinct() %>%
filter(ferret_GeneType == "PROTEIN_CODING") %>%
vroom_write(file = gsub("tsv$", "with_mm10_IDs.tsv", degs_file))
}
11 Check the differential expression concordance between samples of mink and samples of ferret
library(vroom)
library(tidyverse)
<- "H:/ubuntu_ssd_backup/projects/mink/proj/rna/degs/res/only_degs/mm10/P2_Gyr_vs_P2_Sul.txt"
mink_degs_file <- c(
degs_files "./data/clean_degs/padj0.05_logfc1/A17.VZ-A19.VZ.with_mm10_IDs.tsv",
"./data/clean_degs/padj0.05_logfc1/A17.ISVZ-A19.ISVZ.with_mm10_IDs.tsv",
"./data/clean_degs/padj0.05_logfc1/A17.OSVZ-A19.OSVZ.with_mm10_IDs.tsv"
)
<- vroom(mink_degs_file) %>%
mink_degs select(all_of(c("final_gene_name", "diff_flag"))) %>%
distinct()
<- tibble()
degs_df for (degs_file in degs_files) {
<- vroom(degs_file) %>%
tmp_df select(all_of(c("mm10_Symbol", "logFC"))) %>%
mutate(tissue = gsub(
"A17\\.[A-Z]+-A19\\.|\\.with_mm10_IDs\\.tsv$",
"", basename(degs_file)
%>%
)) distinct()
<- bind_rows(degs_df, tmp_df)
degs_df
}<- inner_join(mink_degs, degs_df, by = c("final_gene_name" = "mm10_Symbol")) %>%
df mutate(
tissue = factor(tissue, levels = c("VZ", "ISVZ", "OSVZ")),
logFC_sign = if_else(logFC >= 0, "+", "-")
)
ggplot(df) +
geom_jitter(aes(tissue, logFC, color = diff_flag))